summaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/bpf_trace.c180
-rw-r--r--kernel/trace/ftrace.c100
-rw-r--r--kernel/trace/ring_buffer.c196
-rw-r--r--kernel/trace/synth_event_gen_test.c11
-rw-r--r--kernel/trace/trace.c36
-rw-r--r--kernel/trace/trace.h1
-rw-r--r--kernel/trace/trace_events_hist.c12
-rw-r--r--kernel/trace/trace_events_synth.c4
-rw-r--r--kernel/trace/trace_events_user.c4
-rw-r--r--kernel/trace/trace_output.c6
-rw-r--r--kernel/trace/trace_uprobe.c2
11 files changed, 338 insertions, 214 deletions
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 84e8a0f6e4e0..7ac6c52b25eb 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -24,6 +24,7 @@
#include <linux/key.h>
#include <linux/verification.h>
#include <linux/namei.h>
+#include <linux/fileattr.h>
#include <net/bpf_sk_storage.h>
@@ -41,6 +42,9 @@
#define bpf_event_rcu_dereference(p) \
rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))
+#define MAX_UPROBE_MULTI_CNT (1U << 20)
+#define MAX_KPROBE_MULTI_CNT (1U << 20)
+
#ifdef CONFIG_MODULES
struct bpf_trace_module {
struct module *module;
@@ -1376,6 +1380,8 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
struct bpf_dynptr_kern *sig_ptr,
struct bpf_key *trusted_keyring)
{
+ const void *data, *sig;
+ u32 data_len, sig_len;
int ret;
if (trusted_keyring->has_ref) {
@@ -1392,10 +1398,12 @@ __bpf_kfunc int bpf_verify_pkcs7_signature(struct bpf_dynptr_kern *data_ptr,
return ret;
}
- return verify_pkcs7_signature(data_ptr->data,
- __bpf_dynptr_size(data_ptr),
- sig_ptr->data,
- __bpf_dynptr_size(sig_ptr),
+ data_len = __bpf_dynptr_size(data_ptr);
+ data = __bpf_dynptr_data(data_ptr, data_len);
+ sig_len = __bpf_dynptr_size(sig_ptr);
+ sig = __bpf_dynptr_data(sig_ptr, sig_len);
+
+ return verify_pkcs7_signature(data, data_len, sig, sig_len,
trusted_keyring->key,
VERIFYING_UNSPECIFIED_SIGNATURE, NULL,
NULL);
@@ -1427,6 +1435,72 @@ static int __init bpf_key_sig_kfuncs_init(void)
late_initcall(bpf_key_sig_kfuncs_init);
#endif /* CONFIG_KEYS */
+/* filesystem kfuncs */
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_file_xattr - get xattr of a file
+ * @file: file to get xattr from
+ * @name__str: name of the xattr
+ * @value_ptr: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *file* and store the output in *value_ptr*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str,
+ struct bpf_dynptr_kern *value_ptr)
+{
+ struct dentry *dentry;
+ u32 value_len;
+ void *value;
+ int ret;
+
+ if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ return -EPERM;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data_rw(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ dentry = file_dentry(file);
+ ret = inode_permission(&nop_mnt_idmap, dentry->d_inode, MAY_READ);
+ if (ret)
+ return ret;
+ return __vfs_getxattr(dentry, dentry->d_inode, name__str, value, value_len);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(fs_kfunc_set_ids)
+BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS)
+BTF_SET8_END(fs_kfunc_set_ids)
+
+static int bpf_get_file_xattr_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&fs_kfunc_set_ids, kfunc_id))
+ return 0;
+
+ /* Only allow to attach from LSM hooks, to avoid recursion */
+ return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
+}
+
+static const struct btf_kfunc_id_set bpf_fs_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &fs_kfunc_set_ids,
+ .filter = bpf_get_file_xattr_filter,
+};
+
+static int __init bpf_fs_kfuncs_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fs_kfunc_set);
+}
+
+late_initcall(bpf_fs_kfuncs_init);
+
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -2899,6 +2973,8 @@ int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
cnt = attr->link_create.kprobe_multi.cnt;
if (!cnt)
return -EINVAL;
+ if (cnt > MAX_KPROBE_MULTI_CNT)
+ return -E2BIG;
size = cnt * sizeof(*addrs);
addrs = kvmalloc_array(cnt, sizeof(*addrs), GFP_KERNEL);
@@ -3029,6 +3105,7 @@ struct bpf_uprobe_multi_link;
struct bpf_uprobe {
struct bpf_uprobe_multi_link *link;
loff_t offset;
+ unsigned long ref_ctr_offset;
u64 cookie;
struct uprobe_consumer consumer;
};
@@ -3037,6 +3114,7 @@ struct bpf_uprobe_multi_link {
struct path path;
struct bpf_link link;
u32 cnt;
+ u32 flags;
struct bpf_uprobe *uprobes;
struct task_struct *task;
};
@@ -3078,9 +3156,79 @@ static void bpf_uprobe_multi_link_dealloc(struct bpf_link *link)
kfree(umulti_link);
}
+static int bpf_uprobe_multi_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ u64 __user *uref_ctr_offsets = u64_to_user_ptr(info->uprobe_multi.ref_ctr_offsets);
+ u64 __user *ucookies = u64_to_user_ptr(info->uprobe_multi.cookies);
+ u64 __user *uoffsets = u64_to_user_ptr(info->uprobe_multi.offsets);
+ u64 __user *upath = u64_to_user_ptr(info->uprobe_multi.path);
+ u32 upath_size = info->uprobe_multi.path_size;
+ struct bpf_uprobe_multi_link *umulti_link;
+ u32 ucount = info->uprobe_multi.count;
+ int err = 0, i;
+ long left;
+
+ if (!upath ^ !upath_size)
+ return -EINVAL;
+
+ if ((uoffsets || uref_ctr_offsets || ucookies) && !ucount)
+ return -EINVAL;
+
+ umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
+ info->uprobe_multi.count = umulti_link->cnt;
+ info->uprobe_multi.flags = umulti_link->flags;
+ info->uprobe_multi.pid = umulti_link->task ?
+ task_pid_nr_ns(umulti_link->task, task_active_pid_ns(current)) : 0;
+
+ if (upath) {
+ char *p, *buf;
+
+ upath_size = min_t(u32, upath_size, PATH_MAX);
+
+ buf = kmalloc(upath_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = d_path(&umulti_link->path, buf, upath_size);
+ if (IS_ERR(p)) {
+ kfree(buf);
+ return PTR_ERR(p);
+ }
+ upath_size = buf + upath_size - p;
+ left = copy_to_user(upath, p, upath_size);
+ kfree(buf);
+ if (left)
+ return -EFAULT;
+ info->uprobe_multi.path_size = upath_size;
+ }
+
+ if (!uoffsets && !ucookies && !uref_ctr_offsets)
+ return 0;
+
+ if (ucount < umulti_link->cnt)
+ err = -ENOSPC;
+ else
+ ucount = umulti_link->cnt;
+
+ for (i = 0; i < ucount; i++) {
+ if (uoffsets &&
+ put_user(umulti_link->uprobes[i].offset, uoffsets + i))
+ return -EFAULT;
+ if (uref_ctr_offsets &&
+ put_user(umulti_link->uprobes[i].ref_ctr_offset, uref_ctr_offsets + i))
+ return -EFAULT;
+ if (ucookies &&
+ put_user(umulti_link->uprobes[i].cookie, ucookies + i))
+ return -EFAULT;
+ }
+
+ return err;
+}
+
static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
.release = bpf_uprobe_multi_link_release,
.dealloc = bpf_uprobe_multi_link_dealloc,
+ .fill_link_info = bpf_uprobe_multi_link_fill_link_info,
};
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
@@ -3168,7 +3316,6 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
{
struct bpf_uprobe_multi_link *link = NULL;
unsigned long __user *uref_ctr_offsets;
- unsigned long *ref_ctr_offsets = NULL;
struct bpf_link_primer link_primer;
struct bpf_uprobe *uprobes = NULL;
struct task_struct *task = NULL;
@@ -3202,6 +3349,8 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!upath || !uoffsets || !cnt)
return -EINVAL;
+ if (cnt > MAX_UPROBE_MULTI_CNT)
+ return -E2BIG;
uref_ctr_offsets = u64_to_user_ptr(attr->link_create.uprobe_multi.ref_ctr_offsets);
ucookies = u64_to_user_ptr(attr->link_create.uprobe_multi.cookies);
@@ -3241,22 +3390,20 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (!uprobes || !link)
goto error_free;
- if (uref_ctr_offsets) {
- ref_ctr_offsets = kvcalloc(cnt, sizeof(*ref_ctr_offsets), GFP_KERNEL);
- if (!ref_ctr_offsets)
- goto error_free;
- }
-
for (i = 0; i < cnt; i++) {
- if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
+ if (__get_user(uprobes[i].offset, uoffsets + i)) {
err = -EFAULT;
goto error_free;
}
- if (uref_ctr_offsets && __get_user(ref_ctr_offsets[i], uref_ctr_offsets + i)) {
+ if (uprobes[i].offset < 0) {
+ err = -EINVAL;
+ goto error_free;
+ }
+ if (uref_ctr_offsets && __get_user(uprobes[i].ref_ctr_offset, uref_ctr_offsets + i)) {
err = -EFAULT;
goto error_free;
}
- if (__get_user(uprobes[i].offset, uoffsets + i)) {
+ if (ucookies && __get_user(uprobes[i].cookie, ucookies + i)) {
err = -EFAULT;
goto error_free;
}
@@ -3276,6 +3423,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
link->uprobes = uprobes;
link->path = path;
link->task = task;
+ link->flags = flags;
bpf_link_init(&link->link, BPF_LINK_TYPE_UPROBE_MULTI,
&bpf_uprobe_multi_link_lops, prog);
@@ -3283,7 +3431,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
for (i = 0; i < cnt; i++) {
err = uprobe_register_refctr(d_real_inode(link->path.dentry),
uprobes[i].offset,
- ref_ctr_offsets ? ref_ctr_offsets[i] : 0,
+ uprobes[i].ref_ctr_offset,
&uprobes[i].consumer);
if (err) {
bpf_uprobe_unregister(&path, uprobes, i);
@@ -3295,11 +3443,9 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (err)
goto error_free;
- kvfree(ref_ctr_offsets);
return bpf_link_settle(&link_primer);
error_free:
- kvfree(ref_ctr_offsets);
kvfree(uprobes);
kfree(link);
if (task)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8de8bec5f366..b01ae7d36021 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1183,18 +1183,19 @@ static void __add_hash_entry(struct ftrace_hash *hash,
hash->count++;
}
-static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+static struct ftrace_func_entry *
+add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
{
struct ftrace_func_entry *entry;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
if (!entry)
- return -ENOMEM;
+ return NULL;
entry->ip = ip;
__add_hash_entry(hash, entry);
- return 0;
+ return entry;
}
static void
@@ -1349,7 +1350,6 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
struct ftrace_func_entry *entry;
struct ftrace_hash *new_hash;
int size;
- int ret;
int i;
new_hash = alloc_ftrace_hash(size_bits);
@@ -1366,8 +1366,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- ret = add_hash_entry(new_hash, entry->ip);
- if (ret < 0)
+ if (add_hash_entry(new_hash, entry->ip) == NULL)
goto free_hash;
}
}
@@ -2536,7 +2535,7 @@ ftrace_find_unique_ops(struct dyn_ftrace *rec)
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
/* Protected by rcu_tasks for reading, and direct_mutex for writing */
-static struct ftrace_hash *direct_functions = EMPTY_HASH;
+static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
static DEFINE_MUTEX(direct_mutex);
int ftrace_direct_func_count;
@@ -2555,39 +2554,6 @@ unsigned long ftrace_find_rec_direct(unsigned long ip)
return entry->direct;
}
-static struct ftrace_func_entry*
-ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
- struct ftrace_hash **free_hash)
-{
- struct ftrace_func_entry *entry;
-
- if (ftrace_hash_empty(direct_functions) ||
- direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
- struct ftrace_hash *new_hash;
- int size = ftrace_hash_empty(direct_functions) ? 0 :
- direct_functions->count + 1;
-
- if (size < 32)
- size = 32;
-
- new_hash = dup_hash(direct_functions, size);
- if (!new_hash)
- return NULL;
-
- *free_hash = direct_functions;
- direct_functions = new_hash;
- }
-
- entry = kmalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- return NULL;
-
- entry->ip = ip;
- entry->direct = addr;
- __add_hash_entry(direct_functions, entry);
- return entry;
-}
-
static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
@@ -4223,8 +4189,8 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
/* Do nothing if it exists */
if (entry)
return 0;
-
- ret = add_hash_entry(hash, rec->ip);
+ if (add_hash_entry(hash, rec->ip) == NULL)
+ ret = -ENOMEM;
}
return ret;
}
@@ -5266,7 +5232,8 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return 0;
}
- return add_hash_entry(hash, ip);
+ entry = add_hash_entry(hash, ip);
+ return entry ? 0 : -ENOMEM;
}
static int
@@ -5410,7 +5377,7 @@ static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long
*/
int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
{
- struct ftrace_hash *hash, *free_hash = NULL;
+ struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
struct ftrace_func_entry *entry, *new;
int err = -EBUSY, size, i;
@@ -5436,17 +5403,44 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
}
}
- /* ... and insert them to direct_functions hash. */
err = -ENOMEM;
+
+ /* Make a copy hash to place the new and the old entries in */
+ size = hash->count + direct_functions->count;
+ if (size > 32)
+ size = 32;
+ new_hash = alloc_ftrace_hash(fls(size));
+ if (!new_hash)
+ goto out_unlock;
+
+ /* Now copy over the existing direct entries */
+ size = 1 << direct_functions->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
+ new = add_hash_entry(new_hash, entry->ip);
+ if (!new)
+ goto out_unlock;
+ new->direct = entry->direct;
+ }
+ }
+
+ /* ... and add the new entries */
+ size = 1 << hash->size_bits;
for (i = 0; i < size; i++) {
hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
- new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
+ new = add_hash_entry(new_hash, entry->ip);
if (!new)
- goto out_remove;
+ goto out_unlock;
+ /* Update both the copy and the hash entry */
+ new->direct = addr;
entry->direct = addr;
}
}
+ free_hash = direct_functions;
+ rcu_assign_pointer(direct_functions, new_hash);
+ new_hash = NULL;
+
ops->func = call_direct_funcs;
ops->flags = MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
@@ -5454,17 +5448,17 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr)
err = register_ftrace_function_nolock(ops);
- out_remove:
- if (err)
- remove_direct_functions_hash(hash, addr);
-
out_unlock:
mutex_unlock(&direct_mutex);
- if (free_hash) {
+ if (free_hash && free_hash != EMPTY_HASH) {
synchronize_rcu_tasks();
free_ftrace_hash(free_hash);
}
+
+ if (new_hash)
+ free_ftrace_hash(new_hash);
+
return err;
}
EXPORT_SYMBOL_GPL(register_ftrace_direct);
@@ -6309,7 +6303,7 @@ ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer)
if (entry)
continue;
- if (add_hash_entry(hash, rec->ip) < 0)
+ if (add_hash_entry(hash, rec->ip) == NULL)
goto out;
} else {
if (entry) {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8d2a4f00eca9..9286f88fcd32 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
*cnt = rb_time_cnt(top);
- /* If top and msb counts don't match, this interrupted a write */
- if (*cnt != rb_time_cnt(msb))
+ /* If top, msb or bottom counts don't match, this interrupted a write */
+ if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
return false;
/* The shift to msb will lose its cnt bits */
@@ -700,44 +700,6 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
return local_try_cmpxchg(l, &expect, set);
}
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
- unsigned long cnt, top, bottom, msb;
- unsigned long cnt2, top2, bottom2, msb2;
- u64 val;
-
- /* The cmpxchg always fails if it interrupted an update */
- if (!__rb_time_read(t, &val, &cnt2))
- return false;
-
- if (val != expect)
- return false;
-
- cnt = local_read(&t->cnt);
- if ((cnt & 3) != cnt2)
- return false;
-
- cnt2 = cnt + 1;
-
- rb_time_split(val, &top, &bottom, &msb);
- top = rb_time_val_cnt(top, cnt);
- bottom = rb_time_val_cnt(bottom, cnt);
-
- rb_time_split(set, &top2, &bottom2, &msb2);
- top2 = rb_time_val_cnt(top2, cnt2);
- bottom2 = rb_time_val_cnt(bottom2, cnt2);
-
- if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
- return false;
- if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
- return false;
- if (!rb_time_read_cmpxchg(&t->top, top, top2))
- return false;
- if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
- return false;
- return true;
-}
-
#else /* 64 bits */
/* local64_t always succeeds */
@@ -751,11 +713,6 @@ static void rb_time_set(rb_time_t *t, u64 val)
{
local64_set(&t->time, val);
}
-
-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
-{
- return local64_try_cmpxchg(&t->time, &expect, set);
-}
#endif
/*
@@ -924,9 +881,14 @@ static __always_inline bool full_hit(struct trace_buffer *buffer, int cpu, int f
if (!nr_pages || !full)
return true;
- dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+ /*
+ * Add one as dirty will never equal nr_pages, as the sub-buffer
+ * that the writer is on is not counted as dirty.
+ * This is needed if "buffer_percent" is set to 100.
+ */
+ dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1;
- return (dirty * 100) > (full * nr_pages);
+ return (dirty * 100) >= (full * nr_pages);
}
/*
@@ -987,7 +949,8 @@ void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu)
/* make sure the waiters see the new index */
smp_wmb();
- rb_wake_up_waiters(&rbwork->work);
+ /* This can be called in any context */
+ irq_work_queue(&rbwork->work);
}
/**
@@ -1787,6 +1750,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
free_buffer_page(bpage);
}
+ free_page((unsigned long)cpu_buffer->free_page);
+
kfree(cpu_buffer);
}
@@ -2407,7 +2372,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
barrier();
- if ((iter->head + length) > commit || length > BUF_MAX_DATA_SIZE)
+ if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
/* Writer corrupted the read? */
goto reset;
@@ -2981,25 +2946,6 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-static u64 rb_time_delta(struct ring_buffer_event *event)
-{
- switch (event->type_len) {
- case RINGBUF_TYPE_PADDING:
- return 0;
-
- case RINGBUF_TYPE_TIME_EXTEND:
- return rb_event_time_stamp(event);
-
- case RINGBUF_TYPE_TIME_STAMP:
- return 0;
-
- case RINGBUF_TYPE_DATA:
- return event->time_delta;
- default:
- return 0;
- }
-}
-
static inline bool
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
@@ -3007,8 +2953,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long new_index, old_index;
struct buffer_page *bpage;
unsigned long addr;
- u64 write_stamp;
- u64 delta;
new_index = rb_event_index(event);
old_index = new_index + rb_event_ts_length(event);
@@ -3017,14 +2961,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
bpage = READ_ONCE(cpu_buffer->tail_page);
- delta = rb_time_delta(event);
-
- if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
- return false;
-
- /* Make sure the write stamp is read before testing the location */
- barrier();
-
+ /*
+ * Make sure the tail_page is still the same and
+ * the next write location is the end of this event
+ */
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
@@ -3035,20 +2975,20 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
* to make sure that the next event adds an absolute
* value and does not rely on the saved write stamp, which
* is now going to be bogus.
+ *
+ * By setting the before_stamp to zero, the next event
+ * is not going to use the write_stamp and will instead
+ * create an absolute timestamp. This means there's no
+ * reason to update the wirte_stamp!
*/
rb_time_set(&cpu_buffer->before_stamp, 0);
- /* Something came in, can't discard */
- if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
- write_stamp, write_stamp - delta))
- return false;
-
/*
* If an event were to come in now, it would see that the
* write_stamp and the before_stamp are different, and assume
* that this event just added itself before updating
* the write stamp. The interrupting event will fix the
- * write stamp for us, and use the before stamp as its delta.
+ * write stamp for us, and use an absolute timestamp.
*/
/*
@@ -3485,7 +3425,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
return;
/*
- * If this interrupted another event,
+ * If this interrupted another event,
*/
if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
goto out;
@@ -3579,7 +3519,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
* absolute timestamp.
* Don't bother if this is the start of a new page (w == 0).
*/
- if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+ if (!w) {
+ /* Use the sub-buffer timestamp */
+ info->delta = 0;
+ } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
info->length += RB_LEN_TIME_EXTEND;
} else {
@@ -3602,26 +3545,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* See if we shot pass the end of this buffer page */
if (unlikely(write > BUF_PAGE_SIZE)) {
- /* before and after may now different, fix it up*/
- b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- if (a_ok && b_ok && info->before != info->after)
- (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
- info->before, info->after);
- if (a_ok && b_ok)
- check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
+ check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
if (likely(tail == w)) {
- u64 save_before;
- bool s_ok;
-
/* Nothing interrupted us between A and C */
/*D*/ rb_time_set(&cpu_buffer->write_stamp, info->ts);
- barrier();
- /*E*/ s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
- RB_WARN_ON(cpu_buffer, !s_ok);
+ /*
+ * If something came in between C and D, the write stamp
+ * may now not be in sync. But that's fine as the before_stamp
+ * will be different and then next event will just be forced
+ * to use an absolute timestamp.
+ */
if (likely(!(info->add_timestamp &
(RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
/* This did not interrupt any time update */
@@ -3629,41 +3565,40 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
else
/* Just use full timestamp for interrupting event */
info->delta = info->ts;
- barrier();
check_buffer(cpu_buffer, info, tail);
- if (unlikely(info->ts != save_before)) {
- /* SLOW PATH - Interrupted between C and E */
-
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- RB_WARN_ON(cpu_buffer, !a_ok);
-
- /* Write stamp must only go forward */
- if (save_before > info->after) {
- /*
- * We do not care about the result, only that
- * it gets updated atomically.
- */
- (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
- info->after, save_before);
- }
- }
} else {
u64 ts;
/* SLOW PATH - Interrupted between A and C */
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- /* Was interrupted before here, write_stamp must be valid */
+
+ /* Save the old before_stamp */
+ a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
RB_WARN_ON(cpu_buffer, !a_ok);
+
+ /*
+ * Read a new timestamp and update the before_stamp to make
+ * the next event after this one force using an absolute
+ * timestamp. This is in case an interrupt were to come in
+ * between E and F.
+ */
ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_set(&cpu_buffer->before_stamp, ts);
+
barrier();
- /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
- info->after < ts &&
- rb_time_cmpxchg(&cpu_buffer->write_stamp,
- info->after, ts)) {
- /* Nothing came after this event between C and E */
+ /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ /* Was interrupted before here, write_stamp must be valid */
+ RB_WARN_ON(cpu_buffer, !a_ok);
+ barrier();
+ /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+ info->after == info->before && info->after < ts) {
+ /*
+ * Nothing came after this event between C and F, it is
+ * safe to use info->after for the delta as it
+ * matched info->before and is still valid.
+ */
info->delta = ts - info->after;
} else {
/*
- * Interrupted between C and E:
+ * Interrupted between C and F:
* Lost the previous events time stamp. Just set the
* delta to zero, and this will be the same time as
* the event this event interrupted. And the events that
@@ -3714,6 +3649,12 @@ rb_reserve_next_event(struct trace_buffer *buffer,
int nr_loops = 0;
int add_ts_default;
+ /* ring buffer does cmpxchg, make sure it is safe in NMI context */
+ if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ (unlikely(in_nmi()))) {
+ return NULL;
+ }
+
rb_start_commit(cpu_buffer);
/* The commit page can not change after this */
@@ -3737,6 +3678,8 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
add_ts_default = RB_ADD_STAMP_ABSOLUTE;
info.length += RB_LEN_TIME_EXTEND;
+ if (info.length > BUF_MAX_DATA_SIZE)
+ goto out_fail;
} else {
add_ts_default = RB_ADD_STAMP_NONE;
}
@@ -5118,7 +5061,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
if (!iter)
return NULL;
- iter->event = kmalloc(BUF_MAX_DATA_SIZE, flags);
+ /* Holds the entire event: data and meta data */
+ iter->event = kmalloc(BUF_PAGE_SIZE, flags);
if (!iter->event) {
kfree(iter);
return NULL;
diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c
index 8dfe85499d4a..354c2117be43 100644
--- a/kernel/trace/synth_event_gen_test.c
+++ b/kernel/trace/synth_event_gen_test.c
@@ -477,6 +477,17 @@ static int __init synth_event_gen_test_init(void)
ret = test_trace_synth_event();
WARN_ON(ret);
+
+ /* Disable when done */
+ trace_array_set_clr_event(gen_synth_test->tr,
+ "synthetic",
+ "gen_synth_test", false);
+ trace_array_set_clr_event(empty_synth_test->tr,
+ "synthetic",
+ "empty_synth_test", false);
+ trace_array_set_clr_event(create_synth_test->tr,
+ "synthetic",
+ "create_synth_test", false);
out:
return ret;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fbcd3bafb93e..a0defe156b57 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1894,6 +1894,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu,
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&tr->max_lock);
+
+ /* Any waiters on the old snapshot buffer need to wake up */
+ ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
}
/**
@@ -1945,12 +1948,23 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
static int wait_on_pipe(struct trace_iterator *iter, int full)
{
+ int ret;
+
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
- full);
+ ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /*
+ * Make sure this is still the snapshot buffer, as if a snapshot were
+ * to happen, this would now be the main buffer.
+ */
+ if (iter->snapshot)
+ iter->array_buffer = &iter->tr->max_buffer;
+#endif
+ return ret;
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -4722,7 +4736,11 @@ static int s_show(struct seq_file *m, void *v)
iter->leftover = ret;
} else {
- print_trace_line(iter);
+ ret = print_trace_line(iter);
+ if (ret == TRACE_TYPE_PARTIAL_LINE) {
+ iter->seq.full = 0;
+ trace_seq_puts(&iter->seq, "[LINE TOO BIG]\n");
+ }
ret = trace_print_seq(m, &iter->seq);
/*
* If we overflow the seq_file buffer, then it will
@@ -4964,6 +4982,12 @@ int tracing_release_file_tr(struct inode *inode, struct file *filp)
return 0;
}
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp)
+{
+ tracing_release_file_tr(inode, filp);
+ return single_release(inode, filp);
+}
+
static int tracing_mark_open(struct inode *inode, struct file *filp)
{
stream_open(inode, filp);
@@ -6344,7 +6368,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
if (!tr->array_buffer.buffer)
return 0;
- /* Do not allow tracing while resizng ring buffer */
+ /* Do not allow tracing while resizing ring buffer */
tracing_stop_tr(tr);
ret = ring_buffer_resize(tr->array_buffer.buffer, size, cpu);
@@ -6352,7 +6376,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
goto out_start;
#ifdef CONFIG_TRACER_MAX_TRACE
- if (!tr->current_trace->use_max_tr)
+ if (!tr->allocated_snapshot)
goto out;
ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
@@ -8507,7 +8531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
wait_index = READ_ONCE(iter->wait_index);
- ret = wait_on_pipe(iter, iter->tr->buffer_percent);
+ ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
if (ret)
goto out;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b7f4ea25a194..0489e72c8169 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -617,6 +617,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
+int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
bool tracing_is_disabled(void);
bool tracer_tracing_is_on(struct trace_array *tr);
void tracer_tracing_on(struct trace_array *tr);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1abc07fba1b9..5ecf3c8bde20 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5623,10 +5623,12 @@ static int event_hist_open(struct inode *inode, struct file *file)
{
int ret;
- ret = security_locked_down(LOCKDOWN_TRACEFS);
+ ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ /* Clear private_data to avoid warning in single_open() */
+ file->private_data = NULL;
return single_open(file, hist_show, file);
}
@@ -5634,7 +5636,7 @@ const struct file_operations event_hist_fops = {
.open = event_hist_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_file_tr,
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
@@ -5900,10 +5902,12 @@ static int event_hist_debug_open(struct inode *inode, struct file *file)
{
int ret;
- ret = security_locked_down(LOCKDOWN_TRACEFS);
+ ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ /* Clear private_data to avoid warning in single_open() */
+ file->private_data = NULL;
return single_open(file, hist_debug_show, file);
}
@@ -5911,7 +5915,7 @@ const struct file_operations event_hist_debug_fops = {
.open = event_hist_debug_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_file_tr,
};
#endif
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 846e02c0fb59..e7af286af4f1 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -1137,7 +1137,7 @@ EXPORT_SYMBOL_GPL(synth_event_add_fields);
* @cmd: A pointer to the dynevent_cmd struct representing the new event
* @name: The name of the synthetic event
* @mod: The module creating the event, NULL if not created from a module
- * @args: Variable number of arg (pairs), one pair for each field
+ * @...: Variable number of arg (pairs), one pair for each field
*
* NOTE: Users normally won't want to call this function directly, but
* rather use the synth_event_gen_cmd_start() wrapper, which
@@ -1695,7 +1695,7 @@ __synth_event_trace_end(struct synth_event_trace_state *trace_state)
* synth_event_trace - Trace a synthetic event
* @file: The trace_event_file representing the synthetic event
* @n_vals: The number of values in vals
- * @args: Variable number of args containing the event values
+ * @...: Variable number of args containing the event values
*
* Trace a synthetic event using the values passed in the variable
* argument list.
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 9365ce407426..e76f5e1efdf2 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2177,14 +2177,12 @@ static int user_events_open(struct inode *node, struct file *file)
static ssize_t user_events_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *ppos)
{
- struct iovec iov;
struct iov_iter i;
if (unlikely(*ppos != 0))
return -EFAULT;
- if (unlikely(import_single_range(ITER_SOURCE, (char __user *)ubuf,
- count, &iov, &i)))
+ if (unlikely(import_ubuf(ITER_SOURCE, (char __user *)ubuf, count, &i)))
return -EFAULT;
return user_events_write_core(file, &i);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..3e7fa44dc2b2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1587,11 +1587,12 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
{
struct print_entry *field;
struct trace_seq *s = &iter->seq;
+ int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
seq_print_ip_sym(s, field->ip, flags);
- trace_seq_printf(s, ": %s", field->buf);
+ trace_seq_printf(s, ": %.*s", max, field->buf);
return trace_handle_return(s);
}
@@ -1600,10 +1601,11 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
struct print_entry *field;
+ int max = iter->ent_size - offsetof(struct print_entry, buf);
trace_assign_type(field, iter->ent);
- trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf);
+ trace_seq_printf(&iter->seq, "# %lx %.*s", field->ip, max, field->buf);
return trace_handle_return(&iter->seq);
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 99c051de412a..a84b85d8aac1 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -151,7 +151,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
return -ENOMEM;
if (addr == FETCH_TOKEN_COMM)
- ret = strlcpy(dst, current->comm, maxlen);
+ ret = strscpy(dst, current->comm, maxlen);
else
ret = strncpy_from_user(dst, src, maxlen);
if (ret >= 0) {