summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile7
-rw-r--r--kernel/acct.c6
-rw-r--r--kernel/bpf/cgroup.c55
-rw-r--r--kernel/bpf/devmap.c28
-rw-r--r--kernel/bpf/stackmap.c17
-rw-r--r--kernel/bpf/syscall.c44
-rw-r--r--kernel/bpf/verifier.c2
-rw-r--r--kernel/cgroup/cpuset.c4
-rw-r--r--kernel/context_tracking.c14
-rw-r--r--kernel/debug/debug_core.c10
-rw-r--r--kernel/debug/gdbstub.c6
-rw-r--r--kernel/debug/kdb/kdb_bt.c15
-rw-r--r--kernel/debug/kdb/kdb_io.c72
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/debug/kdb/kdb_support.c7
-rw-r--r--kernel/events/core.c10
-rw-r--r--kernel/events/uprobes.c36
-rw-r--r--kernel/exit.c11
-rw-r--r--kernel/fork.c15
-rw-r--r--kernel/futex.c4
-rw-r--r--kernel/gcov/Kconfig28
-rw-r--r--kernel/gcov/Makefile3
-rw-r--r--kernel/gcov/gcc_3_4.c573
-rw-r--r--kernel/irq/Kconfig4
-rw-r--r--kernel/kcov.c26
-rw-r--r--kernel/kcsan/Makefile14
-rw-r--r--kernel/kcsan/atomic.h20
-rw-r--r--kernel/kcsan/core.c850
-rw-r--r--kernel/kcsan/debugfs.c349
-rw-r--r--kernel/kcsan/encoding.h95
-rw-r--r--kernel/kcsan/kcsan.h142
-rw-r--r--kernel/kcsan/report.c634
-rw-r--r--kernel/kcsan/test.c131
-rw-r--r--kernel/kexec_file.c34
-rw-r--r--kernel/kprobes.c61
-rw-r--r--kernel/kthread.c80
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lockdep.c8
-rw-r--r--kernel/locking/rtmutex-debug.c2
-rw-r--r--kernel/module.c4
-rw-r--r--kernel/nsproxy.c2
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/power/Kconfig26
-rw-r--r--kernel/power/snapshot.c1
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/printk/printk.c24
-rw-r--r--kernel/printk/printk_safe.c7
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/sched/Makefile6
-rw-r--r--kernel/sched/core.c8
-rw-r--r--kernel/sched/fair.c4
-rw-r--r--kernel/sched/idle.c15
-rw-r--r--kernel/scs.c2
-rw-r--r--kernel/softirq.c44
-rw-r--r--kernel/sys.c37
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/trace/Kconfig52
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/blktrace.c66
-rw-r--r--kernel/trace/bpf_trace.c166
-rw-r--r--kernel/trace/ftrace.c28
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.c8
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_boot.c10
-rw-r--r--kernel/trace/trace_entries.h14
-rw-r--r--kernel/trace/trace_events.c4
-rw-r--r--kernel/trace/trace_events_hist.c2073
-rw-r--r--kernel/trace/trace_events_synth.c1789
-rw-r--r--kernel/trace/trace_events_trigger.c21
-rw-r--r--kernel/trace/trace_export.c16
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_kprobe.c74
-rw-r--r--kernel/trace/trace_output.c4
-rw-r--r--kernel/trace/trace_preemptirq.c10
-rw-r--r--kernel/trace/trace_probe.c6
-rw-r--r--kernel/trace/trace_probe.h2
-rw-r--r--kernel/trace/trace_stack.c5
-rw-r--r--kernel/trace/trace_synth.h36
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/watch_queue.c655
-rw-r--r--kernel/workqueue.c10
83 files changed, 5792 insertions, 2886 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index c332eb9d4841..f3218bc5ec69 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,9 @@ endif
# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
# in coverage traces.
KCOV_INSTRUMENT_softirq.o := n
+# Avoid KCSAN instrumentation in softirq ("No shared variables, all the data
+# are CPU local" => assume no data races), to reduce overhead in interrupts.
+KCSAN_SANITIZE_softirq.o = n
# These are called from save_stack_trace() on slub debug path,
# and produce insane amounts of uninteresting coverage.
KCOV_INSTRUMENT_module.o := n
@@ -31,6 +34,7 @@ KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
KASAN_SANITIZE_kcov.o := n
+KCSAN_SANITIZE_kcov.o := n
CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
# cond_syscall is currently not LTO compatible
@@ -103,6 +107,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
+obj-$(CONFIG_KCSAN) += kcsan/
obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
obj-$(CONFIG_PERF_EVENTS) += events/
@@ -116,11 +121,13 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o
obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_RSEQ) += rseq.o
+obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
+KCSAN_SANITIZE_stackleak.o := n
KCOV_INSTRUMENT_stackleak.o := n
$(obj)/configs.o: $(obj)/config_data.gz
diff --git a/kernel/acct.c b/kernel/acct.c
index 11ff4a596d6b..b0c5b3a9f5af 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -40,7 +40,7 @@
* is one more bug... 10/11/98, AV.
*
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
- * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
+ * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
@@ -541,13 +541,13 @@ void acct_collect(long exitcode, int group_dead)
if (group_dead && current->mm) {
struct vm_area_struct *vma;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
vma = current->mm->mmap;
while (vma) {
vsize += vma->vm_end - vma->vm_start;
vma = vma->vm_next;
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
}
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index fdf7836750a3..ac53102e244a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -378,7 +378,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
}
list_for_each_entry(pl, progs, node) {
- if (prog && pl->prog == prog)
+ if (prog && pl->prog == prog && prog != replace_prog)
/* disallow attaching the same prog twice */
return ERR_PTR(-EINVAL);
if (link && pl->link == link)
@@ -1276,16 +1276,23 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
{
- if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+ if (unlikely(max_optlen < 0))
return -EINVAL;
+ if (unlikely(max_optlen > PAGE_SIZE)) {
+ /* We don't expose optvals that are greater than PAGE_SIZE
+ * to the BPF program.
+ */
+ max_optlen = PAGE_SIZE;
+ }
+
ctx->optval = kzalloc(max_optlen, GFP_USER);
if (!ctx->optval)
return -ENOMEM;
ctx->optval_end = ctx->optval + max_optlen;
- return 0;
+ return max_optlen;
}
static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
@@ -1319,13 +1326,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
max_optlen = max_t(int, 16, *optlen);
- ret = sockopt_alloc_buf(&ctx, max_optlen);
- if (ret)
- return ret;
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ if (max_optlen < 0)
+ return max_optlen;
ctx.optlen = *optlen;
- if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+ if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
@@ -1353,8 +1360,14 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
/* export any potential modifications */
*level = ctx.level;
*optname = ctx.optname;
- *optlen = ctx.optlen;
- *kernel_optval = ctx.optval;
+
+ /* optlen == 0 from BPF indicates that we should
+ * use original userspace data.
+ */
+ if (ctx.optlen != 0) {
+ *optlen = ctx.optlen;
+ *kernel_optval = ctx.optval;
+ }
}
out:
@@ -1385,12 +1398,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
return retval;
- ret = sockopt_alloc_buf(&ctx, max_optlen);
- if (ret)
- return ret;
-
ctx.optlen = max_optlen;
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen);
+ if (max_optlen < 0)
+ return max_optlen;
+
if (!retval) {
/* If kernel getsockopt finished successfully,
* copy whatever was returned to the user back
@@ -1404,10 +1417,8 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
- if (ctx.optlen > max_optlen)
- ctx.optlen = max_optlen;
-
- if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+ if (copy_from_user(ctx.optval, optval,
+ min(ctx.optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
@@ -1436,10 +1447,12 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
- if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
- put_user(ctx.optlen, optlen)) {
- ret = -EFAULT;
- goto out;
+ if (ctx.optlen != 0) {
+ if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+ put_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
ret = ctx.retval;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 854b09beb16b..5fdbc776a760 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -60,15 +60,6 @@ struct xdp_dev_bulk_queue {
unsigned int count;
};
-/* DEVMAP values */
-struct bpf_devmap_val {
- u32 ifindex; /* device index */
- union {
- int fd; /* prog fd on map write */
- u32 id; /* prog id on map read */
- } bpf_prog;
-};
-
struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
@@ -95,12 +86,13 @@ static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
-static struct hlist_head *dev_map_create_hash(unsigned int entries)
+static struct hlist_head *dev_map_create_hash(unsigned int entries,
+ int numa_node)
{
int i;
struct hlist_head *hash;
- hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
+ hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node);
if (hash != NULL)
for (i = 0; i < entries; i++)
INIT_HLIST_HEAD(&hash[i]);
@@ -154,7 +146,8 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
return -EINVAL;
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
+ dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
+ dtab->map.numa_node);
if (!dtab->dev_index_head)
goto free_charge;
@@ -241,7 +234,7 @@ static void dev_map_free(struct bpf_map *map)
}
}
- kfree(dtab->dev_index_head);
+ bpf_map_area_free(dtab->dev_index_head);
} else {
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
@@ -479,6 +472,7 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev,
struct xdp_txq_info txq = { .dev = dev };
u32 act;
+ xdp_set_data_meta_invalid(xdp);
xdp->txq = &txq;
act = bpf_prog_run_xdp(xdp_prog, xdp);
@@ -618,7 +612,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
if (!dev->dev)
goto err_out;
- if (val->bpf_prog.fd >= 0) {
+ if (val->bpf_prog.fd > 0) {
prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
BPF_PROG_TYPE_XDP, false);
if (IS_ERR(prog))
@@ -652,8 +646,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct bpf_devmap_val val = { .bpf_prog.fd = -1 };
struct bpf_dtab_netdev *dev, *old_dev;
+ struct bpf_devmap_val val = {};
u32 i = *(u32 *)key;
if (unlikely(map_flags > BPF_EXIST))
@@ -669,7 +663,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
if (!val.ifindex) {
dev = NULL;
/* can not specify fd if ifindex is 0 */
- if (val.bpf_prog.fd != -1)
+ if (val.bpf_prog.fd > 0)
return -EINVAL;
} else {
dev = __dev_map_alloc_node(net, dtab, &val, i);
@@ -699,8 +693,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
void *key, void *value, u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct bpf_devmap_val val = { .bpf_prog.fd = -1 };
struct bpf_dtab_netdev *dev, *old_dev;
+ struct bpf_devmap_val val = {};
u32 idx = *(u32 *)key;
unsigned long flags;
int err = -EEXIST;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 7b8381ce40a0..599488f25e40 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -33,7 +33,7 @@ struct bpf_stack_map {
/* irq_work to run up_read() for build_id lookup in nmi context */
struct stack_map_irq_work {
struct irq_work irq_work;
- struct rw_semaphore *sem;
+ struct mm_struct *mm;
};
static void do_up_read(struct irq_work *entry)
@@ -44,8 +44,7 @@ static void do_up_read(struct irq_work *entry)
return;
work = container_of(entry, struct stack_map_irq_work, irq_work);
- up_read_non_owner(work->sem);
- work->sem = NULL;
+ mmap_read_unlock_non_owner(work->mm);
}
static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
@@ -317,7 +316,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
* with build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
- down_read_trylock(&current->mm->mmap_sem) == 0) {
+ !mmap_read_trylock_non_owner(current->mm)) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -342,16 +341,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
if (!work) {
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock_non_owner(current->mm);
} else {
- work->sem = &current->mm->mmap_sem;
+ work->mm = current->mm;
irq_work_queue(&work->irq_work);
- /*
- * The irq_work will release the mmap_sem with
- * up_read_non_owner(). The rwsem_release() is called
- * here to release the lock from lockdep's perspective.
- */
- rwsem_release(&current->mm->mmap_sem.dep_map, _RET_IP_);
}
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4d530b1d5683..8da159936bab 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -25,7 +25,7 @@
#include <linux/nospec.h>
#include <linux/audit.h>
#include <uapi/linux/btf.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <linux/bpf_lsm.h>
#include <linux/poll.h>
#include <linux/bpf-netns.h>
@@ -74,32 +74,19 @@ int bpf_check_uarg_tail_zero(void __user *uaddr,
size_t expected_size,
size_t actual_size)
{
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
- int err;
+ unsigned char __user *addr = uaddr + expected_size;
+ int res;
if (unlikely(actual_size > PAGE_SIZE)) /* silly large */
return -E2BIG;
- if (unlikely(!access_ok(uaddr, actual_size)))
- return -EFAULT;
-
if (actual_size <= expected_size)
return 0;
- addr = uaddr + expected_size;
- end = uaddr + actual_size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
-
- return 0;
+ res = check_zeroed_user(addr, actual_size - expected_size);
+ if (res < 0)
+ return res;
+ return res ? 0 : -E2BIG;
}
const struct bpf_map_ops bpf_map_offload_ops = {
@@ -3158,6 +3145,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
struct bpf_insn *insns;
u32 off, type;
u64 imm;
+ u8 code;
int i;
insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog),
@@ -3166,21 +3154,27 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
return insns;
for (i = 0; i < prog->len; i++) {
- if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) {
+ code = insns[i].code;
+
+ if (code == (BPF_JMP | BPF_TAIL_CALL)) {
insns[i].code = BPF_JMP | BPF_CALL;
insns[i].imm = BPF_FUNC_tail_call;
/* fall-through */
}
- if (insns[i].code == (BPF_JMP | BPF_CALL) ||
- insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) {
- if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS))
+ if (code == (BPF_JMP | BPF_CALL) ||
+ code == (BPF_JMP | BPF_CALL_ARGS)) {
+ if (code == (BPF_JMP | BPF_CALL_ARGS))
insns[i].code = BPF_JMP | BPF_CALL;
if (!bpf_dump_raw_ok())
insns[i].imm = 0;
continue;
}
+ if (BPF_CLASS(code) == BPF_LDX && BPF_MODE(code) == BPF_PROBE_MEM) {
+ insns[i].code = BPF_LDX | BPF_SIZE(code) | BPF_MEM;
+ continue;
+ }
- if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW))
+ if (code != (BPF_LD | BPF_IMM | BPF_DW))
continue;
imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5c7bbaac81ef..34cde841ab68 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7552,7 +7552,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
const struct btf *btf;
void __user *urecord;
u32 prev_offset = 0;
- int ret = 0;
+ int ret = -ENOMEM;
nfuncs = attr->func_info_cnt;
if (!nfuncs)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 729d3a5c772e..642415b8c3c9 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1655,7 +1655,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
guarantee_online_mems(cs, &newmems);
/*
- * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+ * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
@@ -1760,7 +1760,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
*
* Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
*/
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index ce430885c26c..36a98c48aedc 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(context_tracking_key);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
-static bool context_tracking_recursion_enter(void)
+static noinstr bool context_tracking_recursion_enter(void)
{
int recursion;
@@ -45,7 +45,7 @@ static bool context_tracking_recursion_enter(void)
return false;
}
-static void context_tracking_recursion_exit(void)
+static __always_inline void context_tracking_recursion_exit(void)
{
__this_cpu_dec(context_tracking.recursion);
}
@@ -59,7 +59,7 @@ static void context_tracking_recursion_exit(void)
* instructions to execute won't use any RCU read side critical section
* because this function sets RCU in extended quiescent state.
*/
-void __context_tracking_enter(enum ctx_state state)
+void noinstr __context_tracking_enter(enum ctx_state state)
{
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
@@ -77,8 +77,10 @@ void __context_tracking_enter(enum ctx_state state)
* on the tick.
*/
if (state == CONTEXT_USER) {
+ instrumentation_begin();
trace_user_enter(0);
vtime_user_enter(current);
+ instrumentation_end();
}
rcu_user_enter();
}
@@ -99,7 +101,6 @@ void __context_tracking_enter(enum ctx_state state)
}
context_tracking_recursion_exit();
}
-NOKPROBE_SYMBOL(__context_tracking_enter);
EXPORT_SYMBOL_GPL(__context_tracking_enter);
void context_tracking_enter(enum ctx_state state)
@@ -142,7 +143,7 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void __context_tracking_exit(enum ctx_state state)
+void noinstr __context_tracking_exit(enum ctx_state state)
{
if (!context_tracking_recursion_enter())
return;
@@ -155,15 +156,16 @@ void __context_tracking_exit(enum ctx_state state)
*/
rcu_user_exit();
if (state == CONTEXT_USER) {
+ instrumentation_begin();
vtime_user_exit(current);
trace_user_exit(0);
+ instrumentation_end();
}
}
__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
}
-NOKPROBE_SYMBOL(__context_tracking_exit);
EXPORT_SYMBOL_GPL(__context_tracking_exit);
void context_tracking_exit(enum ctx_state state)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index ccc0f98abdd4..9e5934780f41 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -169,18 +169,18 @@ int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
{
int err;
- err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
+ err = copy_from_kernel_nofault(bpt->saved_instr, (char *)bpt->bpt_addr,
BREAK_INSTR_SIZE);
if (err)
return err;
- err = probe_kernel_write((char *)bpt->bpt_addr,
+ err = copy_to_kernel_nofault((char *)bpt->bpt_addr,
arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
return err;
}
int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
{
- return probe_kernel_write((char *)bpt->bpt_addr,
+ return copy_to_kernel_nofault((char *)bpt->bpt_addr,
(char *)bpt->saved_instr, BREAK_INSTR_SIZE);
}
@@ -587,6 +587,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
arch_kgdb_ops.disable_hw_break(regs);
acquirelock:
+ rcu_read_lock();
/*
* Interrupts will be restored by the 'trap return' code, except when
* single stepping.
@@ -646,6 +647,7 @@ return_normal:
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return 0;
}
cpu_relax();
@@ -664,6 +666,7 @@ return_normal:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
goto acquirelock;
}
@@ -787,6 +790,7 @@ kgdb_restore:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return kgdb_info[cpu].ret_state;
}
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 4b280fc7dd67..61774aec46b4 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -247,7 +247,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
*/
tmp = buf + count;
- err = probe_kernel_read(tmp, mem, count);
+ err = copy_from_kernel_nofault(tmp, mem, count);
if (err)
return NULL;
while (count > 0) {
@@ -283,7 +283,7 @@ int kgdb_hex2mem(char *buf, char *mem, int count)
*tmp_raw |= hex_to_bin(*tmp_hex--) << 4;
}
- return probe_kernel_write(mem, tmp_raw, count);
+ return copy_to_kernel_nofault(mem, tmp_raw, count);
}
/*
@@ -335,7 +335,7 @@ static int kgdb_ebin2mem(char *buf, char *mem, int count)
size++;
}
- return probe_kernel_write(mem, c, size);
+ return copy_to_kernel_nofault(mem, c, size);
}
#if DBG_MAX_REG_NUM > 0
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 3de0cc780c16..18e03aba2cfc 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -21,17 +21,18 @@
static void kdb_show_stack(struct task_struct *p, void *addr)
{
- int old_lvl = console_loglevel;
-
- console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_trap_printk++;
- if (!addr && kdb_task_has_cpu(p))
+ if (!addr && kdb_task_has_cpu(p)) {
+ int old_lvl = console_loglevel;
+
+ console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
kdb_dump_stack_on_cpu(kdb_process_cpu(p));
- else
- show_stack(p, addr);
+ console_loglevel = old_lvl;
+ } else {
+ show_stack(p, addr, KERN_EMERG);
+ }
- console_loglevel = old_lvl;
kdb_trap_printk--;
}
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 924bc9298a42..683a799618ad 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -542,6 +542,44 @@ static int kdb_search_string(char *searched, char *searchfor)
return 0;
}
+static void kdb_msg_write(const char *msg, int msg_len)
+{
+ struct console *c;
+
+ if (msg_len == 0)
+ return;
+
+ if (dbg_io_ops) {
+ const char *cp = msg;
+ int len = msg_len;
+
+ while (len--) {
+ dbg_io_ops->write_char(*cp);
+ cp++;
+ }
+ }
+
+ for_each_console(c) {
+ if (!(c->flags & CON_ENABLED))
+ continue;
+ if (c == dbg_io_ops->cons)
+ continue;
+ /*
+ * Set oops_in_progress to encourage the console drivers to
+ * disregard their internal spin locks: in the current calling
+ * context the risk of deadlock is a bigger problem than risks
+ * due to re-entering the console driver. We operate directly on
+ * oops_in_progress rather than using bust_spinlocks() because
+ * the calls bust_spinlocks() makes on exit are not appropriate
+ * for this calling context.
+ */
+ ++oops_in_progress;
+ c->write(c, msg, msg_len);
+ --oops_in_progress;
+ touch_nmi_watchdog();
+ }
+}
+
int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
{
int diag;
@@ -553,7 +591,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
int this_cpu, old_cpu;
char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
char *moreprompt = "more> ";
- struct console *c;
unsigned long uninitialized_var(flags);
/* Serialize kdb_printf if multiple cpus try to write at once.
@@ -687,22 +724,11 @@ kdb_printit:
*/
retlen = strlen(kdb_buffer);
cp = (char *) printk_skip_headers(kdb_buffer);
- if (!dbg_kdb_mode && kgdb_connected) {
+ if (!dbg_kdb_mode && kgdb_connected)
gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
- } else {
- if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = retlen - (cp - kdb_buffer);
- cp2 = cp;
- while (len--) {
- dbg_io_ops->write_char(*cp2);
- cp2++;
- }
- }
- for_each_console(c) {
- c->write(c, cp, retlen - (cp - kdb_buffer));
- touch_nmi_watchdog();
- }
- }
+ else
+ kdb_msg_write(cp, retlen - (cp - kdb_buffer));
+
if (logging) {
saved_loglevel = console_loglevel;
console_loglevel = CONSOLE_LOGLEVEL_SILENT;
@@ -751,19 +777,7 @@ kdb_printit:
moreprompt = "more> ";
kdb_input_flush();
-
- if (dbg_io_ops && !dbg_io_ops->is_console) {
- len = strlen(moreprompt);
- cp = moreprompt;
- while (len--) {
- dbg_io_ops->write_char(*cp);
- cp++;
- }
- }
- for_each_console(c) {
- c->write(c, moreprompt, strlen(moreprompt));
- touch_nmi_watchdog();
- }
+ kdb_msg_write(moreprompt, strlen(moreprompt));
if (logging)
printk("%s", moreprompt);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index ec190569f690..5c7949061671 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2326,7 +2326,8 @@ void kdb_ps1(const struct task_struct *p)
int cpu;
unsigned long tmp;
- if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ if (!p ||
+ copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return;
cpu = kdb_process_cpu(p);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index b8e6306e7e13..004c5b6c87f8 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -325,7 +325,7 @@ char *kdb_strdup(const char *str, gfp_t type)
*/
int kdb_getarea_size(void *res, unsigned long addr, size_t size)
{
- int ret = probe_kernel_read((char *)res, (char *)addr, size);
+ int ret = copy_from_kernel_nofault((char *)res, (char *)addr, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
@@ -350,7 +350,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
- int ret = probe_kernel_read((char *)addr, (char *)res, size);
+ int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
@@ -624,7 +624,8 @@ char kdb_task_state_char (const struct task_struct *p)
char state;
unsigned long tmp;
- if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
+ if (!p ||
+ copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return 'E';
cpu = kdb_process_cpu(p);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 63d66bbebbd5..856d98c36f56 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1316,7 +1316,7 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event::child_mutex;
* perf_event_context::lock
* perf_event::mmap_mutex
- * mmap_sem
+ * mmap_lock
* perf_addr_filters_head::lock
*
* cpu_hotplug_lock
@@ -3080,7 +3080,7 @@ static int perf_event_stop(struct perf_event *event, int restart)
* pre-existing mappings, called once when new filters arrive via SET_FILTER
* ioctl;
* (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
- * registered mapping, called for every new mmap(), with mm::mmap_sem down
+ * registered mapping, called for every new mmap(), with mm::mmap_lock down
* for reading;
* (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
* of exec.
@@ -9742,7 +9742,7 @@ static void perf_addr_filters_splice(struct perf_event *event,
/*
* Scan through mm's vmas and see if one of them matches the
* @filter; if so, adjust filter's address range.
- * Called with mm::mmap_sem down for reading.
+ * Called with mm::mmap_lock down for reading.
*/
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
struct mm_struct *mm,
@@ -9784,7 +9784,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
if (!mm)
goto restart;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
}
raw_spin_lock_irqsave(&ifh->lock, flags);
@@ -9810,7 +9810,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
raw_spin_unlock_irqrestore(&ifh->lock, flags);
if (ifh->nr_file_filters) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmput(mm);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index e51ec844c87c..bb0862873dba 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -457,7 +457,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
- * Called with mm->mmap_sem held for write.
+ * Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
@@ -861,10 +861,6 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (ret)
goto out;
- /* uprobe_write_opcode() assumes we don't cross page boundary */
- BUG_ON((uprobe->offset & ~PAGE_MASK) +
- UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-
smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
@@ -1058,7 +1054,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (err && is_register)
goto free;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
file_inode(vma->vm_file) != uprobe->inode)
@@ -1080,7 +1076,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
}
unlock:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
free:
mmput(mm);
info = free_map_info(info);
@@ -1160,6 +1156,15 @@ static int __uprobe_register(struct inode *inode, loff_t offset,
if (offset > i_size_read(inode))
return -EINVAL;
+ /*
+ * This ensures that copy_from_page(), copy_to_page() and
+ * __update_ref_ctr() can't cross page boundary.
+ */
+ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
+ return -EINVAL;
+ if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
+ return -EINVAL;
+
retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
if (!uprobe)
@@ -1235,7 +1240,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
struct vm_area_struct *vma;
int err = 0;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
unsigned long vaddr;
loff_t offset;
@@ -1252,7 +1257,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, mm, vaddr);
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -1349,7 +1354,7 @@ static int delayed_ref_ctr_inc(struct vm_area_struct *vma)
}
/*
- * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
+ * Called from mmap_region/vma_adjust with mm->mmap_lock acquired.
*
* Currently we ignore all errors and always return 0, the callers
* can't handle the failure anyway.
@@ -1439,7 +1444,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
struct vm_area_struct *vma;
int ret;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
if (mm->uprobes_state.xol_area) {
@@ -1469,7 +1474,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
/* pairs with get_xol_area() */
smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
fail:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
@@ -2008,6 +2013,9 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
uprobe_opcode_t opcode;
int result;
+ if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
+ return -EINVAL;
+
pagefault_disable();
result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
@@ -2039,7 +2047,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, bp_vaddr);
if (vma && vma->vm_start <= bp_vaddr) {
if (valid_vma(vma, false)) {
@@ -2057,7 +2065,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
mmf_recalc_uprobes(mm);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return uprobe;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index c300253a7b8e..727150f28103 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -66,7 +66,6 @@
#include <linux/uaccess.h>
#include <asm/unistd.h>
-#include <asm/pgtable.h>
#include <asm/mmu_context.h>
static void __unhash_process(struct task_struct *p, bool group_dead)
@@ -441,17 +440,17 @@ static void exit_mm(void)
sync_mm_rss(mm);
/*
* Serialize with any possible pending coredump.
- * We must hold mmap_sem around checking core_state
+ * We must hold mmap_lock around checking core_state
* and clearing tsk->mm. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group with ->mm != NULL.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
self.task = current;
self.next = xchg(&core_state->dumper.next, &self);
@@ -469,14 +468,14 @@ static void exit_mm(void)
freezable_schedule();
}
__set_current_state(TASK_RUNNING);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
}
mmgrab(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
current->mm = NULL;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
enter_lazy_tlb(mm, current);
task_unlock(current);
mm_update_next_owner(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index cefe8745c46e..142b23645d82 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,7 +96,6 @@
#include <linux/kasan.h>
#include <linux/scs.h>
-#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -493,7 +492,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
LIST_HEAD(uf);
uprobe_start_dup_mmap();
- if (down_write_killable(&oldmm->mmap_sem)) {
+ if (mmap_write_lock_killable(oldmm)) {
retval = -EINTR;
goto fail_uprobe_end;
}
@@ -502,7 +501,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
/*
* Not linked in yet - no deadlock potential:
*/
- down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
/* No ordering required: file already has been exposed. */
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
@@ -618,9 +617,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
/* a new mm has just been created */
retval = arch_dup_mmap(oldmm, mm);
out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
flush_tlb_mm(oldmm);
- up_write(&oldmm->mmap_sem);
+ mmap_write_unlock(oldmm);
dup_userfaultfd_complete(&uf);
fail_uprobe_end:
uprobe_end_dup_mmap();
@@ -650,9 +649,9 @@ static inline void mm_free_pgd(struct mm_struct *mm)
#else
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
- down_write(&oldmm->mmap_sem);
+ mmap_write_lock(oldmm);
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
- up_write(&oldmm->mmap_sem);
+ mmap_write_unlock(oldmm);
return 0;
}
#define mm_alloc_pgd(mm) (0)
@@ -1023,7 +1022,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
- init_rwsem(&mm->mmap_sem);
+ mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
mm_pgtables_bytes_init(mm);
diff --git a/kernel/futex.c b/kernel/futex.c
index b4b9f960b610..e646661f6282 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -698,10 +698,10 @@ static int fault_in_user_writeable(u32 __user *uaddr)
struct mm_struct *mm = current->mm;
int ret;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
FAULT_FLAG_WRITE, NULL);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret < 0 ? ret : 0;
}
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 3941a9c48f83..3110c77230c7 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -6,7 +6,7 @@ config GCOV_KERNEL
depends on DEBUG_FS
select CONSTRUCTORS if !UML
default n
- ---help---
+ help
This option enables gcov-based code profiling (e.g. for code coverage
measurements).
@@ -42,7 +42,7 @@ config GCOV_PROFILE_ALL
depends on GCOV_KERNEL
depends on ARCH_HAS_GCOV_PROFILE_ALL
default n
- ---help---
+ help
This options activates profiling for the entire kernel.
If unsure, say N.
@@ -51,28 +51,4 @@ config GCOV_PROFILE_ALL
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
-choice
- prompt "Specify GCOV format"
- depends on GCOV_KERNEL
- depends on CC_IS_GCC
- ---help---
- The gcov format is usually determined by the GCC version, and the
- default is chosen according to your GCC version. However, there are
- exceptions where format changes are integrated in lower-version GCCs.
- In such a case, change this option to adjust the format used in the
- kernel accordingly.
-
-config GCOV_FORMAT_3_4
- bool "GCC 3.4 format"
- depends on GCC_VERSION < 40700
- ---help---
- Select this option to use the format defined by GCC 3.4.
-
-config GCOV_FORMAT_4_7
- bool "GCC 4.7 format"
- ---help---
- Select this option to use the format defined by GCC 4.7.
-
-endchoice
-
endmenu
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index d66a74b0f100..16f8ecc7d882 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -2,6 +2,5 @@
ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-y := base.o fs.o
-obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o
-obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o
+obj-$(CONFIG_CC_IS_GCC) += gcc_base.o gcc_4_7.o
obj-$(CONFIG_CC_IS_CLANG) += clang.o
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
deleted file mode 100644
index acb83558e5df..000000000000
--- a/kernel/gcov/gcc_3_4.c
+++ /dev/null
@@ -1,573 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This code provides functions to handle gcc's profiling data format
- * introduced with gcc 3.4. Future versions of gcc may change the gcov
- * format (as happened before), so all format-specific information needs
- * to be kept modular and easily exchangeable.
- *
- * This file is based on gcc-internal definitions. Functions and data
- * structures are defined to be compatible with gcc counterparts.
- * For a better understanding, refer to gcc source: gcc/gcov-io.h.
- *
- * Copyright IBM Corp. 2009
- * Author(s): Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
- *
- * Uses gcc-internal data definitions.
- */
-
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/seq_file.h>
-#include <linux/vmalloc.h>
-#include "gcov.h"
-
-#define GCOV_COUNTERS 5
-
-static struct gcov_info *gcov_info_head;
-
-/**
- * struct gcov_fn_info - profiling meta data per function
- * @ident: object file-unique function identifier
- * @checksum: function checksum
- * @n_ctrs: number of values per counter type belonging to this function
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time.
- */
-struct gcov_fn_info {
- unsigned int ident;
- unsigned int checksum;
- unsigned int n_ctrs[];
-};
-
-/**
- * struct gcov_ctr_info - profiling data per counter type
- * @num: number of counter values for this type
- * @values: array of counter values for this type
- * @merge: merge function for counter values of this type (unused)
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the values array.
- */
-struct gcov_ctr_info {
- unsigned int num;
- gcov_type *values;
- void (*merge)(gcov_type *, unsigned int);
-};
-
-/**
- * struct gcov_info - profiling data per object file
- * @version: gcov version magic indicating the gcc version used for compilation
- * @next: list head for a singly-linked list
- * @stamp: time stamp
- * @filename: name of the associated gcov data file
- * @n_functions: number of instrumented functions
- * @functions: function data
- * @ctr_mask: mask specifying which counter types are active
- * @counts: counter data per counter type
- *
- * This data is generated by gcc during compilation and doesn't change
- * at run-time with the exception of the next pointer.
- */
-struct gcov_info {
- unsigned int version;
- struct gcov_info *next;
- unsigned int stamp;
- const char *filename;
- unsigned int n_functions;
- const struct gcov_fn_info *functions;
- unsigned int ctr_mask;
- struct gcov_ctr_info counts[];
-};
-
-/**
- * gcov_info_filename - return info filename
- * @info: profiling data set
- */
-const char *gcov_info_filename(struct gcov_info *info)
-{
- return info->filename;
-}
-
-/**
- * gcov_info_version - return info version
- * @info: profiling data set
- */
-unsigned int gcov_info_version(struct gcov_info *info)
-{
- return info->version;
-}
-
-/**
- * gcov_info_next - return next profiling data set
- * @info: profiling data set
- *
- * Returns next gcov_info following @info or first gcov_info in the chain if
- * @info is %NULL.
- */
-struct gcov_info *gcov_info_next(struct gcov_info *info)
-{
- if (!info)
- return gcov_info_head;
-
- return info->next;
-}
-
-/**
- * gcov_info_link - link/add profiling data set to the list
- * @info: profiling data set
- */
-void gcov_info_link(struct gcov_info *info)
-{
- info->next = gcov_info_head;
- gcov_info_head = info;
-}
-
-/**
- * gcov_info_unlink - unlink/remove profiling data set from the list
- * @prev: previous profiling data set
- * @info: profiling data set
- */
-void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info)
-{
- if (prev)
- prev->next = info->next;
- else
- gcov_info_head = info->next;
-}
-
-/**
- * gcov_info_within_module - check if a profiling data set belongs to a module
- * @info: profiling data set
- * @mod: module
- *
- * Returns true if profiling data belongs module, false otherwise.
- */
-bool gcov_info_within_module(struct gcov_info *info, struct module *mod)
-{
- return within_module((unsigned long)info, mod);
-}
-
-/* Symbolic links to be created for each profiling data file. */
-const struct gcov_link gcov_link[] = {
- { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */
- { 0, NULL},
-};
-
-/*
- * Determine whether a counter is active. Based on gcc magic. Doesn't change
- * at run-time.
- */
-static int counter_active(struct gcov_info *info, unsigned int type)
-{
- return (1 << type) & info->ctr_mask;
-}
-
-/* Determine number of active counters. Based on gcc magic. */
-static unsigned int num_counter_active(struct gcov_info *info)
-{
- unsigned int i;
- unsigned int result = 0;
-
- for (i = 0; i < GCOV_COUNTERS; i++) {
- if (counter_active(info, i))
- result++;
- }
- return result;
-}
-
-/**
- * gcov_info_reset - reset profiling data to zero
- * @info: profiling data set
- */
-void gcov_info_reset(struct gcov_info *info)
-{
- unsigned int active = num_counter_active(info);
- unsigned int i;
-
- for (i = 0; i < active; i++) {
- memset(info->counts[i].values, 0,
- info->counts[i].num * sizeof(gcov_type));
- }
-}
-
-/**
- * gcov_info_is_compatible - check if profiling data can be added
- * @info1: first profiling data set
- * @info2: second profiling data set
- *
- * Returns non-zero if profiling data can be added, zero otherwise.
- */
-int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
-{
- return (info1->stamp == info2->stamp);
-}
-
-/**
- * gcov_info_add - add up profiling data
- * @dest: profiling data set to which data is added
- * @source: profiling data set which is added
- *
- * Adds profiling counts of @source to @dest.
- */
-void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
-{
- unsigned int i;
- unsigned int j;
-
- for (i = 0; i < num_counter_active(dest); i++) {
- for (j = 0; j < dest->counts[i].num; j++) {
- dest->counts[i].values[j] +=
- source->counts[i].values[j];
- }
- }
-}
-
-/* Get size of function info entry. Based on gcc magic. */
-static size_t get_fn_size(struct gcov_info *info)
-{
- size_t size;
-
- size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
- sizeof(unsigned int);
- if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
- size = ALIGN(size, __alignof__(struct gcov_fn_info));
- return size;
-}
-
-/* Get address of function info entry. Based on gcc magic. */
-static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
-{
- return (struct gcov_fn_info *)
- ((char *) info->functions + fn * get_fn_size(info));
-}
-
-/**
- * gcov_info_dup - duplicate profiling data set
- * @info: profiling data set to duplicate
- *
- * Return newly allocated duplicate on success, %NULL on error.
- */
-struct gcov_info *gcov_info_dup(struct gcov_info *info)
-{
- struct gcov_info *dup;
- unsigned int i;
- unsigned int active;
-
- /* Duplicate gcov_info. */
- active = num_counter_active(info);
- dup = kzalloc(struct_size(dup, counts, active), GFP_KERNEL);
- if (!dup)
- return NULL;
- dup->version = info->version;
- dup->stamp = info->stamp;
- dup->n_functions = info->n_functions;
- dup->ctr_mask = info->ctr_mask;
- /* Duplicate filename. */
- dup->filename = kstrdup(info->filename, GFP_KERNEL);
- if (!dup->filename)
- goto err_free;
- /* Duplicate table of functions. */
- dup->functions = kmemdup(info->functions, info->n_functions *
- get_fn_size(info), GFP_KERNEL);
- if (!dup->functions)
- goto err_free;
- /* Duplicate counter arrays. */
- for (i = 0; i < active ; i++) {
- struct gcov_ctr_info *ctr = &info->counts[i];
- size_t size = ctr->num * sizeof(gcov_type);
-
- dup->counts[i].num = ctr->num;
- dup->counts[i].merge = ctr->merge;
- dup->counts[i].values = vmalloc(size);
- if (!dup->counts[i].values)
- goto err_free;
- memcpy(dup->counts[i].values, ctr->values, size);
- }
- return dup;
-
-err_free:
- gcov_info_free(dup);
- return NULL;
-}
-
-/**
- * gcov_info_free - release memory for profiling data set duplicate
- * @info: profiling data set duplicate to free
- */
-void gcov_info_free(struct gcov_info *info)
-{
- unsigned int active = num_counter_active(info);
- unsigned int i;
-
- for (i = 0; i < active ; i++)
- vfree(info->counts[i].values);
- kfree(info->functions);
- kfree(info->filename);
- kfree(info);
-}
-
-/**
- * struct type_info - iterator helper array
- * @ctr_type: counter type
- * @offset: index of the first value of the current function for this type
- *
- * This array is needed to convert the in-memory data format into the in-file
- * data format:
- *
- * In-memory:
- * for each counter type
- * for each function
- * values
- *
- * In-file:
- * for each function
- * for each counter type
- * values
- *
- * See gcc source gcc/gcov-io.h for more information on data organization.
- */
-struct type_info {
- int ctr_type;
- unsigned int offset;
-};
-
-/**
- * struct gcov_iterator - specifies current file position in logical records
- * @info: associated profiling data
- * @record: record type
- * @function: function number
- * @type: counter type
- * @count: index into values array
- * @num_types: number of counter types
- * @type_info: helper array to get values-array offset for current function
- */
-struct gcov_iterator {
- struct gcov_info *info;
-
- int record;
- unsigned int function;
- unsigned int type;
- unsigned int count;
-
- int num_types;
- struct type_info type_info[];
-};
-
-static struct gcov_fn_info *get_func(struct gcov_iterator *iter)
-{
- return get_fn_info(iter->info, iter->function);
-}
-
-static struct type_info *get_type(struct gcov_iterator *iter)
-{
- return &iter->type_info[iter->type];
-}
-
-/**
- * gcov_iter_new - allocate and initialize profiling data iterator
- * @info: profiling data set to be iterated
- *
- * Return file iterator on success, %NULL otherwise.
- */
-struct gcov_iterator *gcov_iter_new(struct gcov_info *info)
-{
- struct gcov_iterator *iter;
-
- iter = kzalloc(struct_size(iter, type_info, num_counter_active(info)),
- GFP_KERNEL);
- if (iter)
- iter->info = info;
-
- return iter;
-}
-
-/**
- * gcov_iter_free - release memory for iterator
- * @iter: file iterator to free
- */
-void gcov_iter_free(struct gcov_iterator *iter)
-{
- kfree(iter);
-}
-
-/**
- * gcov_iter_get_info - return profiling data set for given file iterator
- * @iter: file iterator
- */
-struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter)
-{
- return iter->info;
-}
-
-/**
- * gcov_iter_start - reset file iterator to starting position
- * @iter: file iterator
- */
-void gcov_iter_start(struct gcov_iterator *iter)
-{
- int i;
-
- iter->record = 0;
- iter->function = 0;
- iter->type = 0;
- iter->count = 0;
- iter->num_types = 0;
- for (i = 0; i < GCOV_COUNTERS; i++) {
- if (counter_active(iter->info, i)) {
- iter->type_info[iter->num_types].ctr_type = i;
- iter->type_info[iter->num_types++].offset = 0;
- }
- }
-}
-
-/* Mapping of logical record number to actual file content. */
-#define RECORD_FILE_MAGIC 0
-#define RECORD_GCOV_VERSION 1
-#define RECORD_TIME_STAMP 2
-#define RECORD_FUNCTION_TAG 3
-#define RECORD_FUNCTON_TAG_LEN 4
-#define RECORD_FUNCTION_IDENT 5
-#define RECORD_FUNCTION_CHECK 6
-#define RECORD_COUNT_TAG 7
-#define RECORD_COUNT_LEN 8
-#define RECORD_COUNT 9
-
-/**
- * gcov_iter_next - advance file iterator to next logical record
- * @iter: file iterator
- *
- * Return zero if new position is valid, non-zero if iterator has reached end.
- */
-int gcov_iter_next(struct gcov_iterator *iter)
-{
- switch (iter->record) {
- case RECORD_FILE_MAGIC:
- case RECORD_GCOV_VERSION:
- case RECORD_FUNCTION_TAG:
- case RECORD_FUNCTON_TAG_LEN:
- case RECORD_FUNCTION_IDENT:
- case RECORD_COUNT_TAG:
- /* Advance to next record */
- iter->record++;
- break;
- case RECORD_COUNT:
- /* Advance to next count */
- iter->count++;
- /* fall through */
- case RECORD_COUNT_LEN:
- if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
- iter->record = 9;
- break;
- }
- /* Advance to next counter type */
- get_type(iter)->offset += iter->count;
- iter->count = 0;
- iter->type++;
- /* fall through */
- case RECORD_FUNCTION_CHECK:
- if (iter->type < iter->num_types) {
- iter->record = 7;
- break;
- }
- /* Advance to next function */
- iter->type = 0;
- iter->function++;
- /* fall through */
- case RECORD_TIME_STAMP:
- if (iter->function < iter->info->n_functions)
- iter->record = 3;
- else
- iter->record = -1;
- break;
- }
- /* Check for EOF. */
- if (iter->record == -1)
- return -EINVAL;
- else
- return 0;
-}
-
-/**
- * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file
- * @seq: seq_file handle
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file.
- */
-static int seq_write_gcov_u32(struct seq_file *seq, u32 v)
-{
- return seq_write(seq, &v, sizeof(v));
-}
-
-/**
- * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file
- * @seq: seq_file handle
- * @v: value to be stored
- *
- * Number format defined by gcc: numbers are recorded in the 32 bit
- * unsigned binary form of the endianness of the machine generating the
- * file. 64 bit numbers are stored as two 32 bit numbers, the low part
- * first.
- */
-static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
-{
- u32 data[2];
-
- data[0] = (v & 0xffffffffUL);
- data[1] = (v >> 32);
- return seq_write(seq, data, sizeof(data));
-}
-
-/**
- * gcov_iter_write - write data for current pos to seq_file
- * @iter: file iterator
- * @seq: seq_file handle
- *
- * Return zero on success, non-zero otherwise.
- */
-int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
-{
- int rc = -EINVAL;
-
- switch (iter->record) {
- case RECORD_FILE_MAGIC:
- rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC);
- break;
- case RECORD_GCOV_VERSION:
- rc = seq_write_gcov_u32(seq, iter->info->version);
- break;
- case RECORD_TIME_STAMP:
- rc = seq_write_gcov_u32(seq, iter->info->stamp);
- break;
- case RECORD_FUNCTION_TAG:
- rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
- break;
- case RECORD_FUNCTON_TAG_LEN:
- rc = seq_write_gcov_u32(seq, 2);
- break;
- case RECORD_FUNCTION_IDENT:
- rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
- break;
- case RECORD_FUNCTION_CHECK:
- rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
- break;
- case RECORD_COUNT_TAG:
- rc = seq_write_gcov_u32(seq,
- GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type));
- break;
- case RECORD_COUNT_LEN:
- rc = seq_write_gcov_u32(seq,
- get_func(iter)->n_ctrs[iter->type] * 2);
- break;
- case RECORD_COUNT:
- rc = seq_write_gcov_u64(seq,
- iter->info->counts[iter->type].
- values[iter->count + get_type(iter)->offset]);
- break;
- }
- return rc;
-}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d63c324895ea..20512252ecc9 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -118,7 +118,7 @@ config IRQ_FORCED_THREADING
config SPARSE_IRQ
bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
- ---help---
+ help
Sparse irq numbering is useful for distro kernels that want
to define a high CONFIG_NR_CPUS value but still want to have
@@ -134,7 +134,7 @@ config GENERIC_IRQ_DEBUGFS
depends on DEBUG_FS
select GENERIC_IRQ_INJECTION
default n
- ---help---
+ help
Exposes internal state information through debugfs. Mostly for
developers and debugging of hard to diagnose interrupt problems.
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 55c5d883a93e..6afae0bcbac4 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -427,7 +427,8 @@ void kcov_task_exit(struct task_struct *t)
* WARN_ON(!kcov->remote && kcov->t != t);
*
* For KCOV_REMOTE_ENABLE devices, the exiting task is either:
- * 2. A remote task between kcov_remote_start() and kcov_remote_stop().
+ *
+ * 1. A remote task between kcov_remote_start() and kcov_remote_stop().
* In this case we should print a warning right away, since a task
* shouldn't be exiting when it's in a kcov coverage collection
* section. Here t points to the task that is collecting remote
@@ -437,7 +438,7 @@ void kcov_task_exit(struct task_struct *t)
* WARN_ON(kcov->remote && kcov->t != t);
*
* 2. The task that created kcov exiting without calling KCOV_DISABLE,
- * and then again we can make sure that t->kcov->t == t:
+ * and then again we make sure that t->kcov->t == t:
* WARN_ON(kcov->remote && kcov->t != t);
*
* By combining all three checks into one we get:
@@ -764,7 +765,7 @@ static const struct file_operations kcov_fops = {
* Internally, kcov_remote_start() looks up the kcov device associated with the
* provided handle, allocates an area for coverage collection, and saves the
* pointers to kcov and area into the current task_struct to allow coverage to
- * be collected via __sanitizer_cov_trace_pc()
+ * be collected via __sanitizer_cov_trace_pc().
* In turns kcov_remote_stop() clears those pointers from task_struct to stop
* collecting coverage and copies all collected coverage into the kcov area.
*/
@@ -972,16 +973,25 @@ void kcov_remote_stop(void)
local_irq_restore(flags);
return;
}
- kcov = t->kcov;
- area = t->kcov_area;
- size = t->kcov_size;
- sequence = t->kcov_sequence;
-
+ /*
+ * When in softirq, check if the corresponding kcov_remote_start()
+ * actually found the remote handle and started collecting coverage.
+ */
+ if (in_serving_softirq() && !t->kcov_softirq) {
+ local_irq_restore(flags);
+ return;
+ }
+ /* Make sure that kcov_softirq is only set when in softirq. */
if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
local_irq_restore(flags);
return;
}
+ kcov = t->kcov;
+ area = t->kcov_area;
+ size = t->kcov_size;
+ sequence = t->kcov_sequence;
+
kcov_stop(t);
if (in_serving_softirq()) {
t->kcov_softirq = 0;
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
new file mode 100644
index 000000000000..d4999b38d1be
--- /dev/null
+++ b/kernel/kcsan/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+KCSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+UBSAN_SANITIZE := n
+
+CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
+
+CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
+ $(call cc-option,-fno-stack-protector,)
+
+obj-y := core.o debugfs.o report.o
+obj-$(CONFIG_KCSAN_SELFTEST) += test.o
diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
new file mode 100644
index 000000000000..be9e625227f3
--- /dev/null
+++ b/kernel/kcsan/atomic.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_KCSAN_ATOMIC_H
+#define _KERNEL_KCSAN_ATOMIC_H
+
+#include <linux/jiffies.h>
+#include <linux/sched.h>
+
+/*
+ * Special rules for certain memory where concurrent conflicting accesses are
+ * common, however, the current convention is to not mark them; returns true if
+ * access to @ptr should be considered atomic. Called from slow-path.
+ */
+static bool kcsan_is_atomic_special(const volatile void *ptr)
+{
+ /* volatile globals that have been observed in data races. */
+ return ptr == &jiffies || ptr == &current->state;
+}
+
+#endif /* _KERNEL_KCSAN_ATOMIC_H */
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
new file mode 100644
index 000000000000..15f67949d11e
--- /dev/null
+++ b/kernel/kcsan/core.c
@@ -0,0 +1,850 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+#include "atomic.h"
+#include "encoding.h"
+#include "kcsan.h"
+
+static bool kcsan_early_enable = IS_ENABLED(CONFIG_KCSAN_EARLY_ENABLE);
+unsigned int kcsan_udelay_task = CONFIG_KCSAN_UDELAY_TASK;
+unsigned int kcsan_udelay_interrupt = CONFIG_KCSAN_UDELAY_INTERRUPT;
+static long kcsan_skip_watch = CONFIG_KCSAN_SKIP_WATCH;
+static bool kcsan_interrupt_watcher = IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER);
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kcsan."
+module_param_named(early_enable, kcsan_early_enable, bool, 0);
+module_param_named(udelay_task, kcsan_udelay_task, uint, 0644);
+module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
+module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
+module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
+
+bool kcsan_enabled;
+
+/* Per-CPU kcsan_ctx for interrupts */
+static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
+ .disable_count = 0,
+ .atomic_next = 0,
+ .atomic_nest_count = 0,
+ .in_flat_atomic = false,
+ .access_mask = 0,
+ .scoped_accesses = {LIST_POISON1, NULL},
+};
+
+/*
+ * Helper macros to index into adjacent slots, starting from address slot
+ * itself, followed by the right and left slots.
+ *
+ * The purpose is 2-fold:
+ *
+ * 1. if during insertion the address slot is already occupied, check if
+ * any adjacent slots are free;
+ * 2. accesses that straddle a slot boundary due to size that exceeds a
+ * slot's range may check adjacent slots if any watchpoint matches.
+ *
+ * Note that accesses with very large size may still miss a watchpoint; however,
+ * given this should be rare, this is a reasonable trade-off to make, since this
+ * will avoid:
+ *
+ * 1. excessive contention between watchpoint checks and setup;
+ * 2. larger number of simultaneous watchpoints without sacrificing
+ * performance.
+ *
+ * Example: SLOT_IDX values for KCSAN_CHECK_ADJACENT=1, where i is [0, 1, 2]:
+ *
+ * slot=0: [ 1, 2, 0]
+ * slot=9: [10, 11, 9]
+ * slot=63: [64, 65, 63]
+ */
+#define SLOT_IDX(slot, i) (slot + ((i + KCSAN_CHECK_ADJACENT) % NUM_SLOTS))
+
+/*
+ * SLOT_IDX_FAST is used in the fast-path. Not first checking the address's primary
+ * slot (middle) is fine if we assume that races occur rarely. The set of
+ * indices {SLOT_IDX(slot, i) | i in [0, NUM_SLOTS)} is equivalent to
+ * {SLOT_IDX_FAST(slot, i) | i in [0, NUM_SLOTS)}.
+ */
+#define SLOT_IDX_FAST(slot, i) (slot + i)
+
+/*
+ * Watchpoints, with each entry encoded as defined in encoding.h: in order to be
+ * able to safely update and access a watchpoint without introducing locking
+ * overhead, we encode each watchpoint as a single atomic long. The initial
+ * zero-initialized state matches INVALID_WATCHPOINT.
+ *
+ * Add NUM_SLOTS-1 entries to account for overflow; this helps avoid having to
+ * use more complicated SLOT_IDX_FAST calculation with modulo in the fast-path.
+ */
+static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
+
+/*
+ * Instructions to skip watching counter, used in should_watch(). We use a
+ * per-CPU counter to avoid excessive contention.
+ */
+static DEFINE_PER_CPU(long, kcsan_skip);
+
+static __always_inline atomic_long_t *find_watchpoint(unsigned long addr,
+ size_t size,
+ bool expect_write,
+ long *encoded_watchpoint)
+{
+ const int slot = watchpoint_slot(addr);
+ const unsigned long addr_masked = addr & WATCHPOINT_ADDR_MASK;
+ atomic_long_t *watchpoint;
+ unsigned long wp_addr_masked;
+ size_t wp_size;
+ bool is_write;
+ int i;
+
+ BUILD_BUG_ON(CONFIG_KCSAN_NUM_WATCHPOINTS < NUM_SLOTS);
+
+ for (i = 0; i < NUM_SLOTS; ++i) {
+ watchpoint = &watchpoints[SLOT_IDX_FAST(slot, i)];
+ *encoded_watchpoint = atomic_long_read(watchpoint);
+ if (!decode_watchpoint(*encoded_watchpoint, &wp_addr_masked,
+ &wp_size, &is_write))
+ continue;
+
+ if (expect_write && !is_write)
+ continue;
+
+ /* Check if the watchpoint matches the access. */
+ if (matching_access(wp_addr_masked, wp_size, addr_masked, size))
+ return watchpoint;
+ }
+
+ return NULL;
+}
+
+static inline atomic_long_t *
+insert_watchpoint(unsigned long addr, size_t size, bool is_write)
+{
+ const int slot = watchpoint_slot(addr);
+ const long encoded_watchpoint = encode_watchpoint(addr, size, is_write);
+ atomic_long_t *watchpoint;
+ int i;
+
+ /* Check slot index logic, ensuring we stay within array bounds. */
+ BUILD_BUG_ON(SLOT_IDX(0, 0) != KCSAN_CHECK_ADJACENT);
+ BUILD_BUG_ON(SLOT_IDX(0, KCSAN_CHECK_ADJACENT+1) != 0);
+ BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT) != ARRAY_SIZE(watchpoints)-1);
+ BUILD_BUG_ON(SLOT_IDX(CONFIG_KCSAN_NUM_WATCHPOINTS-1, KCSAN_CHECK_ADJACENT+1) != ARRAY_SIZE(watchpoints) - NUM_SLOTS);
+
+ for (i = 0; i < NUM_SLOTS; ++i) {
+ long expect_val = INVALID_WATCHPOINT;
+
+ /* Try to acquire this slot. */
+ watchpoint = &watchpoints[SLOT_IDX(slot, i)];
+ if (atomic_long_try_cmpxchg_relaxed(watchpoint, &expect_val, encoded_watchpoint))
+ return watchpoint;
+ }
+
+ return NULL;
+}
+
+/*
+ * Return true if watchpoint was successfully consumed, false otherwise.
+ *
+ * This may return false if:
+ *
+ * 1. another thread already consumed the watchpoint;
+ * 2. the thread that set up the watchpoint already removed it;
+ * 3. the watchpoint was removed and then re-used.
+ */
+static __always_inline bool
+try_consume_watchpoint(atomic_long_t *watchpoint, long encoded_watchpoint)
+{
+ return atomic_long_try_cmpxchg_relaxed(watchpoint, &encoded_watchpoint, CONSUMED_WATCHPOINT);
+}
+
+/* Return true if watchpoint was not touched, false if already consumed. */
+static inline bool consume_watchpoint(atomic_long_t *watchpoint)
+{
+ return atomic_long_xchg_relaxed(watchpoint, CONSUMED_WATCHPOINT) != CONSUMED_WATCHPOINT;
+}
+
+/* Remove the watchpoint -- its slot may be reused after. */
+static inline void remove_watchpoint(atomic_long_t *watchpoint)
+{
+ atomic_long_set(watchpoint, INVALID_WATCHPOINT);
+}
+
+static __always_inline struct kcsan_ctx *get_ctx(void)
+{
+ /*
+ * In interrupts, use raw_cpu_ptr to avoid unnecessary checks, that would
+ * also result in calls that generate warnings in uaccess regions.
+ */
+ return in_task() ? &current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx);
+}
+
+/* Check scoped accesses; never inline because this is a slow-path! */
+static noinline void kcsan_check_scoped_accesses(void)
+{
+ struct kcsan_ctx *ctx = get_ctx();
+ struct list_head *prev_save = ctx->scoped_accesses.prev;
+ struct kcsan_scoped_access *scoped_access;
+
+ ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
+ list_for_each_entry(scoped_access, &ctx->scoped_accesses, list)
+ __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type);
+ ctx->scoped_accesses.prev = prev_save;
+}
+
+/* Rules for generic atomic accesses. Called from fast-path. */
+static __always_inline bool
+is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+{
+ if (type & KCSAN_ACCESS_ATOMIC)
+ return true;
+
+ /*
+ * Unless explicitly declared atomic, never consider an assertion access
+ * as atomic. This allows using them also in atomic regions, such as
+ * seqlocks, without implicitly changing their semantics.
+ */
+ if (type & KCSAN_ACCESS_ASSERT)
+ return false;
+
+ if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) &&
+ (type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) &&
+ IS_ALIGNED((unsigned long)ptr, size))
+ return true; /* Assume aligned writes up to word size are atomic. */
+
+ if (ctx->atomic_next > 0) {
+ /*
+ * Because we do not have separate contexts for nested
+ * interrupts, in case atomic_next is set, we simply assume that
+ * the outer interrupt set atomic_next. In the worst case, we
+ * will conservatively consider operations as atomic. This is a
+ * reasonable trade-off to make, since this case should be
+ * extremely rare; however, even if extremely rare, it could
+ * lead to false positives otherwise.
+ */
+ if ((hardirq_count() >> HARDIRQ_SHIFT) < 2)
+ --ctx->atomic_next; /* in task, or outer interrupt */
+ return true;
+ }
+
+ return ctx->atomic_nest_count > 0 || ctx->in_flat_atomic;
+}
+
+static __always_inline bool
+should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+{
+ /*
+ * Never set up watchpoints when memory operations are atomic.
+ *
+ * Need to check this first, before kcsan_skip check below: (1) atomics
+ * should not count towards skipped instructions, and (2) to actually
+ * decrement kcsan_atomic_next for consecutive instruction stream.
+ */
+ if (is_atomic(ptr, size, type, ctx))
+ return false;
+
+ if (this_cpu_dec_return(kcsan_skip) >= 0)
+ return false;
+
+ /*
+ * NOTE: If we get here, kcsan_skip must always be reset in slow path
+ * via reset_kcsan_skip() to avoid underflow.
+ */
+
+ /* this operation should be watched */
+ return true;
+}
+
+static inline void reset_kcsan_skip(void)
+{
+ long skip_count = kcsan_skip_watch -
+ (IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ?
+ prandom_u32_max(kcsan_skip_watch) :
+ 0);
+ this_cpu_write(kcsan_skip, skip_count);
+}
+
+static __always_inline bool kcsan_is_enabled(void)
+{
+ return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0;
+}
+
+static inline unsigned int get_delay(void)
+{
+ unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt;
+ return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ?
+ prandom_u32_max(delay) :
+ 0);
+}
+
+/*
+ * Pull everything together: check_access() below contains the performance
+ * critical operations; the fast-path (including check_access) functions should
+ * all be inlinable by the instrumentation functions.
+ *
+ * The slow-path (kcsan_found_watchpoint, kcsan_setup_watchpoint) are
+ * non-inlinable -- note that, we prefix these with "kcsan_" to ensure they can
+ * be filtered from the stacktrace, as well as give them unique names for the
+ * UACCESS whitelist of objtool. Each function uses user_access_save/restore(),
+ * since they do not access any user memory, but instrumentation is still
+ * emitted in UACCESS regions.
+ */
+
+static noinline void kcsan_found_watchpoint(const volatile void *ptr,
+ size_t size,
+ int type,
+ atomic_long_t *watchpoint,
+ long encoded_watchpoint)
+{
+ unsigned long flags;
+ bool consumed;
+
+ if (!kcsan_is_enabled())
+ return;
+
+ /*
+ * The access_mask check relies on value-change comparison. To avoid
+ * reporting a race where e.g. the writer set up the watchpoint, but the
+ * reader has access_mask!=0, we have to ignore the found watchpoint.
+ */
+ if (get_ctx()->access_mask != 0)
+ return;
+
+ /*
+ * Consume the watchpoint as soon as possible, to minimize the chances
+ * of !consumed. Consuming the watchpoint must always be guarded by
+ * kcsan_is_enabled() check, as otherwise we might erroneously
+ * triggering reports when disabled.
+ */
+ consumed = try_consume_watchpoint(watchpoint, encoded_watchpoint);
+
+ /* keep this after try_consume_watchpoint */
+ flags = user_access_save();
+
+ if (consumed) {
+ kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE,
+ KCSAN_REPORT_CONSUMED_WATCHPOINT,
+ watchpoint - watchpoints);
+ } else {
+ /*
+ * The other thread may not print any diagnostics, as it has
+ * already removed the watchpoint, or another thread consumed
+ * the watchpoint before this thread.
+ */
+ kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES);
+ }
+
+ if ((type & KCSAN_ACCESS_ASSERT) != 0)
+ kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ else
+ kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES);
+
+ user_access_restore(flags);
+}
+
+static noinline void
+kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
+{
+ const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
+ const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
+ atomic_long_t *watchpoint;
+ union {
+ u8 _1;
+ u16 _2;
+ u32 _4;
+ u64 _8;
+ } expect_value;
+ unsigned long access_mask;
+ enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
+ unsigned long ua_flags = user_access_save();
+ unsigned long irq_flags = 0;
+
+ /*
+ * Always reset kcsan_skip counter in slow-path to avoid underflow; see
+ * should_watch().
+ */
+ reset_kcsan_skip();
+
+ if (!kcsan_is_enabled())
+ goto out;
+
+ /*
+ * Special atomic rules: unlikely to be true, so we check them here in
+ * the slow-path, and not in the fast-path in is_atomic(). Call after
+ * kcsan_is_enabled(), as we may access memory that is not yet
+ * initialized during early boot.
+ */
+ if (!is_assert && kcsan_is_atomic_special(ptr))
+ goto out;
+
+ if (!check_encodable((unsigned long)ptr, size)) {
+ kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES);
+ goto out;
+ }
+
+ if (!kcsan_interrupt_watcher)
+ /* Use raw to avoid lockdep recursion via IRQ flags tracing. */
+ raw_local_irq_save(irq_flags);
+
+ watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
+ if (watchpoint == NULL) {
+ /*
+ * Out of capacity: the size of 'watchpoints', and the frequency
+ * with which should_watch() returns true should be tweaked so
+ * that this case happens very rarely.
+ */
+ kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY);
+ goto out_unlock;
+ }
+
+ kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS);
+ kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS);
+
+ /*
+ * Read the current value, to later check and infer a race if the data
+ * was modified via a non-instrumented access, e.g. from a device.
+ */
+ expect_value._8 = 0;
+ switch (size) {
+ case 1:
+ expect_value._1 = READ_ONCE(*(const u8 *)ptr);
+ break;
+ case 2:
+ expect_value._2 = READ_ONCE(*(const u16 *)ptr);
+ break;
+ case 4:
+ expect_value._4 = READ_ONCE(*(const u32 *)ptr);
+ break;
+ case 8:
+ expect_value._8 = READ_ONCE(*(const u64 *)ptr);
+ break;
+ default:
+ break; /* ignore; we do not diff the values */
+ }
+
+ if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) {
+ kcsan_disable_current();
+ pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n",
+ is_write ? "write" : "read", size, ptr,
+ watchpoint_slot((unsigned long)ptr),
+ encode_watchpoint((unsigned long)ptr, size, is_write));
+ kcsan_enable_current();
+ }
+
+ /*
+ * Delay this thread, to increase probability of observing a racy
+ * conflicting access.
+ */
+ udelay(get_delay());
+
+ /*
+ * Re-read value, and check if it is as expected; if not, we infer a
+ * racy access.
+ */
+ access_mask = get_ctx()->access_mask;
+ switch (size) {
+ case 1:
+ expect_value._1 ^= READ_ONCE(*(const u8 *)ptr);
+ if (access_mask)
+ expect_value._1 &= (u8)access_mask;
+ break;
+ case 2:
+ expect_value._2 ^= READ_ONCE(*(const u16 *)ptr);
+ if (access_mask)
+ expect_value._2 &= (u16)access_mask;
+ break;
+ case 4:
+ expect_value._4 ^= READ_ONCE(*(const u32 *)ptr);
+ if (access_mask)
+ expect_value._4 &= (u32)access_mask;
+ break;
+ case 8:
+ expect_value._8 ^= READ_ONCE(*(const u64 *)ptr);
+ if (access_mask)
+ expect_value._8 &= (u64)access_mask;
+ break;
+ default:
+ break; /* ignore; we do not diff the values */
+ }
+
+ /* Were we able to observe a value-change? */
+ if (expect_value._8 != 0)
+ value_change = KCSAN_VALUE_CHANGE_TRUE;
+
+ /* Check if this access raced with another. */
+ if (!consume_watchpoint(watchpoint)) {
+ /*
+ * Depending on the access type, map a value_change of MAYBE to
+ * TRUE (always report) or FALSE (never report).
+ */
+ if (value_change == KCSAN_VALUE_CHANGE_MAYBE) {
+ if (access_mask != 0) {
+ /*
+ * For access with access_mask, we require a
+ * value-change, as it is likely that races on
+ * ~access_mask bits are expected.
+ */
+ value_change = KCSAN_VALUE_CHANGE_FALSE;
+ } else if (size > 8 || is_assert) {
+ /* Always assume a value-change. */
+ value_change = KCSAN_VALUE_CHANGE_TRUE;
+ }
+ }
+
+ /*
+ * No need to increment 'data_races' counter, as the racing
+ * thread already did.
+ *
+ * Count 'assert_failures' for each failed ASSERT access,
+ * therefore both this thread and the racing thread may
+ * increment this counter.
+ */
+ if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
+ kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+
+ kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL,
+ watchpoint - watchpoints);
+ } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
+ /* Inferring a race, since the value should not have changed. */
+
+ kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN);
+ if (is_assert)
+ kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
+ kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE,
+ KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
+ watchpoint - watchpoints);
+ }
+
+ /*
+ * Remove watchpoint; must be after reporting, since the slot may be
+ * reused after this point.
+ */
+ remove_watchpoint(watchpoint);
+ kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS);
+out_unlock:
+ if (!kcsan_interrupt_watcher)
+ raw_local_irq_restore(irq_flags);
+out:
+ user_access_restore(ua_flags);
+}
+
+static __always_inline void check_access(const volatile void *ptr, size_t size,
+ int type)
+{
+ const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
+ atomic_long_t *watchpoint;
+ long encoded_watchpoint;
+
+ /*
+ * Do nothing for 0 sized check; this comparison will be optimized out
+ * for constant sized instrumentation (__tsan_{read,write}N).
+ */
+ if (unlikely(size == 0))
+ return;
+
+ /*
+ * Avoid user_access_save in fast-path: find_watchpoint is safe without
+ * user_access_save, as the address that ptr points to is only used to
+ * check if a watchpoint exists; ptr is never dereferenced.
+ */
+ watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write,
+ &encoded_watchpoint);
+ /*
+ * It is safe to check kcsan_is_enabled() after find_watchpoint in the
+ * slow-path, as long as no state changes that cause a race to be
+ * detected and reported have occurred until kcsan_is_enabled() is
+ * checked.
+ */
+
+ if (unlikely(watchpoint != NULL))
+ kcsan_found_watchpoint(ptr, size, type, watchpoint,
+ encoded_watchpoint);
+ else {
+ struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
+
+ if (unlikely(should_watch(ptr, size, type, ctx)))
+ kcsan_setup_watchpoint(ptr, size, type);
+ else if (unlikely(ctx->scoped_accesses.prev))
+ kcsan_check_scoped_accesses();
+ }
+}
+
+/* === Public interface ===================================================== */
+
+void __init kcsan_init(void)
+{
+ BUG_ON(!in_task());
+
+ kcsan_debugfs_init();
+
+ /*
+ * We are in the init task, and no other tasks should be running;
+ * WRITE_ONCE without memory barrier is sufficient.
+ */
+ if (kcsan_early_enable)
+ WRITE_ONCE(kcsan_enabled, true);
+}
+
+/* === Exported interface =================================================== */
+
+void kcsan_disable_current(void)
+{
+ ++get_ctx()->disable_count;
+}
+EXPORT_SYMBOL(kcsan_disable_current);
+
+void kcsan_enable_current(void)
+{
+ if (get_ctx()->disable_count-- == 0) {
+ /*
+ * Warn if kcsan_enable_current() calls are unbalanced with
+ * kcsan_disable_current() calls, which causes disable_count to
+ * become negative and should not happen.
+ */
+ kcsan_disable_current(); /* restore to 0, KCSAN still enabled */
+ kcsan_disable_current(); /* disable to generate warning */
+ WARN(1, "Unbalanced %s()", __func__);
+ kcsan_enable_current();
+ }
+}
+EXPORT_SYMBOL(kcsan_enable_current);
+
+void kcsan_enable_current_nowarn(void)
+{
+ if (get_ctx()->disable_count-- == 0)
+ kcsan_disable_current();
+}
+EXPORT_SYMBOL(kcsan_enable_current_nowarn);
+
+void kcsan_nestable_atomic_begin(void)
+{
+ /*
+ * Do *not* check and warn if we are in a flat atomic region: nestable
+ * and flat atomic regions are independent from each other.
+ * See include/linux/kcsan.h: struct kcsan_ctx comments for more
+ * comments.
+ */
+
+ ++get_ctx()->atomic_nest_count;
+}
+EXPORT_SYMBOL(kcsan_nestable_atomic_begin);
+
+void kcsan_nestable_atomic_end(void)
+{
+ if (get_ctx()->atomic_nest_count-- == 0) {
+ /*
+ * Warn if kcsan_nestable_atomic_end() calls are unbalanced with
+ * kcsan_nestable_atomic_begin() calls, which causes
+ * atomic_nest_count to become negative and should not happen.
+ */
+ kcsan_nestable_atomic_begin(); /* restore to 0 */
+ kcsan_disable_current(); /* disable to generate warning */
+ WARN(1, "Unbalanced %s()", __func__);
+ kcsan_enable_current();
+ }
+}
+EXPORT_SYMBOL(kcsan_nestable_atomic_end);
+
+void kcsan_flat_atomic_begin(void)
+{
+ get_ctx()->in_flat_atomic = true;
+}
+EXPORT_SYMBOL(kcsan_flat_atomic_begin);
+
+void kcsan_flat_atomic_end(void)
+{
+ get_ctx()->in_flat_atomic = false;
+}
+EXPORT_SYMBOL(kcsan_flat_atomic_end);
+
+void kcsan_atomic_next(int n)
+{
+ get_ctx()->atomic_next = n;
+}
+EXPORT_SYMBOL(kcsan_atomic_next);
+
+void kcsan_set_access_mask(unsigned long mask)
+{
+ get_ctx()->access_mask = mask;
+}
+EXPORT_SYMBOL(kcsan_set_access_mask);
+
+struct kcsan_scoped_access *
+kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
+ struct kcsan_scoped_access *sa)
+{
+ struct kcsan_ctx *ctx = get_ctx();
+
+ __kcsan_check_access(ptr, size, type);
+
+ ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
+
+ INIT_LIST_HEAD(&sa->list);
+ sa->ptr = ptr;
+ sa->size = size;
+ sa->type = type;
+
+ if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */
+ INIT_LIST_HEAD(&ctx->scoped_accesses);
+ list_add(&sa->list, &ctx->scoped_accesses);
+
+ ctx->disable_count--;
+ return sa;
+}
+EXPORT_SYMBOL(kcsan_begin_scoped_access);
+
+void kcsan_end_scoped_access(struct kcsan_scoped_access *sa)
+{
+ struct kcsan_ctx *ctx = get_ctx();
+
+ if (WARN(!ctx->scoped_accesses.prev, "Unbalanced %s()?", __func__))
+ return;
+
+ ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
+
+ list_del(&sa->list);
+ if (list_empty(&ctx->scoped_accesses))
+ /*
+ * Ensure we do not enter kcsan_check_scoped_accesses()
+ * slow-path if unnecessary, and avoids requiring list_empty()
+ * in the fast-path (to avoid a READ_ONCE() and potential
+ * uaccess warning).
+ */
+ ctx->scoped_accesses.prev = NULL;
+
+ ctx->disable_count--;
+
+ __kcsan_check_access(sa->ptr, sa->size, sa->type);
+}
+EXPORT_SYMBOL(kcsan_end_scoped_access);
+
+void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
+{
+ check_access(ptr, size, type);
+}
+EXPORT_SYMBOL(__kcsan_check_access);
+
+/*
+ * KCSAN uses the same instrumentation that is emitted by supported compilers
+ * for ThreadSanitizer (TSAN).
+ *
+ * When enabled, the compiler emits instrumentation calls (the functions
+ * prefixed with "__tsan" below) for all loads and stores that it generated;
+ * inline asm is not instrumented.
+ *
+ * Note that, not all supported compiler versions distinguish aligned/unaligned
+ * accesses, but e.g. recent versions of Clang do. We simply alias the unaligned
+ * version to the generic version, which can handle both.
+ */
+
+#define DEFINE_TSAN_READ_WRITE(size) \
+ void __tsan_read##size(void *ptr) \
+ { \
+ check_access(ptr, size, 0); \
+ } \
+ EXPORT_SYMBOL(__tsan_read##size); \
+ void __tsan_unaligned_read##size(void *ptr) \
+ __alias(__tsan_read##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_read##size); \
+ void __tsan_write##size(void *ptr) \
+ { \
+ check_access(ptr, size, KCSAN_ACCESS_WRITE); \
+ } \
+ EXPORT_SYMBOL(__tsan_write##size); \
+ void __tsan_unaligned_write##size(void *ptr) \
+ __alias(__tsan_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_write##size)
+
+DEFINE_TSAN_READ_WRITE(1);
+DEFINE_TSAN_READ_WRITE(2);
+DEFINE_TSAN_READ_WRITE(4);
+DEFINE_TSAN_READ_WRITE(8);
+DEFINE_TSAN_READ_WRITE(16);
+
+void __tsan_read_range(void *ptr, size_t size)
+{
+ check_access(ptr, size, 0);
+}
+EXPORT_SYMBOL(__tsan_read_range);
+
+void __tsan_write_range(void *ptr, size_t size)
+{
+ check_access(ptr, size, KCSAN_ACCESS_WRITE);
+}
+EXPORT_SYMBOL(__tsan_write_range);
+
+/*
+ * Use of explicit volatile is generally disallowed [1], however, volatile is
+ * still used in various concurrent context, whether in low-level
+ * synchronization primitives or for legacy reasons.
+ * [1] https://lwn.net/Articles/233479/
+ *
+ * We only consider volatile accesses atomic if they are aligned and would pass
+ * the size-check of compiletime_assert_rwonce_type().
+ */
+#define DEFINE_TSAN_VOLATILE_READ_WRITE(size) \
+ void __tsan_volatile_read##size(void *ptr) \
+ { \
+ const bool is_atomic = size <= sizeof(long long) && \
+ IS_ALIGNED((unsigned long)ptr, size); \
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
+ return; \
+ check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \
+ } \
+ EXPORT_SYMBOL(__tsan_volatile_read##size); \
+ void __tsan_unaligned_volatile_read##size(void *ptr) \
+ __alias(__tsan_volatile_read##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size); \
+ void __tsan_volatile_write##size(void *ptr) \
+ { \
+ const bool is_atomic = size <= sizeof(long long) && \
+ IS_ALIGNED((unsigned long)ptr, size); \
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
+ return; \
+ check_access(ptr, size, \
+ KCSAN_ACCESS_WRITE | \
+ (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \
+ } \
+ EXPORT_SYMBOL(__tsan_volatile_write##size); \
+ void __tsan_unaligned_volatile_write##size(void *ptr) \
+ __alias(__tsan_volatile_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_volatile_write##size)
+
+DEFINE_TSAN_VOLATILE_READ_WRITE(1);
+DEFINE_TSAN_VOLATILE_READ_WRITE(2);
+DEFINE_TSAN_VOLATILE_READ_WRITE(4);
+DEFINE_TSAN_VOLATILE_READ_WRITE(8);
+DEFINE_TSAN_VOLATILE_READ_WRITE(16);
+
+/*
+ * The below are not required by KCSAN, but can still be emitted by the
+ * compiler.
+ */
+void __tsan_func_entry(void *call_pc)
+{
+}
+EXPORT_SYMBOL(__tsan_func_entry);
+void __tsan_func_exit(void)
+{
+}
+EXPORT_SYMBOL(__tsan_func_exit);
+void __tsan_init(void)
+{
+}
+EXPORT_SYMBOL(__tsan_init);
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
new file mode 100644
index 000000000000..023e49c58d55
--- /dev/null
+++ b/kernel/kcsan/debugfs.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/atomic.h>
+#include <linux/bsearch.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/init.h>
+#include <linux/kallsyms.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+
+#include "kcsan.h"
+
+/*
+ * Statistics counters.
+ */
+static atomic_long_t counters[KCSAN_COUNTER_COUNT];
+
+/*
+ * Addresses for filtering functions from reporting. This list can be used as a
+ * whitelist or blacklist.
+ */
+static struct {
+ unsigned long *addrs; /* array of addresses */
+ size_t size; /* current size */
+ int used; /* number of elements used */
+ bool sorted; /* if elements are sorted */
+ bool whitelist; /* if list is a blacklist or whitelist */
+} report_filterlist = {
+ .addrs = NULL,
+ .size = 8, /* small initial size */
+ .used = 0,
+ .sorted = false,
+ .whitelist = false, /* default is blacklist */
+};
+static DEFINE_SPINLOCK(report_filterlist_lock);
+
+static const char *counter_to_name(enum kcsan_counter_id id)
+{
+ switch (id) {
+ case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints";
+ case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints";
+ case KCSAN_COUNTER_DATA_RACES: return "data_races";
+ case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures";
+ case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity";
+ case KCSAN_COUNTER_REPORT_RACES: return "report_races";
+ case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin";
+ case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses";
+ case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives";
+ case KCSAN_COUNTER_COUNT:
+ BUG();
+ }
+ return NULL;
+}
+
+void kcsan_counter_inc(enum kcsan_counter_id id)
+{
+ atomic_long_inc(&counters[id]);
+}
+
+void kcsan_counter_dec(enum kcsan_counter_id id)
+{
+ atomic_long_dec(&counters[id]);
+}
+
+/*
+ * The microbenchmark allows benchmarking KCSAN core runtime only. To run
+ * multiple threads, pipe 'microbench=<iters>' from multiple tasks into the
+ * debugfs file. This will not generate any conflicts, and tests fast-path only.
+ */
+static noinline void microbenchmark(unsigned long iters)
+{
+ const struct kcsan_ctx ctx_save = current->kcsan_ctx;
+ const bool was_enabled = READ_ONCE(kcsan_enabled);
+ cycles_t cycles;
+
+ /* We may have been called from an atomic region; reset context. */
+ memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
+ /*
+ * Disable to benchmark fast-path for all accesses, and (expected
+ * negligible) call into slow-path, but never set up watchpoints.
+ */
+ WRITE_ONCE(kcsan_enabled, false);
+
+ pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
+
+ cycles = get_cycles();
+ while (iters--) {
+ unsigned long addr = iters & ((PAGE_SIZE << 8) - 1);
+ int type = !(iters & 0x7f) ? KCSAN_ACCESS_ATOMIC :
+ (!(iters & 0xf) ? KCSAN_ACCESS_WRITE : 0);
+ __kcsan_check_access((void *)addr, sizeof(long), type);
+ }
+ cycles = get_cycles() - cycles;
+
+ pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
+
+ WRITE_ONCE(kcsan_enabled, was_enabled);
+ /* restore context */
+ current->kcsan_ctx = ctx_save;
+}
+
+/*
+ * Simple test to create conflicting accesses. Write 'test=<iters>' to KCSAN's
+ * debugfs file from multiple tasks to generate real conflicts and show reports.
+ */
+static long test_dummy;
+static long test_flags;
+static long test_scoped;
+static noinline void test_thread(unsigned long iters)
+{
+ const long CHANGE_BITS = 0xff00ff00ff00ff00L;
+ const struct kcsan_ctx ctx_save = current->kcsan_ctx;
+ cycles_t cycles;
+
+ /* We may have been called from an atomic region; reset context. */
+ memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
+
+ pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
+ pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n",
+ &test_dummy, &test_flags, &test_scoped);
+
+ cycles = get_cycles();
+ while (iters--) {
+ /* These all should generate reports. */
+ __kcsan_check_read(&test_dummy, sizeof(test_dummy));
+ ASSERT_EXCLUSIVE_WRITER(test_dummy);
+ ASSERT_EXCLUSIVE_ACCESS(test_dummy);
+
+ ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */
+ __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
+
+ ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */
+ __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
+
+ /* not actually instrumented */
+ WRITE_ONCE(test_dummy, iters); /* to observe value-change */
+ __kcsan_check_write(&test_dummy, sizeof(test_dummy));
+
+ test_flags ^= CHANGE_BITS; /* generate value-change */
+ __kcsan_check_write(&test_flags, sizeof(test_flags));
+
+ BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
+ {
+ /* Should generate reports anywhere in this block. */
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped);
+ ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped);
+ BUG_ON(!current->kcsan_ctx.scoped_accesses.prev);
+ /* Unrelated accesses. */
+ __kcsan_check_access(&cycles, sizeof(cycles), 0);
+ __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC);
+ }
+ BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
+ }
+ cycles = get_cycles() - cycles;
+
+ pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
+
+ /* restore context */
+ current->kcsan_ctx = ctx_save;
+}
+
+static int cmp_filterlist_addrs(const void *rhs, const void *lhs)
+{
+ const unsigned long a = *(const unsigned long *)rhs;
+ const unsigned long b = *(const unsigned long *)lhs;
+
+ return a < b ? -1 : a == b ? 0 : 1;
+}
+
+bool kcsan_skip_report_debugfs(unsigned long func_addr)
+{
+ unsigned long symbolsize, offset;
+ unsigned long flags;
+ bool ret = false;
+
+ if (!kallsyms_lookup_size_offset(func_addr, &symbolsize, &offset))
+ return false;
+ func_addr -= offset; /* Get function start */
+
+ spin_lock_irqsave(&report_filterlist_lock, flags);
+ if (report_filterlist.used == 0)
+ goto out;
+
+ /* Sort array if it is unsorted, and then do a binary search. */
+ if (!report_filterlist.sorted) {
+ sort(report_filterlist.addrs, report_filterlist.used,
+ sizeof(unsigned long), cmp_filterlist_addrs, NULL);
+ report_filterlist.sorted = true;
+ }
+ ret = !!bsearch(&func_addr, report_filterlist.addrs,
+ report_filterlist.used, sizeof(unsigned long),
+ cmp_filterlist_addrs);
+ if (report_filterlist.whitelist)
+ ret = !ret;
+
+out:
+ spin_unlock_irqrestore(&report_filterlist_lock, flags);
+ return ret;
+}
+
+static void set_report_filterlist_whitelist(bool whitelist)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&report_filterlist_lock, flags);
+ report_filterlist.whitelist = whitelist;
+ spin_unlock_irqrestore(&report_filterlist_lock, flags);
+}
+
+/* Returns 0 on success, error-code otherwise. */
+static ssize_t insert_report_filterlist(const char *func)
+{
+ unsigned long flags;
+ unsigned long addr = kallsyms_lookup_name(func);
+ ssize_t ret = 0;
+
+ if (!addr) {
+ pr_err("KCSAN: could not find function: '%s'\n", func);
+ return -ENOENT;
+ }
+
+ spin_lock_irqsave(&report_filterlist_lock, flags);
+
+ if (report_filterlist.addrs == NULL) {
+ /* initial allocation */
+ report_filterlist.addrs =
+ kmalloc_array(report_filterlist.size,
+ sizeof(unsigned long), GFP_ATOMIC);
+ if (report_filterlist.addrs == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else if (report_filterlist.used == report_filterlist.size) {
+ /* resize filterlist */
+ size_t new_size = report_filterlist.size * 2;
+ unsigned long *new_addrs =
+ krealloc(report_filterlist.addrs,
+ new_size * sizeof(unsigned long), GFP_ATOMIC);
+
+ if (new_addrs == NULL) {
+ /* leave filterlist itself untouched */
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ report_filterlist.size = new_size;
+ report_filterlist.addrs = new_addrs;
+ }
+
+ /* Note: deduplicating should be done in userspace. */
+ report_filterlist.addrs[report_filterlist.used++] =
+ kallsyms_lookup_name(func);
+ report_filterlist.sorted = false;
+
+out:
+ spin_unlock_irqrestore(&report_filterlist_lock, flags);
+
+ return ret;
+}
+
+static int show_info(struct seq_file *file, void *v)
+{
+ int i;
+ unsigned long flags;
+
+ /* show stats */
+ seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled));
+ for (i = 0; i < KCSAN_COUNTER_COUNT; ++i)
+ seq_printf(file, "%s: %ld\n", counter_to_name(i),
+ atomic_long_read(&counters[i]));
+
+ /* show filter functions, and filter type */
+ spin_lock_irqsave(&report_filterlist_lock, flags);
+ seq_printf(file, "\n%s functions: %s\n",
+ report_filterlist.whitelist ? "whitelisted" : "blacklisted",
+ report_filterlist.used == 0 ? "none" : "");
+ for (i = 0; i < report_filterlist.used; ++i)
+ seq_printf(file, " %ps\n", (void *)report_filterlist.addrs[i]);
+ spin_unlock_irqrestore(&report_filterlist_lock, flags);
+
+ return 0;
+}
+
+static int debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_info, NULL);
+}
+
+static ssize_t
+debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *off)
+{
+ char kbuf[KSYM_NAME_LEN];
+ char *arg;
+ int read_len = count < (sizeof(kbuf) - 1) ? count : (sizeof(kbuf) - 1);
+
+ if (copy_from_user(kbuf, buf, read_len))
+ return -EFAULT;
+ kbuf[read_len] = '\0';
+ arg = strstrip(kbuf);
+
+ if (!strcmp(arg, "on")) {
+ WRITE_ONCE(kcsan_enabled, true);
+ } else if (!strcmp(arg, "off")) {
+ WRITE_ONCE(kcsan_enabled, false);
+ } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) {
+ unsigned long iters;
+
+ if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters))
+ return -EINVAL;
+ microbenchmark(iters);
+ } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) {
+ unsigned long iters;
+
+ if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters))
+ return -EINVAL;
+ test_thread(iters);
+ } else if (!strcmp(arg, "whitelist")) {
+ set_report_filterlist_whitelist(true);
+ } else if (!strcmp(arg, "blacklist")) {
+ set_report_filterlist_whitelist(false);
+ } else if (arg[0] == '!') {
+ ssize_t ret = insert_report_filterlist(&arg[1]);
+
+ if (ret < 0)
+ return ret;
+ } else {
+ return -EINVAL;
+ }
+
+ return count;
+}
+
+static const struct file_operations debugfs_ops =
+{
+ .read = seq_read,
+ .open = debugfs_open,
+ .write = debugfs_write,
+ .release = single_release
+};
+
+void __init kcsan_debugfs_init(void)
+{
+ debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops);
+}
diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h
new file mode 100644
index 000000000000..f03562aaf2eb
--- /dev/null
+++ b/kernel/kcsan/encoding.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_KCSAN_ENCODING_H
+#define _KERNEL_KCSAN_ENCODING_H
+
+#include <linux/bits.h>
+#include <linux/log2.h>
+#include <linux/mm.h>
+
+#include "kcsan.h"
+
+#define SLOT_RANGE PAGE_SIZE
+
+#define INVALID_WATCHPOINT 0
+#define CONSUMED_WATCHPOINT 1
+
+/*
+ * The maximum useful size of accesses for which we set up watchpoints is the
+ * max range of slots we check on an access.
+ */
+#define MAX_ENCODABLE_SIZE (SLOT_RANGE * (1 + KCSAN_CHECK_ADJACENT))
+
+/*
+ * Number of bits we use to store size info.
+ */
+#define WATCHPOINT_SIZE_BITS bits_per(MAX_ENCODABLE_SIZE)
+/*
+ * This encoding for addresses discards the upper (1 for is-write + SIZE_BITS);
+ * however, most 64-bit architectures do not use the full 64-bit address space.
+ * Also, in order for a false positive to be observable 2 things need to happen:
+ *
+ * 1. different addresses but with the same encoded address race;
+ * 2. and both map onto the same watchpoint slots;
+ *
+ * Both these are assumed to be very unlikely. However, in case it still happens
+ * happens, the report logic will filter out the false positive (see report.c).
+ */
+#define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS)
+
+/*
+ * Masks to set/retrieve the encoded data.
+ */
+#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1)
+#define WATCHPOINT_SIZE_MASK \
+ GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS)
+#define WATCHPOINT_ADDR_MASK \
+ GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0)
+
+static inline bool check_encodable(unsigned long addr, size_t size)
+{
+ return size <= MAX_ENCODABLE_SIZE;
+}
+
+static inline long
+encode_watchpoint(unsigned long addr, size_t size, bool is_write)
+{
+ return (long)((is_write ? WATCHPOINT_WRITE_MASK : 0) |
+ (size << WATCHPOINT_ADDR_BITS) |
+ (addr & WATCHPOINT_ADDR_MASK));
+}
+
+static __always_inline bool decode_watchpoint(long watchpoint,
+ unsigned long *addr_masked,
+ size_t *size,
+ bool *is_write)
+{
+ if (watchpoint == INVALID_WATCHPOINT ||
+ watchpoint == CONSUMED_WATCHPOINT)
+ return false;
+
+ *addr_masked = (unsigned long)watchpoint & WATCHPOINT_ADDR_MASK;
+ *size = ((unsigned long)watchpoint & WATCHPOINT_SIZE_MASK) >> WATCHPOINT_ADDR_BITS;
+ *is_write = !!((unsigned long)watchpoint & WATCHPOINT_WRITE_MASK);
+
+ return true;
+}
+
+/*
+ * Return watchpoint slot for an address.
+ */
+static __always_inline int watchpoint_slot(unsigned long addr)
+{
+ return (addr / PAGE_SIZE) % CONFIG_KCSAN_NUM_WATCHPOINTS;
+}
+
+static __always_inline bool matching_access(unsigned long addr1, size_t size1,
+ unsigned long addr2, size_t size2)
+{
+ unsigned long end_range1 = addr1 + size1 - 1;
+ unsigned long end_range2 = addr2 + size2 - 1;
+
+ return addr1 <= end_range2 && addr2 <= end_range1;
+}
+
+#endif /* _KERNEL_KCSAN_ENCODING_H */
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
new file mode 100644
index 000000000000..763d6d08d94b
--- /dev/null
+++ b/kernel/kcsan/kcsan.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * The Kernel Concurrency Sanitizer (KCSAN) infrastructure. For more info please
+ * see Documentation/dev-tools/kcsan.rst.
+ */
+
+#ifndef _KERNEL_KCSAN_KCSAN_H
+#define _KERNEL_KCSAN_KCSAN_H
+
+#include <linux/kcsan.h>
+
+/* The number of adjacent watchpoints to check. */
+#define KCSAN_CHECK_ADJACENT 1
+#define NUM_SLOTS (1 + 2*KCSAN_CHECK_ADJACENT)
+
+extern unsigned int kcsan_udelay_task;
+extern unsigned int kcsan_udelay_interrupt;
+
+/*
+ * Globally enable and disable KCSAN.
+ */
+extern bool kcsan_enabled;
+
+/*
+ * Initialize debugfs file.
+ */
+void kcsan_debugfs_init(void);
+
+enum kcsan_counter_id {
+ /*
+ * Number of watchpoints currently in use.
+ */
+ KCSAN_COUNTER_USED_WATCHPOINTS,
+
+ /*
+ * Total number of watchpoints set up.
+ */
+ KCSAN_COUNTER_SETUP_WATCHPOINTS,
+
+ /*
+ * Total number of data races.
+ */
+ KCSAN_COUNTER_DATA_RACES,
+
+ /*
+ * Total number of ASSERT failures due to races. If the observed race is
+ * due to two conflicting ASSERT type accesses, then both will be
+ * counted.
+ */
+ KCSAN_COUNTER_ASSERT_FAILURES,
+
+ /*
+ * Number of times no watchpoints were available.
+ */
+ KCSAN_COUNTER_NO_CAPACITY,
+
+ /*
+ * A thread checking a watchpoint raced with another checking thread;
+ * only one will be reported.
+ */
+ KCSAN_COUNTER_REPORT_RACES,
+
+ /*
+ * Observed data value change, but writer thread unknown.
+ */
+ KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN,
+
+ /*
+ * The access cannot be encoded to a valid watchpoint.
+ */
+ KCSAN_COUNTER_UNENCODABLE_ACCESSES,
+
+ /*
+ * Watchpoint encoding caused a watchpoint to fire on mismatching
+ * accesses.
+ */
+ KCSAN_COUNTER_ENCODING_FALSE_POSITIVES,
+
+ KCSAN_COUNTER_COUNT, /* number of counters */
+};
+
+/*
+ * Increment/decrement counter with given id; avoid calling these in fast-path.
+ */
+extern void kcsan_counter_inc(enum kcsan_counter_id id);
+extern void kcsan_counter_dec(enum kcsan_counter_id id);
+
+/*
+ * Returns true if data races in the function symbol that maps to func_addr
+ * (offsets are ignored) should *not* be reported.
+ */
+extern bool kcsan_skip_report_debugfs(unsigned long func_addr);
+
+/*
+ * Value-change states.
+ */
+enum kcsan_value_change {
+ /*
+ * Did not observe a value-change, however, it is valid to report the
+ * race, depending on preferences.
+ */
+ KCSAN_VALUE_CHANGE_MAYBE,
+
+ /*
+ * Did not observe a value-change, and it is invalid to report the race.
+ */
+ KCSAN_VALUE_CHANGE_FALSE,
+
+ /*
+ * The value was observed to change, and the race should be reported.
+ */
+ KCSAN_VALUE_CHANGE_TRUE,
+};
+
+enum kcsan_report_type {
+ /*
+ * The thread that set up the watchpoint and briefly stalled was
+ * signalled that another thread triggered the watchpoint.
+ */
+ KCSAN_REPORT_RACE_SIGNAL,
+
+ /*
+ * A thread found and consumed a matching watchpoint.
+ */
+ KCSAN_REPORT_CONSUMED_WATCHPOINT,
+
+ /*
+ * No other thread was observed to race with the access, but the data
+ * value before and after the stall differs.
+ */
+ KCSAN_REPORT_RACE_UNKNOWN_ORIGIN,
+};
+
+/*
+ * Print a race report from thread that encountered the race.
+ */
+extern void kcsan_report(const volatile void *ptr, size_t size, int access_type,
+ enum kcsan_value_change value_change,
+ enum kcsan_report_type type, int watchpoint_idx);
+
+#endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
new file mode 100644
index 000000000000..ac5f8345bae9
--- /dev/null
+++ b/kernel/kcsan/report.c
@@ -0,0 +1,634 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debug_locks.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/preempt.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/stacktrace.h>
+
+#include "kcsan.h"
+#include "encoding.h"
+
+/*
+ * Max. number of stack entries to show in the report.
+ */
+#define NUM_STACK_ENTRIES 64
+
+/* Common access info. */
+struct access_info {
+ const volatile void *ptr;
+ size_t size;
+ int access_type;
+ int task_pid;
+ int cpu_id;
+};
+
+/*
+ * Other thread info: communicated from other racing thread to thread that set
+ * up the watchpoint, which then prints the complete report atomically.
+ */
+struct other_info {
+ struct access_info ai;
+ unsigned long stack_entries[NUM_STACK_ENTRIES];
+ int num_stack_entries;
+
+ /*
+ * Optionally pass @current. Typically we do not need to pass @current
+ * via @other_info since just @task_pid is sufficient. Passing @current
+ * has additional overhead.
+ *
+ * To safely pass @current, we must either use get_task_struct/
+ * put_task_struct, or stall the thread that populated @other_info.
+ *
+ * We cannot rely on get_task_struct/put_task_struct in case
+ * release_report() races with a task being released, and would have to
+ * free it in release_report(). This may result in deadlock if we want
+ * to use KCSAN on the allocators.
+ *
+ * Since we also want to reliably print held locks for
+ * CONFIG_KCSAN_VERBOSE, the current implementation stalls the thread
+ * that populated @other_info until it has been consumed.
+ */
+ struct task_struct *task;
+};
+
+/*
+ * To never block any producers of struct other_info, we need as many elements
+ * as we have watchpoints (upper bound on concurrent races to report).
+ */
+static struct other_info other_infos[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
+
+/*
+ * Information about reported races; used to rate limit reporting.
+ */
+struct report_time {
+ /*
+ * The last time the race was reported.
+ */
+ unsigned long time;
+
+ /*
+ * The frames of the 2 threads; if only 1 thread is known, one frame
+ * will be 0.
+ */
+ unsigned long frame1;
+ unsigned long frame2;
+};
+
+/*
+ * Since we also want to be able to debug allocators with KCSAN, to avoid
+ * deadlock, report_times cannot be dynamically resized with krealloc in
+ * rate_limit_report.
+ *
+ * Therefore, we use a fixed-size array, which at most will occupy a page. This
+ * still adequately rate limits reports, assuming that a) number of unique data
+ * races is not excessive, and b) occurrence of unique races within the
+ * same time window is limited.
+ */
+#define REPORT_TIMES_MAX (PAGE_SIZE / sizeof(struct report_time))
+#define REPORT_TIMES_SIZE \
+ (CONFIG_KCSAN_REPORT_ONCE_IN_MS > REPORT_TIMES_MAX ? \
+ REPORT_TIMES_MAX : \
+ CONFIG_KCSAN_REPORT_ONCE_IN_MS)
+static struct report_time report_times[REPORT_TIMES_SIZE];
+
+/*
+ * Spinlock serializing report generation, and access to @other_infos. Although
+ * it could make sense to have a finer-grained locking story for @other_infos,
+ * report generation needs to be serialized either way, so not much is gained.
+ */
+static DEFINE_RAW_SPINLOCK(report_lock);
+
+/*
+ * Checks if the race identified by thread frames frame1 and frame2 has
+ * been reported since (now - KCSAN_REPORT_ONCE_IN_MS).
+ */
+static bool rate_limit_report(unsigned long frame1, unsigned long frame2)
+{
+ struct report_time *use_entry = &report_times[0];
+ unsigned long invalid_before;
+ int i;
+
+ BUILD_BUG_ON(CONFIG_KCSAN_REPORT_ONCE_IN_MS != 0 && REPORT_TIMES_SIZE == 0);
+
+ if (CONFIG_KCSAN_REPORT_ONCE_IN_MS == 0)
+ return false;
+
+ invalid_before = jiffies - msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS);
+
+ /* Check if a matching race report exists. */
+ for (i = 0; i < REPORT_TIMES_SIZE; ++i) {
+ struct report_time *rt = &report_times[i];
+
+ /*
+ * Must always select an entry for use to store info as we
+ * cannot resize report_times; at the end of the scan, use_entry
+ * will be the oldest entry, which ideally also happened before
+ * KCSAN_REPORT_ONCE_IN_MS ago.
+ */
+ if (time_before(rt->time, use_entry->time))
+ use_entry = rt;
+
+ /*
+ * Initially, no need to check any further as this entry as well
+ * as following entries have never been used.
+ */
+ if (rt->time == 0)
+ break;
+
+ /* Check if entry expired. */
+ if (time_before(rt->time, invalid_before))
+ continue; /* before KCSAN_REPORT_ONCE_IN_MS ago */
+
+ /* Reported recently, check if race matches. */
+ if ((rt->frame1 == frame1 && rt->frame2 == frame2) ||
+ (rt->frame1 == frame2 && rt->frame2 == frame1))
+ return true;
+ }
+
+ use_entry->time = jiffies;
+ use_entry->frame1 = frame1;
+ use_entry->frame2 = frame2;
+ return false;
+}
+
+/*
+ * Special rules to skip reporting.
+ */
+static bool
+skip_report(enum kcsan_value_change value_change, unsigned long top_frame)
+{
+ /* Should never get here if value_change==FALSE. */
+ WARN_ON_ONCE(value_change == KCSAN_VALUE_CHANGE_FALSE);
+
+ /*
+ * The first call to skip_report always has value_change==TRUE, since we
+ * cannot know the value written of an instrumented access. For the 2nd
+ * call there are 6 cases with CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY:
+ *
+ * 1. read watchpoint, conflicting write (value_change==TRUE): report;
+ * 2. read watchpoint, conflicting write (value_change==MAYBE): skip;
+ * 3. write watchpoint, conflicting write (value_change==TRUE): report;
+ * 4. write watchpoint, conflicting write (value_change==MAYBE): skip;
+ * 5. write watchpoint, conflicting read (value_change==MAYBE): skip;
+ * 6. write watchpoint, conflicting read (value_change==TRUE): report;
+ *
+ * Cases 1-4 are intuitive and expected; case 5 ensures we do not report
+ * data races where the write may have rewritten the same value; case 6
+ * is possible either if the size is larger than what we check value
+ * changes for or the access type is KCSAN_ACCESS_ASSERT.
+ */
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY) &&
+ value_change == KCSAN_VALUE_CHANGE_MAYBE) {
+ /*
+ * The access is a write, but the data value did not change.
+ *
+ * We opt-out of this filter for certain functions at request of
+ * maintainers.
+ */
+ char buf[64];
+ int len = scnprintf(buf, sizeof(buf), "%ps", (void *)top_frame);
+
+ if (!strnstr(buf, "rcu_", len) &&
+ !strnstr(buf, "_rcu", len) &&
+ !strnstr(buf, "_srcu", len))
+ return true;
+ }
+
+ return kcsan_skip_report_debugfs(top_frame);
+}
+
+static const char *get_access_type(int type)
+{
+ if (type & KCSAN_ACCESS_ASSERT) {
+ if (type & KCSAN_ACCESS_SCOPED) {
+ if (type & KCSAN_ACCESS_WRITE)
+ return "assert no accesses (scoped)";
+ else
+ return "assert no writes (scoped)";
+ } else {
+ if (type & KCSAN_ACCESS_WRITE)
+ return "assert no accesses";
+ else
+ return "assert no writes";
+ }
+ }
+
+ switch (type) {
+ case 0:
+ return "read";
+ case KCSAN_ACCESS_ATOMIC:
+ return "read (marked)";
+ case KCSAN_ACCESS_WRITE:
+ return "write";
+ case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "write (marked)";
+ case KCSAN_ACCESS_SCOPED:
+ return "read (scoped)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
+ return "read (marked, scoped)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE:
+ return "write (scoped)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "write (marked, scoped)";
+ default:
+ BUG();
+ }
+}
+
+static const char *get_bug_type(int type)
+{
+ return (type & KCSAN_ACCESS_ASSERT) != 0 ? "assert: race" : "data-race";
+}
+
+/* Return thread description: in task or interrupt. */
+static const char *get_thread_desc(int task_id)
+{
+ if (task_id != -1) {
+ static char buf[32]; /* safe: protected by report_lock */
+
+ snprintf(buf, sizeof(buf), "task %i", task_id);
+ return buf;
+ }
+ return "interrupt";
+}
+
+/* Helper to skip KCSAN-related functions in stack-trace. */
+static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries)
+{
+ char buf[64];
+ char *cur;
+ int len, skip;
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skip]);
+
+ /* Never show tsan_* or {read,write}_once_size. */
+ if (strnstr(buf, "tsan_", len) ||
+ strnstr(buf, "_once_size", len))
+ continue;
+
+ cur = strnstr(buf, "kcsan_", len);
+ if (cur) {
+ cur += sizeof("kcsan_") - 1;
+ if (strncmp(cur, "test", sizeof("test") - 1))
+ continue; /* KCSAN runtime function. */
+ /* KCSAN related test. */
+ }
+
+ /*
+ * No match for runtime functions -- @skip entries to skip to
+ * get to first frame of interest.
+ */
+ break;
+ }
+
+ return skip;
+}
+
+/* Compares symbolized strings of addr1 and addr2. */
+static int sym_strcmp(void *addr1, void *addr2)
+{
+ char buf1[64];
+ char buf2[64];
+
+ snprintf(buf1, sizeof(buf1), "%pS", addr1);
+ snprintf(buf2, sizeof(buf2), "%pS", addr2);
+
+ return strncmp(buf1, buf2, sizeof(buf1));
+}
+
+static void print_verbose_info(struct task_struct *task)
+{
+ if (!task)
+ return;
+
+ pr_err("\n");
+ debug_show_held_locks(task);
+ print_irqtrace_events(task);
+}
+
+/*
+ * Returns true if a report was generated, false otherwise.
+ */
+static bool print_report(enum kcsan_value_change value_change,
+ enum kcsan_report_type type,
+ const struct access_info *ai,
+ const struct other_info *other_info)
+{
+ unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
+ int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
+ int skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
+ unsigned long this_frame = stack_entries[skipnr];
+ unsigned long other_frame = 0;
+ int other_skipnr = 0; /* silence uninit warnings */
+
+ /*
+ * Must check report filter rules before starting to print.
+ */
+ if (skip_report(KCSAN_VALUE_CHANGE_TRUE, stack_entries[skipnr]))
+ return false;
+
+ if (type == KCSAN_REPORT_RACE_SIGNAL) {
+ other_skipnr = get_stack_skipnr(other_info->stack_entries,
+ other_info->num_stack_entries);
+ other_frame = other_info->stack_entries[other_skipnr];
+
+ /* @value_change is only known for the other thread */
+ if (skip_report(value_change, other_frame))
+ return false;
+ }
+
+ if (rate_limit_report(this_frame, other_frame))
+ return false;
+
+ /* Print report header. */
+ pr_err("==================================================================\n");
+ switch (type) {
+ case KCSAN_REPORT_RACE_SIGNAL: {
+ int cmp;
+
+ /*
+ * Order functions lexographically for consistent bug titles.
+ * Do not print offset of functions to keep title short.
+ */
+ cmp = sym_strcmp((void *)other_frame, (void *)this_frame);
+ pr_err("BUG: KCSAN: %s in %ps / %ps\n",
+ get_bug_type(ai->access_type | other_info->ai.access_type),
+ (void *)(cmp < 0 ? other_frame : this_frame),
+ (void *)(cmp < 0 ? this_frame : other_frame));
+ } break;
+
+ case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ pr_err("BUG: KCSAN: %s in %pS\n", get_bug_type(ai->access_type),
+ (void *)this_frame);
+ break;
+
+ default:
+ BUG();
+ }
+
+ pr_err("\n");
+
+ /* Print information about the racing accesses. */
+ switch (type) {
+ case KCSAN_REPORT_RACE_SIGNAL:
+ pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
+ get_access_type(other_info->ai.access_type), other_info->ai.ptr,
+ other_info->ai.size, get_thread_desc(other_info->ai.task_pid),
+ other_info->ai.cpu_id);
+
+ /* Print the other thread's stack trace. */
+ stack_trace_print(other_info->stack_entries + other_skipnr,
+ other_info->num_stack_entries - other_skipnr,
+ 0);
+
+ if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
+ print_verbose_info(other_info->task);
+
+ pr_err("\n");
+ pr_err("%s to 0x%px of %zu bytes by %s on cpu %i:\n",
+ get_access_type(ai->access_type), ai->ptr, ai->size,
+ get_thread_desc(ai->task_pid), ai->cpu_id);
+ break;
+
+ case KCSAN_REPORT_RACE_UNKNOWN_ORIGIN:
+ pr_err("race at unknown origin, with %s to 0x%px of %zu bytes by %s on cpu %i:\n",
+ get_access_type(ai->access_type), ai->ptr, ai->size,
+ get_thread_desc(ai->task_pid), ai->cpu_id);
+ break;
+
+ default:
+ BUG();
+ }
+ /* Print stack trace of this thread. */
+ stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
+ 0);
+
+ if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
+ print_verbose_info(current);
+
+ /* Print report footer. */
+ pr_err("\n");
+ pr_err("Reported by Kernel Concurrency Sanitizer on:\n");
+ dump_stack_print_info(KERN_DEFAULT);
+ pr_err("==================================================================\n");
+
+ return true;
+}
+
+static void release_report(unsigned long *flags, struct other_info *other_info)
+{
+ if (other_info)
+ /*
+ * Use size to denote valid/invalid, since KCSAN entirely
+ * ignores 0-sized accesses.
+ */
+ other_info->ai.size = 0;
+
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+}
+
+/*
+ * Sets @other_info->task and awaits consumption of @other_info.
+ *
+ * Precondition: report_lock is held.
+ * Postcondition: report_lock is held.
+ */
+static void set_other_info_task_blocking(unsigned long *flags,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+ /*
+ * We may be instrumenting a code-path where current->state is already
+ * something other than TASK_RUNNING.
+ */
+ const bool is_running = current->state == TASK_RUNNING;
+ /*
+ * To avoid deadlock in case we are in an interrupt here and this is a
+ * race with a task on the same CPU (KCSAN_INTERRUPT_WATCHER), provide a
+ * timeout to ensure this works in all contexts.
+ *
+ * Await approximately the worst case delay of the reporting thread (if
+ * we are not interrupted).
+ */
+ int timeout = max(kcsan_udelay_task, kcsan_udelay_interrupt);
+
+ other_info->task = current;
+ do {
+ if (is_running) {
+ /*
+ * Let lockdep know the real task is sleeping, to print
+ * the held locks (recall we turned lockdep off, so
+ * locking/unlocking @report_lock won't be recorded).
+ */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ }
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+ /*
+ * We cannot call schedule() since we also cannot reliably
+ * determine if sleeping here is permitted -- see in_atomic().
+ */
+
+ udelay(1);
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ if (timeout-- < 0) {
+ /*
+ * Abort. Reset @other_info->task to NULL, since it
+ * appears the other thread is still going to consume
+ * it. It will result in no verbose info printed for
+ * this task.
+ */
+ other_info->task = NULL;
+ break;
+ }
+ /*
+ * If invalid, or @ptr nor @current matches, then @other_info
+ * has been consumed and we may continue. If not, retry.
+ */
+ } while (other_info->ai.size && other_info->ai.ptr == ai->ptr &&
+ other_info->task == current);
+ if (is_running)
+ set_current_state(TASK_RUNNING);
+}
+
+/* Populate @other_info; requires that the provided @other_info not in use. */
+static void prepare_report_producer(unsigned long *flags,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+ raw_spin_lock_irqsave(&report_lock, *flags);
+
+ /*
+ * The same @other_infos entry cannot be used concurrently, because
+ * there is a one-to-one mapping to watchpoint slots (@watchpoints in
+ * core.c), and a watchpoint is only released for reuse after reporting
+ * is done by the consumer of @other_info. Therefore, it is impossible
+ * for another concurrent prepare_report_producer() to set the same
+ * @other_info, and are guaranteed exclusivity for the @other_infos
+ * entry pointed to by @other_info.
+ *
+ * To check this property holds, size should never be non-zero here,
+ * because every consumer of struct other_info resets size to 0 in
+ * release_report().
+ */
+ WARN_ON(other_info->ai.size);
+
+ other_info->ai = *ai;
+ other_info->num_stack_entries = stack_trace_save(other_info->stack_entries, NUM_STACK_ENTRIES, 2);
+
+ if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
+ set_other_info_task_blocking(flags, ai, other_info);
+
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+}
+
+/* Awaits producer to fill @other_info and then returns. */
+static bool prepare_report_consumer(unsigned long *flags,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ while (!other_info->ai.size) { /* Await valid @other_info. */
+ raw_spin_unlock_irqrestore(&report_lock, *flags);
+ cpu_relax();
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ }
+
+ /* Should always have a matching access based on watchpoint encoding. */
+ if (WARN_ON(!matching_access((unsigned long)other_info->ai.ptr & WATCHPOINT_ADDR_MASK, other_info->ai.size,
+ (unsigned long)ai->ptr & WATCHPOINT_ADDR_MASK, ai->size)))
+ goto discard;
+
+ if (!matching_access((unsigned long)other_info->ai.ptr, other_info->ai.size,
+ (unsigned long)ai->ptr, ai->size)) {
+ /*
+ * If the actual accesses to not match, this was a false
+ * positive due to watchpoint encoding.
+ */
+ kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES);
+ goto discard;
+ }
+
+ return true;
+
+discard:
+ release_report(flags, other_info);
+ return false;
+}
+
+/*
+ * Depending on the report type either sets @other_info and returns false, or
+ * awaits @other_info and returns true. If @other_info is not required for the
+ * report type, simply acquires @report_lock and returns true.
+ */
+static noinline bool prepare_report(unsigned long *flags,
+ enum kcsan_report_type type,
+ const struct access_info *ai,
+ struct other_info *other_info)
+{
+ switch (type) {
+ case KCSAN_REPORT_CONSUMED_WATCHPOINT:
+ prepare_report_producer(flags, ai, other_info);
+ return false;
+ case KCSAN_REPORT_RACE_SIGNAL:
+ return prepare_report_consumer(flags, ai, other_info);
+ default:
+ /* @other_info not required; just acquire @report_lock. */
+ raw_spin_lock_irqsave(&report_lock, *flags);
+ return true;
+ }
+}
+
+void kcsan_report(const volatile void *ptr, size_t size, int access_type,
+ enum kcsan_value_change value_change,
+ enum kcsan_report_type type, int watchpoint_idx)
+{
+ unsigned long flags = 0;
+ const struct access_info ai = {
+ .ptr = ptr,
+ .size = size,
+ .access_type = access_type,
+ .task_pid = in_task() ? task_pid_nr(current) : -1,
+ .cpu_id = raw_smp_processor_id()
+ };
+ struct other_info *other_info = type == KCSAN_REPORT_RACE_UNKNOWN_ORIGIN
+ ? NULL : &other_infos[watchpoint_idx];
+
+ kcsan_disable_current();
+ if (WARN_ON(watchpoint_idx < 0 || watchpoint_idx >= ARRAY_SIZE(other_infos)))
+ goto out;
+
+ /*
+ * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if
+ * we do not turn off lockdep here; this could happen due to recursion
+ * into lockdep via KCSAN if we detect a race in utilities used by
+ * lockdep.
+ */
+ lockdep_off();
+
+ if (prepare_report(&flags, type, &ai, other_info)) {
+ /*
+ * Never report if value_change is FALSE, only if we it is
+ * either TRUE or MAYBE. In case of MAYBE, further filtering may
+ * be done once we know the full stack trace in print_report().
+ */
+ bool reported = value_change != KCSAN_VALUE_CHANGE_FALSE &&
+ print_report(value_change, type, &ai, other_info);
+
+ if (reported && panic_on_warn)
+ panic("panic_on_warn set ...\n");
+
+ release_report(&flags, other_info);
+ }
+
+ lockdep_on();
+out:
+ kcsan_enable_current();
+}
diff --git a/kernel/kcsan/test.c b/kernel/kcsan/test.c
new file mode 100644
index 000000000000..d26a052d3383
--- /dev/null
+++ b/kernel/kcsan/test.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/types.h>
+
+#include "encoding.h"
+
+#define ITERS_PER_TEST 2000
+
+/* Test requirements. */
+static bool test_requires(void)
+{
+ /* random should be initialized for the below tests */
+ return prandom_u32() + prandom_u32() != 0;
+}
+
+/*
+ * Test watchpoint encode and decode: check that encoding some access's info,
+ * and then subsequent decode preserves the access's info.
+ */
+static bool test_encode_decode(void)
+{
+ int i;
+
+ for (i = 0; i < ITERS_PER_TEST; ++i) {
+ size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
+ bool is_write = !!prandom_u32_max(2);
+ unsigned long addr;
+
+ prandom_bytes(&addr, sizeof(addr));
+ if (WARN_ON(!check_encodable(addr, size)))
+ return false;
+
+ /* Encode and decode */
+ {
+ const long encoded_watchpoint =
+ encode_watchpoint(addr, size, is_write);
+ unsigned long verif_masked_addr;
+ size_t verif_size;
+ bool verif_is_write;
+
+ /* Check special watchpoints */
+ if (WARN_ON(decode_watchpoint(
+ INVALID_WATCHPOINT, &verif_masked_addr,
+ &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(decode_watchpoint(
+ CONSUMED_WATCHPOINT, &verif_masked_addr,
+ &verif_size, &verif_is_write)))
+ return false;
+
+ /* Check decoding watchpoint returns same data */
+ if (WARN_ON(!decode_watchpoint(
+ encoded_watchpoint, &verif_masked_addr,
+ &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(verif_masked_addr !=
+ (addr & WATCHPOINT_ADDR_MASK)))
+ goto fail;
+ if (WARN_ON(verif_size != size))
+ goto fail;
+ if (WARN_ON(is_write != verif_is_write))
+ goto fail;
+
+ continue;
+fail:
+ pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
+ __func__, is_write ? "write" : "read", size,
+ addr, encoded_watchpoint,
+ verif_is_write ? "write" : "read", verif_size,
+ verif_masked_addr);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Test access matching function. */
+static bool test_matching_access(void)
+{
+ if (WARN_ON(!matching_access(10, 1, 10, 1)))
+ return false;
+ if (WARN_ON(!matching_access(10, 2, 11, 1)))
+ return false;
+ if (WARN_ON(!matching_access(10, 1, 9, 2)))
+ return false;
+ if (WARN_ON(matching_access(10, 1, 11, 1)))
+ return false;
+ if (WARN_ON(matching_access(9, 1, 10, 1)))
+ return false;
+
+ /*
+ * An access of size 0 could match another access, as demonstrated here.
+ * Rather than add more comparisons to 'matching_access()', which would
+ * end up in the fast-path for *all* checks, check_access() simply
+ * returns for all accesses of size 0.
+ */
+ if (WARN_ON(!matching_access(8, 8, 12, 0)))
+ return false;
+
+ return true;
+}
+
+static int __init kcsan_selftest(void)
+{
+ int passed = 0;
+ int total = 0;
+
+#define RUN_TEST(do_test) \
+ do { \
+ ++total; \
+ if (do_test()) \
+ ++passed; \
+ else \
+ pr_err("KCSAN selftest: " #do_test " failed"); \
+ } while (0)
+
+ RUN_TEST(test_requires);
+ RUN_TEST(test_encode_decode);
+ RUN_TEST(test_matching_access);
+
+ pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total);
+ if (passed != total)
+ panic("KCSAN selftests failed");
+ return 0;
+}
+postcore_initcall(kcsan_selftest);
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index bb05fd52de85..09cc78df53c6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -181,34 +181,19 @@ void kimage_file_post_load_cleanup(struct kimage *image)
static int
kimage_validate_signature(struct kimage *image)
{
- const char *reason;
int ret;
ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
image->kernel_buf_len);
- switch (ret) {
- case 0:
- break;
+ if (ret) {
- /* Certain verification errors are non-fatal if we're not
- * checking errors, provided we aren't mandating that there
- * must be a valid signature.
- */
- case -ENODATA:
- reason = "kexec of unsigned image";
- goto decide;
- case -ENOPKG:
- reason = "kexec of image with unsupported crypto";
- goto decide;
- case -ENOKEY:
- reason = "kexec of image with unavailable key";
- decide:
if (IS_ENABLED(CONFIG_KEXEC_SIG_FORCE)) {
- pr_notice("%s rejected\n", reason);
+ pr_notice("Enforced kernel signature verification failed (%d).\n", ret);
return ret;
}
- /* If IMA is guaranteed to appraise a signature on the kexec
+ /*
+ * If IMA is guaranteed to appraise a signature on the kexec
* image, permit it even if the kernel is otherwise locked
* down.
*/
@@ -216,17 +201,10 @@ kimage_validate_signature(struct kimage *image)
security_locked_down(LOCKDOWN_KEXEC))
return -EPERM;
- return 0;
-
- /* All other errors are fatal, including nomem, unparseable
- * signatures and signature check failures - even if signatures
- * aren't required.
- */
- default:
- pr_notice("kernel signature verification failed (%d).\n", ret);
+ pr_debug("kernel signature verification failed (%d).\n", ret);
}
- return ret;
+ return 0;
}
#endif
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 50cd84f53df0..4a904cc56d68 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -46,6 +46,11 @@
static int kprobes_initialized;
+/* kprobe_table can be accessed by
+ * - Normal hlist traversal and RCU add/del under kprobe_mutex is held.
+ * Or
+ * - RCU hlist traversal under disabling preempt (breakpoint handlers)
+ */
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -326,7 +331,8 @@ struct kprobe *get_kprobe(void *addr)
struct kprobe *p;
head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry_rcu(p, head, hlist,
+ lockdep_is_held(&kprobe_mutex)) {
if (p->addr == addr)
return p;
}
@@ -586,11 +592,12 @@ static void kprobe_optimizer(struct work_struct *work)
mutex_unlock(&module_mutex);
mutex_unlock(&text_mutex);
cpus_read_unlock();
- mutex_unlock(&kprobe_mutex);
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
+
+ mutex_unlock(&kprobe_mutex);
}
/* Wait for completing optimization and unoptimization */
@@ -668,8 +675,6 @@ static void force_unoptimize_kprobe(struct optimized_kprobe *op)
lockdep_assert_cpus_held();
arch_unoptimize_kprobe(op);
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
- if (kprobe_disabled(&op->kp))
- arch_disarm_kprobe(&op->kp);
}
/* Unoptimize a kprobe if p is optimized */
@@ -849,7 +854,7 @@ static void optimize_all_kprobes(void)
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist)
+ hlist_for_each_entry(p, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
@@ -876,7 +881,7 @@ static void unoptimize_all_kprobes(void)
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry(p, head, hlist) {
if (!kprobe_disabled(p))
unoptimize_kprobe(p, false);
}
@@ -1236,6 +1241,26 @@ __releases(hlist_lock)
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);
+struct kprobe kprobe_busy = {
+ .addr = (void *) get_kprobe,
+};
+
+void kprobe_busy_begin(void)
+{
+ struct kprobe_ctlblk *kcb;
+
+ preempt_disable();
+ __this_cpu_write(current_kprobe, &kprobe_busy);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+}
+
+void kprobe_busy_end(void)
+{
+ __this_cpu_write(current_kprobe, NULL);
+ preempt_enable();
+}
+
/*
* This function is called from finish_task_switch when task tk becomes dead,
* so that we can recycle any function-return probe instances associated
@@ -1253,6 +1278,8 @@ void kprobe_flush_task(struct task_struct *tk)
/* Early boot. kretprobe_table_locks not yet initialized. */
return;
+ kprobe_busy_begin();
+
INIT_HLIST_HEAD(&empty_rp);
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
@@ -1266,6 +1293,8 @@ void kprobe_flush_task(struct task_struct *tk)
hlist_del(&ri->hlist);
kfree(ri);
}
+
+ kprobe_busy_end();
}
NOKPROBE_SYMBOL(kprobe_flush_task);
@@ -1499,12 +1528,14 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
+ lockdep_assert_held(&kprobe_mutex);
+
ap = get_kprobe(p->addr);
if (unlikely(!ap))
return NULL;
if (p != ap) {
- list_for_each_entry_rcu(list_p, &ap->list, list)
+ list_for_each_entry(list_p, &ap->list, list)
if (list_p == p)
/* kprobe p is a valid probe */
goto valid;
@@ -1669,7 +1700,9 @@ static int aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
- list_for_each_entry_rcu(kp, &ap->list, list)
+ lockdep_assert_held(&kprobe_mutex);
+
+ list_for_each_entry(kp, &ap->list, list)
if (!kprobe_disabled(kp))
/*
* There is an active probe on the list.
@@ -1748,7 +1781,7 @@ static int __unregister_kprobe_top(struct kprobe *p)
else {
/* If disabling probe has special handlers, update aggrprobe */
if (p->post_handler && !kprobe_gone(p)) {
- list_for_each_entry_rcu(list_p, &ap->list, list) {
+ list_for_each_entry(list_p, &ap->list, list) {
if ((list_p != p) && (list_p->post_handler))
goto noclean;
}
@@ -2062,13 +2095,15 @@ static void kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
+ lockdep_assert_held(&kprobe_mutex);
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
* If this is an aggr_kprobe, we have to list all the
* chained probes and mark them GONE.
*/
- list_for_each_entry_rcu(kp, &p->list, list)
+ list_for_each_entry(kp, &p->list, list)
kp->flags |= KPROBE_FLAG_GONE;
p->post_handler = NULL;
kill_optimized_kprobe(p);
@@ -2312,7 +2347,7 @@ static int kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist)
+ hlist_for_each_entry(p, head, hlist)
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2550,7 +2585,7 @@ static int arm_all_kprobes(void)
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
/* Arm all kprobes on a best-effort basis */
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry(p, head, hlist) {
if (!kprobe_disabled(p)) {
err = arm_kprobe(p);
if (err) {
@@ -2593,7 +2628,7 @@ static int disarm_all_kprobes(void)
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
/* Disarm all kprobes on a best-effort basis */
- hlist_for_each_entry_rcu(p, head, hlist) {
+ hlist_for_each_entry(p, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) {
err = disarm_kprobe(p, false);
if (err) {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bfbfa481be3a..132f84a5fde3 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1,13 +1,17 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
* Copyright (C) 2004 IBM Corporation, Rusty Russell.
+ * Copyright (C) 2009 Red Hat, Inc.
*
* Creation is done via kthreadd, so that we get a clean environment
* even if we're invoked from userspace (think modprobe, hotplug cpu,
* etc.).
*/
#include <uapi/linux/sched/types.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
@@ -25,6 +29,7 @@
#include <linux/numa.h>
#include <trace/events/sched.h>
+
static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
@@ -46,7 +51,9 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ int (*threadfn)(void *);
void *data;
+ mm_segment_t oldfs;
struct completion parked;
struct completion exited;
#ifdef CONFIG_BLK_CGROUP
@@ -153,6 +160,20 @@ bool kthread_freezable_should_stop(bool *was_frozen)
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
/**
+ * kthread_func - return the function specified on kthread creation
+ * @task: kthread task in question
+ *
+ * Returns NULL if the task is not a kthread.
+ */
+void *kthread_func(struct task_struct *task)
+{
+ if (task->flags & PF_KTHREAD)
+ return to_kthread(task)->threadfn;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(kthread_func);
+
+/**
* kthread_data - return data value specified on kthread creation
* @task: kthread task in question
*
@@ -164,6 +185,7 @@ void *kthread_data(struct task_struct *task)
{
return to_kthread(task)->data;
}
+EXPORT_SYMBOL_GPL(kthread_data);
/**
* kthread_probe_data - speculative version of kthread_data()
@@ -179,7 +201,7 @@ void *kthread_probe_data(struct task_struct *task)
struct kthread *kthread = to_kthread(task);
void *data = NULL;
- probe_kernel_read(&data, &kthread->data, sizeof(data));
+ copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
return data;
}
@@ -244,6 +266,7 @@ static int kthread(void *_create)
do_exit(-ENOMEM);
}
+ self->threadfn = threadfn;
self->data = data;
init_completion(&self->exited);
init_completion(&self->parked);
@@ -1203,6 +1226,61 @@ void kthread_destroy_worker(struct kthread_worker *worker)
}
EXPORT_SYMBOL(kthread_destroy_worker);
+/**
+ * kthread_use_mm - make the calling kthread operate on an address space
+ * @mm: address space to operate on
+ */
+void kthread_use_mm(struct mm_struct *mm)
+{
+ struct mm_struct *active_mm;
+ struct task_struct *tsk = current;
+
+ WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
+ WARN_ON_ONCE(tsk->mm);
+
+ task_lock(tsk);
+ active_mm = tsk->active_mm;
+ if (active_mm != mm) {
+ mmgrab(mm);
+ tsk->active_mm = mm;
+ }
+ tsk->mm = mm;
+ switch_mm(active_mm, mm, tsk);
+ task_unlock(tsk);
+#ifdef finish_arch_post_lock_switch
+ finish_arch_post_lock_switch();
+#endif
+
+ if (active_mm != mm)
+ mmdrop(active_mm);
+
+ to_kthread(tsk)->oldfs = get_fs();
+ set_fs(USER_DS);
+}
+EXPORT_SYMBOL_GPL(kthread_use_mm);
+
+/**
+ * kthread_unuse_mm - reverse the effect of kthread_use_mm()
+ * @mm: address space to operate on
+ */
+void kthread_unuse_mm(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
+ WARN_ON_ONCE(!tsk->mm);
+
+ set_fs(to_kthread(tsk)->oldfs);
+
+ task_lock(tsk);
+ sync_mm_rss(mm);
+ tsk->mm = NULL;
+ /* active_mm is still 'mm' */
+ enter_lazy_tlb(mm, tsk);
+ task_unlock(tsk);
+}
+EXPORT_SYMBOL_GPL(kthread_unuse_mm);
+
#ifdef CONFIG_BLK_CGROUP
/**
* kthread_associate_blkcg - associate blkcg to current kthread
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 45452facff3b..6d11cfb9b41f 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,6 +5,9 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
+# Avoid recursion lockdep -> KCSAN -> ... -> lockdep.
+KCSAN_SANITIZE_lockdep.o := n
+
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_lockdep_proc.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4c057dd8e93b..29a8de4c50b9 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -393,7 +393,7 @@ void lockdep_init_task(struct task_struct *task)
task->lockdep_recursion = 0;
}
-static inline void lockdep_recursion_finish(void)
+static __always_inline void lockdep_recursion_finish(void)
{
if (WARN_ON_ONCE(--current->lockdep_recursion))
current->lockdep_recursion = 0;
@@ -801,7 +801,7 @@ static int count_matching_names(struct lock_class *new_class)
}
/* used from NMI context -- must be lockless */
-static inline struct lock_class *
+static __always_inline struct lock_class *
look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
@@ -4424,7 +4424,7 @@ static void print_unlock_imbalance_bug(struct task_struct *curr,
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
pr_cont(") at:\n");
- print_ip_sym(ip);
+ print_ip_sym(KERN_WARNING, ip);
pr_warn("but there are no more locks to release!\n");
pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
@@ -5075,7 +5075,7 @@ static void print_lock_contention_bug(struct task_struct *curr,
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
pr_cont(") at:\n");
- print_ip_sym(ip);
+ print_ip_sym(KERN_WARNING, ip);
pr_warn("but there are no locks held!\n");
pr_warn("\nother info that might help us debug this:\n");
lockdep_print_held_locks(curr);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index fd4fe1f5b458..36e69100e8e0 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -125,7 +125,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
printk("\n%s/%d's [blocked] stackdump:\n\n",
task->comm, task_pid_nr(task));
- show_stack(task, NULL);
+ show_stack(task, NULL, KERN_DEFAULT);
printk("\n%s/%d's [current] stackdump:\n\n",
current->comm, task_pid_nr(current));
dump_stack();
diff --git a/kernel/module.c b/kernel/module.c
index e8a198588f26..0c6573b98c36 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2783,7 +2783,9 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
void * __weak module_alloc(unsigned long size)
{
- return vmalloc_exec(size);
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE, __func__);
}
bool __weak module_init_section(const char *name)
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index b03df67621d0..cd356630a311 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -531,7 +531,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
} else if (!IS_ERR(pidfd_pid(file))) {
err = check_setns_flags(flags);
} else {
- err = -EBADF;
+ err = -EINVAL;
}
if (err)
goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index 85568bbfb12b..e2157ca387c8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -680,10 +680,12 @@ device_initcall(register_warn_debugfs);
* Called when gcc's -fstack-protector feature is used, and
* gcc detects corruption of the on-stack canary value
*/
-__visible void __stack_chk_fail(void)
+__visible noinstr void __stack_chk_fail(void)
{
+ instrumentation_begin();
panic("stack-protector: Kernel stack is corrupted in: %pB",
__builtin_return_address(0));
+ instrumentation_end();
}
EXPORT_SYMBOL(__stack_chk_fail);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 4d0e6e815a2b..a7320f07689d 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -3,7 +3,7 @@ config SUSPEND
bool "Suspend to RAM and standby"
depends on ARCH_SUSPEND_POSSIBLE
default y
- ---help---
+ help
Allow the system to enter sleep states in which main memory is
powered and thus its contents are preserved, such as the
suspend-to-RAM state (e.g. the ACPI S3 state).
@@ -42,7 +42,7 @@ config HIBERNATION
select LZO_COMPRESS
select LZO_DECOMPRESS
select CRC32
- ---help---
+ help
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
system and powers it off; and restores that checkpoint on reboot.
@@ -84,7 +84,7 @@ config HIBERNATION_SNAPSHOT_DEV
bool "Userspace snapshot device"
depends on HIBERNATION
default y
- ---help---
+ help
Device used by the uswsusp tools.
Say N if no snapshotting from userspace is needed, this also
@@ -96,7 +96,7 @@ config PM_STD_PARTITION
string "Default resume partition"
depends on HIBERNATION
default ""
- ---help---
+ help
The default resume partition is the partition that the suspend-
to-disk implementation will look for a suspended disk image.
@@ -131,7 +131,7 @@ config PM_SLEEP_SMP_NONZERO_CPU
def_bool y
depends on PM_SLEEP_SMP
depends on ARCH_SUSPEND_NONZERO_CPU
- ---help---
+ help
If an arch can suspend (for suspend, hibernate, kexec, etc) on a
non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This
will allow nohz_full mask to include CPU0.
@@ -140,7 +140,7 @@ config PM_AUTOSLEEP
bool "Opportunistic sleep"
depends on PM_SLEEP
default n
- ---help---
+ help
Allow the kernel to trigger a system transition into a global sleep
state automatically whenever there are no active wakeup sources.
@@ -148,7 +148,7 @@ config PM_WAKELOCKS
bool "User space wakeup sources interface"
depends on PM_SLEEP
default n
- ---help---
+ help
Allow user space to create, activate and deactivate wakeup source
objects with the help of a sysfs-based interface.
@@ -165,7 +165,7 @@ config PM_WAKELOCKS_GC
config PM
bool "Device power management core functionality"
- ---help---
+ help
Enable functionality allowing I/O devices to be put into energy-saving
(low power) states, for example after a specified period of inactivity
(autosuspended), and woken up in response to a hardware-generated
@@ -179,7 +179,7 @@ config PM
config PM_DEBUG
bool "Power Management Debug Support"
depends on PM
- ---help---
+ help
This option enables various debugging support in the Power Management
code. This is helpful when debugging and reporting PM bugs, like
suspend support.
@@ -187,7 +187,7 @@ config PM_DEBUG
config PM_ADVANCED_DEBUG
bool "Extra PM attributes in sysfs for low-level debugging/testing"
depends on PM_DEBUG
- ---help---
+ help
Add extra sysfs attributes allowing one to access some Power Management
fields of device objects from user space. If you are not a kernel
developer interested in debugging/testing Power Management, say "no".
@@ -195,7 +195,7 @@ config PM_ADVANCED_DEBUG
config PM_TEST_SUSPEND
bool "Test suspend/resume and wakealarm during bootup"
depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
- ---help---
+ help
This option will let you suspend your machine during bootup, and
make it wake up a few seconds later using an RTC wakeup alarm.
Enable this with a kernel parameter like "test_suspend=mem".
@@ -210,7 +210,7 @@ config PM_SLEEP_DEBUG
config DPM_WATCHDOG
bool "Device suspend/resume watchdog"
depends on PM_DEBUG && PSTORE && EXPERT
- ---help---
+ help
Sets up a watchdog timer to capture drivers that are
locked up attempting to suspend/resume a device.
A detected lockup causes system panic with message
@@ -243,7 +243,7 @@ config PM_TRACE_RTC
depends on PM_SLEEP_DEBUG
depends on X86
select PM_TRACE
- ---help---
+ help
This enables some cheesy code to save the last PM event point in the
RTC across reboots, so that you can debug a machine that just hangs
during suspend (or more commonly, during resume).
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 659800157b17..881128b9351e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,7 +34,6 @@
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index ca0fcb5ced71..01e2858b5fe3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -1590,7 +1590,7 @@ int swsusp_unmark(void)
}
#endif
-static int swsusp_header_init(void)
+static int __init swsusp_header_init(void)
{
swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
if (!swsusp_header)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 3132d6f860a8..b71eaf5f5a86 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -35,7 +35,6 @@
#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/crash_core.h>
-#include <linux/kdb.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
@@ -975,16 +974,6 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
user->idx = log_next_idx;
user->seq = log_next_seq;
break;
- case SEEK_CUR:
- /*
- * It isn't supported due to the record nature of this
- * interface: _SET _DATA and _END point to very specific
- * record positions, while _CUR would be more useful in case
- * of a byte-based log. Because of that, return the default
- * errno value for invalid seek operation.
- */
- ret = -ESPIPE;
- break;
default:
ret = -EINVAL;
}
@@ -2047,18 +2036,7 @@ EXPORT_SYMBOL(vprintk);
int vprintk_default(const char *fmt, va_list args)
{
- int r;
-
-#ifdef CONFIG_KGDB_KDB
- /* Allow to pass printk() to kdb but avoid a recursion. */
- if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
- r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
- return r;
- }
-#endif
- r = vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
-
- return r;
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 4242403316bb..50aeae770434 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -6,6 +6,7 @@
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/debug_locks.h>
+#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/irq_work.h>
@@ -360,6 +361,12 @@ void __printk_safe_exit(void)
__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
{
+#ifdef CONFIG_KGDB_KDB
+ /* Allow to pass printk() to kdb but avoid a recursion. */
+ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
+ return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
+#endif
+
/*
* Try to use the main logbuf even in NMI. But avoid calling console
* drivers that might have their own locks.
diff --git a/kernel/relay.c b/kernel/relay.c
index 204867220f8a..72fe443ea78f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -91,7 +91,7 @@ static void relay_free_page_array(struct page **array)
*
* Returns 0 if ok, negative on error
*
- * Caller should already have grabbed mmap_sem.
+ * Caller should already have grabbed mmap_lock.
*/
static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
{
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 21fb5a5662b5..5fc9c9b70862 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -7,6 +7,12 @@ endif
# that is not a function of syscall inputs. E.g. involuntary context switches.
KCOV_INSTRUMENT := n
+# There are numerous data races here, however, most of them are due to plain accesses.
+# This would make it even harder for syzbot to find reproducers, because these
+# bugs trigger without specific input. Disable by default, but should re-enable
+# eventually.
+KCSAN_SANITIZE := n
+
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8298b2c240ce..8f360326861e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3922,8 +3922,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
&& in_atomic_preempt_off()) {
pr_err("Preemption disabled at:");
- print_ip_sym(preempt_disable_ip);
- pr_cont("\n");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
}
if (panic_on_warn)
panic("scheduling while atomic\n");
@@ -6026,7 +6025,7 @@ void sched_show_task(struct task_struct *p)
(unsigned long)task_thread_info(p)->flags);
print_worker_info(KERN_INFO, p);
- show_stack(p, NULL);
+ show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
EXPORT_SYMBOL_GPL(sched_show_task);
@@ -6871,8 +6870,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
&& !preempt_count_equals(preempt_offset)) {
pr_err("Preemption disabled at:");
- print_ip_sym(preempt_disable_ip);
- pr_cont("\n");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
}
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 35f4cc024dcf..cbcb2f71599b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2770,7 +2770,7 @@ static void task_numa_work(struct callback_head *work)
return;
- if (!down_read_trylock(&mm->mmap_sem))
+ if (!mmap_read_trylock(mm))
return;
vma = find_vma(mm, start);
if (!vma) {
@@ -2838,7 +2838,7 @@ out:
mm->numa_scan_offset = start;
else
reset_ptenuma_scan(p);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
* Make sure tasks use at least 32x as much time to run other code
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 05deb81bb3e3..1ae95b9150d3 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -96,6 +96,15 @@ void __cpuidle default_idle_call(void)
}
}
+static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev)
+{
+ if (current_clr_polling_and_test())
+ return -EBUSY;
+
+ return cpuidle_enter_s2idle(drv, dev);
+}
+
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
int next_state)
{
@@ -171,11 +180,9 @@ static void cpuidle_idle_call(void)
if (idle_should_enter_s2idle()) {
rcu_idle_enter();
- entered_state = cpuidle_enter_s2idle(drv, dev);
- if (entered_state > 0) {
- local_irq_enable();
+ entered_state = call_cpuidle_s2idle(drv, dev);
+ if (entered_state > 0)
goto exit_idle;
- }
rcu_idle_exit();
diff --git a/kernel/scs.c b/kernel/scs.c
index 222a7a9ad543..5d4d9bbdec36 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -74,7 +74,7 @@ static void scs_check_usage(struct task_struct *tsk)
for (p = task_scs(tsk); p < __scs_magic(tsk); ++p) {
if (!READ_ONCE_NOCHECK(*p))
break;
- used++;
+ used += sizeof(*p);
}
while (used > curr) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a47c6dd57452..c4201b7f42b1 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -339,12 +339,11 @@ asmlinkage __visible void do_softirq(void)
local_irq_restore(flags);
}
-/*
- * Enter an interrupt context.
+/**
+ * irq_enter_rcu - Enter an interrupt context with RCU watching
*/
-void irq_enter(void)
+void irq_enter_rcu(void)
{
- rcu_irq_enter();
if (is_idle_task(current) && !in_interrupt()) {
/*
* Prevent raise_softirq from needlessly waking up ksoftirqd
@@ -354,10 +353,18 @@ void irq_enter(void)
tick_irq_enter();
_local_bh_enable();
}
-
__irq_enter();
}
+/**
+ * irq_enter - Enter an interrupt context including RCU update
+ */
+void irq_enter(void)
+{
+ rcu_irq_enter();
+ irq_enter_rcu();
+}
+
static inline void invoke_softirq(void)
{
if (ksoftirqd_running(local_softirq_pending()))
@@ -397,10 +404,7 @@ static inline void tick_irq_exit(void)
#endif
}
-/*
- * Exit an interrupt context. Process softirqs if needed and possible:
- */
-void irq_exit(void)
+static inline void __irq_exit_rcu(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
@@ -413,6 +417,28 @@ void irq_exit(void)
invoke_softirq();
tick_irq_exit();
+}
+
+/**
+ * irq_exit_rcu() - Exit an interrupt context without updating RCU
+ *
+ * Also processes softirqs if needed and possible.
+ */
+void irq_exit_rcu(void)
+{
+ __irq_exit_rcu();
+ /* must be last! */
+ lockdep_hardirq_exit();
+}
+
+/**
+ * irq_exit - Exit an interrupt context, update RCU and lockdep
+ *
+ * Also processes softirqs if needed and possible.
+ */
+void irq_exit(void)
+{
+ __irq_exit_rcu();
rcu_irq_exit();
/* must be last! */
lockdep_hardirq_exit();
diff --git a/kernel/sys.c b/kernel/sys.c
index 891667a49bb7..00a96746e28a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -393,6 +393,10 @@ long __sys_setregid(gid_t rgid, gid_t egid)
new->sgid = new->egid;
new->fsgid = new->egid;
+ retval = security_task_fix_setgid(new, old, LSM_SETID_RE);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -435,6 +439,10 @@ long __sys_setgid(gid_t gid)
else
goto error;
+ retval = security_task_fix_setgid(new, old, LSM_SETID_ID);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -756,6 +764,10 @@ long __sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
new->sgid = ksgid;
new->fsgid = new->egid;
+ retval = security_task_fix_setgid(new, old, LSM_SETID_RES);
+ if (retval < 0)
+ goto error;
+
return commit_creds(new);
error:
@@ -862,7 +874,8 @@ long __sys_setfsgid(gid_t gid)
ns_capable(old->user_ns, CAP_SETGID)) {
if (!gid_eq(kgid, old->fsgid)) {
new->fsgid = kgid;
- goto change_okay;
+ if (security_task_fix_setgid(new,old,LSM_SETID_FS) == 0)
+ goto change_okay;
}
}
@@ -1846,7 +1859,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (exe_file) {
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (!vma->vm_file)
continue;
@@ -1855,7 +1868,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
goto exit_err;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
fput(exe_file);
}
@@ -1869,7 +1882,7 @@ exit:
fdput(exe);
return err;
exit_err:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
fput(exe_file);
goto exit;
}
@@ -2007,10 +2020,10 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
}
/*
- * arg_lock protects concurent updates but we still need mmap_sem for
+ * arg_lock protects concurent updates but we still need mmap_lock for
* read to exclude races with sys_brk.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
/*
* We don't validate if these members are pointing to
@@ -2049,7 +2062,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
if (prctl_map.auxv_size)
memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return 0;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
@@ -2122,10 +2135,10 @@ static int prctl_set_mm(int opt, unsigned long addr,
/*
* arg_lock protects concurent updates of arg boundaries, we need
- * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr
+ * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr
* validation.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, addr);
spin_lock(&mm->arg_lock);
@@ -2217,7 +2230,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
error = 0;
out:
spin_unlock(&mm->arg_lock);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return error;
}
@@ -2442,13 +2455,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_THP_DISABLE:
if (arg3 || arg4 || arg5)
return -EINVAL;
- if (down_write_killable(&me->mm->mmap_sem))
+ if (mmap_write_lock_killable(me->mm))
return -EINTR;
if (arg2)
set_bit(MMF_DISABLE_THP, &me->mm->flags);
else
clear_bit(MMF_DISABLE_THP, &me->mm->flags);
- up_write(&me->mm->mmap_sem);
+ mmap_write_unlock(me->mm);
break;
case PR_MPX_ENABLE_MANAGEMENT:
case PR_MPX_DISABLE_MANAGEMENT:
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7cb09c4cf21c..02441ead3c3b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -928,14 +928,12 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_arch_init(cs);
-#ifdef CONFIG_GENERIC_VDSO_CLOCK_MODE
if (cs->vdso_clock_mode < 0 ||
cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
cs->name, cs->vdso_clock_mode);
cs->vdso_clock_mode = VDSO_CLOCKMODE_NONE;
}
-#endif
/* Initialize mult/shift and max_idle_ns */
__clocksource_update_freq_scale(cs, scale, freq);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9ebaab13339d..d20d489841c8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -953,7 +953,7 @@ EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
* but without the sequence counter protect. This internal function
* is called just when timekeeping lock is already held.
*/
-time64_t __ktime_get_real_seconds(void)
+noinstr time64_t __ktime_get_real_seconds(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 24876faac753..a4020c0b4508 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -249,15 +249,6 @@ config TRACE_PREEMPT_TOGGLE
Enables hooks which will be called when preemption is first disabled,
and last enabled.
-config PREEMPTIRQ_EVENTS
- bool "Enable trace events for preempt and irq disable/enable"
- select TRACE_IRQFLAGS
- select TRACE_PREEMPT_TOGGLE if PREEMPTION
- select GENERIC_TRACER
- default n
- help
- Enable tracing of disable and enable events for preemption and irqs.
-
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
@@ -614,12 +605,30 @@ config TRACING_MAP
generally used outside of that context, and is normally
selected by tracers that use it.
+config SYNTH_EVENTS
+ bool "Synthetic trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ default n
+ help
+ Synthetic events are user-defined trace events that can be
+ used to combine data from other trace events or in fact any
+ data source. Synthetic events can be generated indirectly
+ via the trace() action of histogram triggers or directly
+ by way of an in-kernel API.
+
+ See Documentation/trace/events.rst or
+ Documentation/trace/histogram.rst for details and examples.
+
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
select TRACING_MAP
select TRACING
select DYNAMIC_EVENTS
+ select SYNTH_EVENTS
default n
help
Hist triggers allow one or more arbitrary trace event fields
@@ -815,7 +824,7 @@ config PREEMPTIRQ_DELAY_TEST
config SYNTH_EVENT_GEN_TEST
tristate "Test module for in-kernel synthetic event generation"
- depends on HIST_TRIGGERS
+ depends on SYNTH_EVENTS
help
This option creates a test module to check the base
functionality of in-kernel synthetic event definition and
@@ -838,6 +847,29 @@ config KPROBE_EVENT_GEN_TEST
If unsure, say N.
+config HIST_TRIGGERS_DEBUG
+ bool "Hist trigger debug support"
+ depends on HIST_TRIGGERS
+ help
+ Add "hist_debug" file for each event, which when read will
+ dump out a bunch of internal details about the hist triggers
+ defined on that event.
+
+ The hist_debug file serves a couple of purposes:
+
+ - Helps developers verify that nothing is broken.
+
+ - Provides educational information to support the details
+ of the hist trigger internals as described by
+ Documentation/trace/histogram-design.rst.
+
+ The hist_debug output only covers the data structures
+ related to the histogram definitions themselves and doesn't
+ display the internals of map buckets or variable values of
+ running histograms.
+
+ If unsure, say N.
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f9dcd19165fa..6575bb0a0434 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -6,6 +6,9 @@ ifdef CONFIG_FUNCTION_TRACER
ORIG_CFLAGS := $(KBUILD_CFLAGS)
KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
+# Avoid recursion due to instrumentation.
+KCSAN_SANITIZE := n
+
ifdef CONFIG_FTRACE_SELFTEST
# selftest needs instrumentation
CFLAGS_trace_selftest_dynamic.o = $(CC_FLAGS_FTRACE)
@@ -72,6 +75,7 @@ endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
+obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ea47f2084087..5ef0484513ec 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -3,6 +3,9 @@
* Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
*
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
@@ -344,7 +347,8 @@ static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->blk_trace_mutex));
if (!bt)
return -EINVAL;
@@ -494,6 +498,17 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
*/
strreplace(buts->name, '/', '_');
+ /*
+ * bdev can be NULL, as with scsi-generic, this is a helpful as
+ * we can be.
+ */
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex))) {
+ pr_warn("Concurrent blktraces are not allowed on %s\n",
+ buts->name);
+ return -EBUSY;
+ }
+
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
@@ -543,10 +558,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->pid = buts->pid;
bt->trace_state = Blktrace_setup;
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto err;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
ret = 0;
@@ -885,10 +897,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
}
static void blk_add_trace_bio_complete(void *ignore,
- struct request_queue *q, struct bio *bio,
- int error)
+ struct request_queue *q, struct bio *bio)
{
- blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
+ blk_add_trace_bio(q, bio, BLK_TA_COMPLETE,
+ blk_status_to_errno(bio->bi_status));
}
static void blk_add_trace_bio_backmerge(void *ignore,
@@ -995,8 +1007,10 @@ static void blk_add_trace_split(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector,
bio->bi_iter.bi_size, bio_op(bio), bio->bi_opf,
- BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
- &rpdu, blk_trace_bio_get_cgid(q, bio));
+ BLK_TA_SPLIT,
+ blk_status_to_errno(bio->bi_status),
+ sizeof(rpdu), &rpdu,
+ blk_trace_bio_get_cgid(q, bio));
}
rcu_read_unlock();
}
@@ -1033,7 +1047,8 @@ static void blk_add_trace_bio_remap(void *ignore,
r.sector_from = cpu_to_be64(from);
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
- bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
+ bio_op(bio), bio->bi_opf, BLK_TA_REMAP,
+ blk_status_to_errno(bio->bi_status),
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
@@ -1253,21 +1268,10 @@ static inline __u16 t_error(const struct trace_entry *ent)
static __u64 get_pdu_int(const struct trace_entry *ent, bool has_cg)
{
- const __u64 *val = pdu_start(ent, has_cg);
+ const __be64 *val = pdu_start(ent, has_cg);
return be64_to_cpu(*val);
}
-static void get_pdu_remap(const struct trace_entry *ent,
- struct blk_io_trace_remap *r, bool has_cg)
-{
- const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- __u64 sector_from = __r->sector_from;
-
- r->device_from = be32_to_cpu(__r->device_from);
- r->device_to = be32_to_cpu(__r->device_to);
- r->sector_from = be64_to_cpu(sector_from);
-}
-
typedef void (blk_log_action_t) (struct trace_iterator *iter, const char *act,
bool has_cg);
@@ -1407,13 +1411,13 @@ static void blk_log_with_error(struct trace_seq *s,
static void blk_log_remap(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
{
- struct blk_io_trace_remap r = { .device_from = 0, };
+ const struct blk_io_trace_remap *__r = pdu_start(ent, has_cg);
- get_pdu_remap(ent, &r, has_cg);
trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
t_sector(ent), t_sec(ent),
- MAJOR(r.device_from), MINOR(r.device_from),
- (unsigned long long)r.sector_from);
+ MAJOR(be32_to_cpu(__r->device_from)),
+ MINOR(be32_to_cpu(__r->device_from)),
+ be64_to_cpu(__r->sector_from));
}
static void blk_log_plug(struct trace_seq *s, const struct trace_entry *ent, bool has_cg)
@@ -1637,7 +1641,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
{
struct blk_trace *bt;
- bt = xchg(&q->blk_trace, NULL);
+ bt = rcu_replace_pointer(q->blk_trace, NULL,
+ lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL)
return -EINVAL;
@@ -1669,10 +1674,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
blk_trace_setup_lba(bt, bdev);
- ret = -EBUSY;
- if (cmpxchg(&q->blk_trace, NULL, bt))
- goto free_bt;
-
+ rcu_assign_pointer(q->blk_trace, bt);
get_probe_ref();
return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3744372a24e2..7bc3d6175868 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -136,17 +136,23 @@ static const struct bpf_func_proto bpf_override_return_proto = {
};
#endif
-BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
- const void __user *, unsafe_ptr)
+static __always_inline int
+bpf_probe_read_user_common(void *dst, u32 size, const void __user *unsafe_ptr)
{
- int ret = probe_user_read(dst, unsafe_ptr, size);
+ int ret;
+ ret = copy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
-
return ret;
}
+BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
+{
+ return bpf_probe_read_user_common(dst, size, unsafe_ptr);
+}
+
const struct bpf_func_proto bpf_probe_read_user_proto = {
.func = bpf_probe_read_user,
.gpl_only = true,
@@ -156,17 +162,24 @@ const struct bpf_func_proto bpf_probe_read_user_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
- const void __user *, unsafe_ptr)
+static __always_inline int
+bpf_probe_read_user_str_common(void *dst, u32 size,
+ const void __user *unsafe_ptr)
{
- int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size);
+ int ret;
+ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
-
return ret;
}
+BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
+ const void __user *, unsafe_ptr)
+{
+ return bpf_probe_read_user_str_common(dst, size, unsafe_ptr);
+}
+
const struct bpf_func_proto bpf_probe_read_user_str_proto = {
.func = bpf_probe_read_user_str,
.gpl_only = true,
@@ -177,25 +190,25 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto = {
};
static __always_inline int
-bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr,
- const bool compat)
+bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr)
{
int ret = security_locked_down(LOCKDOWN_BPF_READ);
if (unlikely(ret < 0))
- goto out;
- ret = compat ? probe_kernel_read(dst, unsafe_ptr, size) :
- probe_kernel_read_strict(dst, unsafe_ptr, size);
+ goto fail;
+ ret = copy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
-out:
- memset(dst, 0, size);
+ goto fail;
+ return ret;
+fail:
+ memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false);
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
}
const struct bpf_func_proto bpf_probe_read_kernel_proto = {
@@ -207,50 +220,37 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
- const void *, unsafe_ptr)
-{
- return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true);
-}
-
-static const struct bpf_func_proto bpf_probe_read_compat_proto = {
- .func = bpf_probe_read_compat,
- .gpl_only = true,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_UNINIT_MEM,
- .arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_ANYTHING,
-};
-
static __always_inline int
-bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr,
- const bool compat)
+bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr)
{
int ret = security_locked_down(LOCKDOWN_BPF_READ);
if (unlikely(ret < 0))
- goto out;
+ goto fail;
+
/*
- * The strncpy_from_unsafe_*() call will likely not fill the entire
- * buffer, but that's okay in this circumstance as we're probing
+ * The strncpy_from_kernel_nofault() call will likely not fill the
+ * entire buffer, but that's okay in this circumstance as we're probing
* arbitrary memory anyway similar to bpf_probe_read_*() and might
* as well probe the stack. Thus, memory is explicitly cleared
* only in error case, so that improper users ignoring return
* code altogether don't copy garbage; otherwise length of string
* is returned that can be used for bpf_perf_event_output() et al.
*/
- ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) :
- strncpy_from_unsafe_strict(dst, unsafe_ptr, size);
+ ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
-out:
- memset(dst, 0, size);
+ goto fail;
+
+ return ret;
+fail:
+ memset(dst, 0, size);
return ret;
}
BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false);
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
@@ -262,10 +262,34 @@ const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
.arg3_type = ARG_ANYTHING,
};
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
+ const void *, unsafe_ptr)
+{
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ return bpf_probe_read_user_common(dst, size,
+ (__force void __user *)unsafe_ptr);
+ }
+ return bpf_probe_read_kernel_common(dst, size, unsafe_ptr);
+}
+
+static const struct bpf_func_proto bpf_probe_read_compat_proto = {
+ .func = bpf_probe_read_compat,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+};
+
BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size,
const void *, unsafe_ptr)
{
- return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true);
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ return bpf_probe_read_user_str_common(dst, size,
+ (__force void __user *)unsafe_ptr);
+ }
+ return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr);
}
static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
@@ -276,6 +300,7 @@ static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
};
+#endif /* CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE */
BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
u32, size)
@@ -301,7 +326,7 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
- return probe_user_write(unsafe_ptr, src, size);
+ return copy_to_user_nofault(unsafe_ptr, src, size);
}
static const struct bpf_func_proto bpf_probe_write_user_proto = {
@@ -324,6 +349,31 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
return &bpf_probe_write_user_proto;
}
+static void bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype,
+ size_t bufsz)
+{
+ void __user *user_ptr = (__force void __user *)unsafe_ptr;
+
+ buf[0] = 0;
+
+ switch (fmt_ptype) {
+ case 's':
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)unsafe_ptr < TASK_SIZE) {
+ strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ break;
+ }
+ fallthrough;
+#endif
+ case 'k':
+ strncpy_from_kernel_nofault(buf, unsafe_ptr, bufsz);
+ break;
+ case 'u':
+ strncpy_from_user_nofault(buf, user_ptr, bufsz);
+ break;
+ }
+}
+
/*
* Only limited trace_printk() conversion specifiers allowed:
* %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s
@@ -406,24 +456,8 @@ fmt_str:
break;
}
- buf[0] = 0;
- switch (fmt_ptype) {
- case 's':
-#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
- strncpy_from_unsafe(buf, unsafe_ptr,
- sizeof(buf));
- break;
-#endif
- case 'k':
- strncpy_from_unsafe_strict(buf, unsafe_ptr,
- sizeof(buf));
- break;
- case 'u':
- strncpy_from_unsafe_user(buf,
- (__force void __user *)unsafe_ptr,
- sizeof(buf));
- break;
- }
+ bpf_trace_copy_string(buf, unsafe_ptr, fmt_ptype,
+ sizeof(buf));
goto fmt_next;
}
@@ -579,15 +613,17 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
}
if (fmt[i] == 's') {
+ void *unsafe_ptr;
+
/* try our best to copy */
if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
err = -E2BIG;
goto out;
}
- err = strncpy_from_unsafe_strict(bufs->buf[memcpy_cnt],
- (void *) (long) args[fmt_cnt],
- MAX_SEQ_PRINTF_STR_LEN);
+ unsafe_ptr = (void *)(long)args[fmt_cnt];
+ err = strncpy_from_kernel_nofault(bufs->buf[memcpy_cnt],
+ unsafe_ptr, MAX_SEQ_PRINTF_STR_LEN);
if (err < 0)
bufs->buf[memcpy_cnt][0] = '\0';
params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
@@ -625,7 +661,7 @@ BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
copy_size = (fmt[i + 2] == '4') ? 4 : 16;
- err = probe_kernel_read(bufs->buf[memcpy_cnt],
+ err = copy_from_kernel_nofault(bufs->buf[memcpy_cnt],
(void *) (long) args[fmt_cnt],
copy_size);
if (err < 0)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b5765aeea698..1903b80db6eb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2016,16 +2016,16 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
{
unsigned long ip = rec ? rec->ip : 0;
+ pr_info("------------[ ftrace bug ]------------\n");
+
switch (failed) {
case -EFAULT:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace faulted on modifying ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
break;
case -EINVAL:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace failed to modify ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
print_ip_ins(" actual: ", (unsigned char *)ip);
pr_cont("\n");
if (ftrace_expected) {
@@ -2034,14 +2034,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
}
break;
case -EPERM:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace faulted on writing ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
break;
default:
- FTRACE_WARN_ON_ONCE(1);
pr_info("ftrace faulted on unknown error ");
- print_ip_sym(ip);
+ print_ip_sym(KERN_INFO, ip);
}
print_bug_type();
if (rec) {
@@ -2066,6 +2064,8 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec)
ip = ftrace_get_addr_curr(rec);
pr_cont("\n expected tramp: %lx\n", ip);
}
+
+ FTRACE_WARN_ON_ONCE(1);
}
static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
@@ -2260,7 +2260,7 @@ ftrace_find_tramp_ops_next(struct dyn_ftrace *rec,
if (hash_contains_ip(ip, op->func_hash))
return op;
- }
+ }
return NULL;
}
@@ -3599,7 +3599,7 @@ static int t_show(struct seq_file *m, void *v)
if (direct)
seq_printf(m, "\n\tdirect-->%pS", (void *)direct);
}
- }
+ }
seq_putc(m, '\n');
@@ -7151,6 +7151,10 @@ static int pid_open(struct inode *inode, struct file *file, int type)
case TRACE_NO_PIDS:
seq_ops = &ftrace_no_pid_sops;
break;
+ default:
+ trace_array_put(tr);
+ WARN_ON_ONCE(1);
+ return -EINVAL;
}
ret = seq_open(file, seq_ops);
@@ -7229,6 +7233,10 @@ pid_write(struct file *filp, const char __user *ubuf,
other_pids = rcu_dereference_protected(tr->function_pids,
lockdep_is_held(&ftrace_lock));
break;
+ default:
+ ret = -EINVAL;
+ WARN_ON_ONCE(1);
+ goto out;
}
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b8e1ca48be50..00867ff82412 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2427,7 +2427,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
if (unlikely(info->add_timestamp)) {
bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
- event = rb_add_time_stamp(event, info->delta, abs);
+ event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
length -= RB_LEN_TIME_EXTEND;
delta = 0;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3ab27022c20f..bb62269724d5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1299,8 +1299,11 @@ EXPORT_SYMBOL_GPL(tracing_off);
void disable_trace_on_warning(void)
{
- if (__disable_trace_on_warning)
+ if (__disable_trace_on_warning) {
+ trace_array_printk_buf(global_trace.array_buffer.buffer, _THIS_IP_,
+ "Disabling tracing due to warning\n");
tracing_off();
+ }
}
/**
@@ -3567,7 +3570,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
- struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter;
unsigned long entries = 0;
u64 ts;
@@ -3585,7 +3587,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
* that a reset never took place on a cpu. This is evident
* by the timestamp being before the start of the buffer.
*/
- while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
+ while (ring_buffer_iter_peek(buf_iter, &ts)) {
if (ts >= iter->array_buffer->time_start)
break;
entries++;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4eb1d004d5f2..13db4000af3f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -61,6 +61,9 @@ enum trace_type {
#undef __field_desc
#define __field_desc(type, container, item)
+#undef __field_packed
+#define __field_packed(type, container, item)
+
#undef __array
#define __array(type, item, size) type item[size];
@@ -1661,6 +1664,7 @@ extern struct list_head ftrace_events;
extern const struct file_operations event_trigger_fops;
extern const struct file_operations event_hist_fops;
+extern const struct file_operations event_hist_debug_fops;
extern const struct file_operations event_inject_fops;
#ifdef CONFIG_HIST_TRIGGERS
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 9de29bb45a27..fa0fc08c6ef8 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -101,12 +101,16 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
kprobe_event_cmd_init(&cmd, buf, MAX_BUF_LEN);
ret = kprobe_event_gen_cmd_start(&cmd, event, val);
- if (ret)
+ if (ret) {
+ pr_err("Failed to generate probe: %s\n", buf);
break;
+ }
ret = kprobe_event_gen_cmd_end(&cmd);
- if (ret)
+ if (ret) {
pr_err("Failed to add probe: %s\n", buf);
+ break;
+ }
}
return ret;
@@ -120,7 +124,7 @@ trace_boot_add_kprobe_event(struct xbc_node *node, const char *event)
}
#endif
-#ifdef CONFIG_HIST_TRIGGERS
+#ifdef CONFIG_SYNTH_EVENTS
static int __init
trace_boot_add_synth_event(struct xbc_node *node, const char *event)
{
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index a523da0dae0a..18c4a58aff79 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -78,8 +78,8 @@ FTRACE_ENTRY_PACKED(funcgraph_entry, ftrace_graph_ent_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ent, graph_ent )
- __field_desc( unsigned long, graph_ent, func )
- __field_desc( int, graph_ent, depth )
+ __field_packed( unsigned long, graph_ent, func )
+ __field_packed( int, graph_ent, depth )
),
F_printk("--> %ps (%d)", (void *)__entry->func, __entry->depth)
@@ -92,11 +92,11 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
F_STRUCT(
__field_struct( struct ftrace_graph_ret, ret )
- __field_desc( unsigned long, ret, func )
- __field_desc( unsigned long, ret, overrun )
- __field_desc( unsigned long long, ret, calltime)
- __field_desc( unsigned long long, ret, rettime )
- __field_desc( int, ret, depth )
+ __field_packed( unsigned long, ret, func )
+ __field_packed( unsigned long, ret, overrun )
+ __field_packed( unsigned long long, ret, calltime)
+ __field_packed( unsigned long long, ret, rettime )
+ __field_packed( int, ret, depth )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 242f59e7f17d..f6f55682d3e2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2209,6 +2209,10 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file)
trace_create_file("hist", 0444, file->dir, file,
&event_hist_fops);
#endif
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+ trace_create_file("hist_debug", 0444, file->dir, file,
+ &event_hist_debug_fops);
+#endif
trace_create_file("format", 0444, file->dir, call,
&ftrace_event_format_fops);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index fcab11cc6833..0b933546142e 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -19,13 +19,7 @@
#include <trace/events/mmflags.h>
#include "tracing_map.h"
-#include "trace.h"
-#include "trace_dynevent.h"
-
-#define SYNTH_SYSTEM "synthetic"
-#define SYNTH_FIELDS_MAX 32
-
-#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
+#include "trace_synth.h"
#define ERRORS \
C(NONE, "No error"), \
@@ -380,69 +374,6 @@ struct hist_trigger_data {
unsigned int n_save_var_str;
};
-static int create_synth_event(int argc, const char **argv);
-static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
-static int synth_event_release(struct dyn_event *ev);
-static bool synth_event_is_busy(struct dyn_event *ev);
-static bool synth_event_match(const char *system, const char *event,
- int argc, const char **argv, struct dyn_event *ev);
-
-static struct dyn_event_operations synth_event_ops = {
- .create = create_synth_event,
- .show = synth_event_show,
- .is_busy = synth_event_is_busy,
- .free = synth_event_release,
- .match = synth_event_match,
-};
-
-struct synth_field {
- char *type;
- char *name;
- size_t size;
- unsigned int offset;
- bool is_signed;
- bool is_string;
-};
-
-struct synth_event {
- struct dyn_event devent;
- int ref;
- char *name;
- struct synth_field **fields;
- unsigned int n_fields;
- unsigned int n_u64;
- struct trace_event_class class;
- struct trace_event_call call;
- struct tracepoint *tp;
- struct module *mod;
-};
-
-static bool is_synth_event(struct dyn_event *ev)
-{
- return ev->ops == &synth_event_ops;
-}
-
-static struct synth_event *to_synth_event(struct dyn_event *ev)
-{
- return container_of(ev, struct synth_event, devent);
-}
-
-static bool synth_event_is_busy(struct dyn_event *ev)
-{
- struct synth_event *event = to_synth_event(ev);
-
- return event->ref != 0;
-}
-
-static bool synth_event_match(const char *system, const char *event,
- int argc, const char **argv, struct dyn_event *ev)
-{
- struct synth_event *sev = to_synth_event(ev);
-
- return strcmp(sev->name, event) == 0 &&
- (!system || strcmp(system, SYNTH_SYSTEM) == 0);
-}
-
struct action_data;
typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
@@ -589,6 +520,7 @@ static struct track_data *track_data_alloc(unsigned int key_len,
track_data_free(data);
return ERR_PTR(-ENOMEM);
}
+
data->elt.private_data = elt_data;
elt_data->comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL);
@@ -621,7 +553,6 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
if (file) {
call = file->event_call;
-
system = call->class->system;
if (system) {
name = trace_event_name(call);
@@ -646,510 +577,6 @@ static void hist_err_clear(void)
last_cmd_loc[0] = '\0';
}
-struct synth_trace_event {
- struct trace_entry ent;
- u64 fields[];
-};
-
-static int synth_event_define_fields(struct trace_event_call *call)
-{
- struct synth_trace_event trace;
- int offset = offsetof(typeof(trace), fields);
- struct synth_event *event = call->data;
- unsigned int i, size, n_u64;
- char *name, *type;
- bool is_signed;
- int ret = 0;
-
- for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
- size = event->fields[i]->size;
- is_signed = event->fields[i]->is_signed;
- type = event->fields[i]->type;
- name = event->fields[i]->name;
- ret = trace_define_field(call, type, name, offset, size,
- is_signed, FILTER_OTHER);
- if (ret)
- break;
-
- event->fields[i]->offset = n_u64;
-
- if (event->fields[i]->is_string) {
- offset += STR_VAR_LEN_MAX;
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- offset += sizeof(u64);
- n_u64++;
- }
- }
-
- event->n_u64 = n_u64;
-
- return ret;
-}
-
-static bool synth_field_signed(char *type)
-{
- if (str_has_prefix(type, "u"))
- return false;
- if (strcmp(type, "gfp_t") == 0)
- return false;
-
- return true;
-}
-
-static int synth_field_is_string(char *type)
-{
- if (strstr(type, "char[") != NULL)
- return true;
-
- return false;
-}
-
-static int synth_field_string_size(char *type)
-{
- char buf[4], *end, *start;
- unsigned int len;
- int size, err;
-
- start = strstr(type, "char[");
- if (start == NULL)
- return -EINVAL;
- start += sizeof("char[") - 1;
-
- end = strchr(type, ']');
- if (!end || end < start)
- return -EINVAL;
-
- len = end - start;
- if (len > 3)
- return -EINVAL;
-
- strncpy(buf, start, len);
- buf[len] = '\0';
-
- err = kstrtouint(buf, 0, &size);
- if (err)
- return err;
-
- if (size > STR_VAR_LEN_MAX)
- return -EINVAL;
-
- return size;
-}
-
-static int synth_field_size(char *type)
-{
- int size = 0;
-
- if (strcmp(type, "s64") == 0)
- size = sizeof(s64);
- else if (strcmp(type, "u64") == 0)
- size = sizeof(u64);
- else if (strcmp(type, "s32") == 0)
- size = sizeof(s32);
- else if (strcmp(type, "u32") == 0)
- size = sizeof(u32);
- else if (strcmp(type, "s16") == 0)
- size = sizeof(s16);
- else if (strcmp(type, "u16") == 0)
- size = sizeof(u16);
- else if (strcmp(type, "s8") == 0)
- size = sizeof(s8);
- else if (strcmp(type, "u8") == 0)
- size = sizeof(u8);
- else if (strcmp(type, "char") == 0)
- size = sizeof(char);
- else if (strcmp(type, "unsigned char") == 0)
- size = sizeof(unsigned char);
- else if (strcmp(type, "int") == 0)
- size = sizeof(int);
- else if (strcmp(type, "unsigned int") == 0)
- size = sizeof(unsigned int);
- else if (strcmp(type, "long") == 0)
- size = sizeof(long);
- else if (strcmp(type, "unsigned long") == 0)
- size = sizeof(unsigned long);
- else if (strcmp(type, "pid_t") == 0)
- size = sizeof(pid_t);
- else if (strcmp(type, "gfp_t") == 0)
- size = sizeof(gfp_t);
- else if (synth_field_is_string(type))
- size = synth_field_string_size(type);
-
- return size;
-}
-
-static const char *synth_field_fmt(char *type)
-{
- const char *fmt = "%llu";
-
- if (strcmp(type, "s64") == 0)
- fmt = "%lld";
- else if (strcmp(type, "u64") == 0)
- fmt = "%llu";
- else if (strcmp(type, "s32") == 0)
- fmt = "%d";
- else if (strcmp(type, "u32") == 0)
- fmt = "%u";
- else if (strcmp(type, "s16") == 0)
- fmt = "%d";
- else if (strcmp(type, "u16") == 0)
- fmt = "%u";
- else if (strcmp(type, "s8") == 0)
- fmt = "%d";
- else if (strcmp(type, "u8") == 0)
- fmt = "%u";
- else if (strcmp(type, "char") == 0)
- fmt = "%d";
- else if (strcmp(type, "unsigned char") == 0)
- fmt = "%u";
- else if (strcmp(type, "int") == 0)
- fmt = "%d";
- else if (strcmp(type, "unsigned int") == 0)
- fmt = "%u";
- else if (strcmp(type, "long") == 0)
- fmt = "%ld";
- else if (strcmp(type, "unsigned long") == 0)
- fmt = "%lu";
- else if (strcmp(type, "pid_t") == 0)
- fmt = "%d";
- else if (strcmp(type, "gfp_t") == 0)
- fmt = "%x";
- else if (synth_field_is_string(type))
- fmt = "%s";
-
- return fmt;
-}
-
-static void print_synth_event_num_val(struct trace_seq *s,
- char *print_fmt, char *name,
- int size, u64 val, char *space)
-{
- switch (size) {
- case 1:
- trace_seq_printf(s, print_fmt, name, (u8)val, space);
- break;
-
- case 2:
- trace_seq_printf(s, print_fmt, name, (u16)val, space);
- break;
-
- case 4:
- trace_seq_printf(s, print_fmt, name, (u32)val, space);
- break;
-
- default:
- trace_seq_printf(s, print_fmt, name, val, space);
- break;
- }
-}
-
-static enum print_line_t print_synth_event(struct trace_iterator *iter,
- int flags,
- struct trace_event *event)
-{
- struct trace_array *tr = iter->tr;
- struct trace_seq *s = &iter->seq;
- struct synth_trace_event *entry;
- struct synth_event *se;
- unsigned int i, n_u64;
- char print_fmt[32];
- const char *fmt;
-
- entry = (struct synth_trace_event *)iter->ent;
- se = container_of(event, struct synth_event, call.event);
-
- trace_seq_printf(s, "%s: ", se->name);
-
- for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
- if (trace_seq_has_overflowed(s))
- goto end;
-
- fmt = synth_field_fmt(se->fields[i]->type);
-
- /* parameter types */
- if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
- trace_seq_printf(s, "%s ", fmt);
-
- snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
-
- /* parameter values */
- if (se->fields[i]->is_string) {
- trace_seq_printf(s, print_fmt, se->fields[i]->name,
- (char *)&entry->fields[n_u64],
- i == se->n_fields - 1 ? "" : " ");
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct trace_print_flags __flags[] = {
- __def_gfpflag_names, {-1, NULL} };
- char *space = (i == se->n_fields - 1 ? "" : " ");
-
- print_synth_event_num_val(s, print_fmt,
- se->fields[i]->name,
- se->fields[i]->size,
- entry->fields[n_u64],
- space);
-
- if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
- trace_seq_puts(s, " (");
- trace_print_flags_seq(s, "|",
- entry->fields[n_u64],
- __flags);
- trace_seq_putc(s, ')');
- }
- n_u64++;
- }
- }
-end:
- trace_seq_putc(s, '\n');
-
- return trace_handle_return(s);
-}
-
-static struct trace_event_functions synth_event_funcs = {
- .trace = print_synth_event
-};
-
-static notrace void trace_event_raw_event_synth(void *__data,
- u64 *var_ref_vals,
- unsigned int *var_ref_idx)
-{
- struct trace_event_file *trace_file = __data;
- struct synth_trace_event *entry;
- struct trace_event_buffer fbuffer;
- struct trace_buffer *buffer;
- struct synth_event *event;
- unsigned int i, n_u64, val_idx;
- int fields_size = 0;
-
- event = trace_file->event_call->data;
-
- if (trace_trigger_soft_disabled(trace_file))
- return;
-
- fields_size = event->n_u64 * sizeof(u64);
-
- /*
- * Avoid ring buffer recursion detection, as this event
- * is being performed within another event.
- */
- buffer = trace_file->tr->array_buffer.buffer;
- ring_buffer_nest_start(buffer);
-
- entry = trace_event_buffer_reserve(&fbuffer, trace_file,
- sizeof(*entry) + fields_size);
- if (!entry)
- goto out;
-
- for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
- val_idx = var_ref_idx[i];
- if (event->fields[i]->is_string) {
- char *str_val = (char *)(long)var_ref_vals[val_idx];
- char *str_field = (char *)&entry->fields[n_u64];
-
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct synth_field *field = event->fields[i];
- u64 val = var_ref_vals[val_idx];
-
- switch (field->size) {
- case 1:
- *(u8 *)&entry->fields[n_u64] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&entry->fields[n_u64] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&entry->fields[n_u64] = (u32)val;
- break;
-
- default:
- entry->fields[n_u64] = val;
- break;
- }
- n_u64++;
- }
- }
-
- trace_event_buffer_commit(&fbuffer);
-out:
- ring_buffer_nest_end(buffer);
-}
-
-static void free_synth_event_print_fmt(struct trace_event_call *call)
-{
- if (call) {
- kfree(call->print_fmt);
- call->print_fmt = NULL;
- }
-}
-
-static int __set_synth_event_print_fmt(struct synth_event *event,
- char *buf, int len)
-{
- const char *fmt;
- int pos = 0;
- int i;
-
- /* When len=0, we just calculate the needed length */
-#define LEN_OR_ZERO (len ? len - pos : 0)
-
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
- for (i = 0; i < event->n_fields; i++) {
- fmt = synth_field_fmt(event->fields[i]->type);
- pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
- event->fields[i]->name, fmt,
- i == event->n_fields - 1 ? "" : ", ");
- }
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
-
- for (i = 0; i < event->n_fields; i++) {
- pos += snprintf(buf + pos, LEN_OR_ZERO,
- ", REC->%s", event->fields[i]->name);
- }
-
-#undef LEN_OR_ZERO
-
- /* return the length of print_fmt */
- return pos;
-}
-
-static int set_synth_event_print_fmt(struct trace_event_call *call)
-{
- struct synth_event *event = call->data;
- char *print_fmt;
- int len;
-
- /* First: called with 0 length to calculate the needed length */
- len = __set_synth_event_print_fmt(event, NULL, 0);
-
- print_fmt = kmalloc(len + 1, GFP_KERNEL);
- if (!print_fmt)
- return -ENOMEM;
-
- /* Second: actually write the @print_fmt */
- __set_synth_event_print_fmt(event, print_fmt, len + 1);
- call->print_fmt = print_fmt;
-
- return 0;
-}
-
-static void free_synth_field(struct synth_field *field)
-{
- kfree(field->type);
- kfree(field->name);
- kfree(field);
-}
-
-static struct synth_field *parse_synth_field(int argc, const char **argv,
- int *consumed)
-{
- struct synth_field *field;
- const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
- int len, ret = 0;
-
- if (field_type[0] == ';')
- field_type++;
-
- if (!strcmp(field_type, "unsigned")) {
- if (argc < 3)
- return ERR_PTR(-EINVAL);
- prefix = "unsigned ";
- field_type = argv[1];
- field_name = argv[2];
- *consumed = 3;
- } else {
- field_name = argv[1];
- *consumed = 2;
- }
-
- field = kzalloc(sizeof(*field), GFP_KERNEL);
- if (!field)
- return ERR_PTR(-ENOMEM);
-
- len = strlen(field_name);
- array = strchr(field_name, '[');
- if (array)
- len -= strlen(array);
- else if (field_name[len - 1] == ';')
- len--;
-
- field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
- if (!field->name) {
- ret = -ENOMEM;
- goto free;
- }
-
- if (field_type[0] == ';')
- field_type++;
- len = strlen(field_type) + 1;
- if (array)
- len += strlen(array);
- if (prefix)
- len += strlen(prefix);
-
- field->type = kzalloc(len, GFP_KERNEL);
- if (!field->type) {
- ret = -ENOMEM;
- goto free;
- }
- if (prefix)
- strcat(field->type, prefix);
- strcat(field->type, field_type);
- if (array) {
- strcat(field->type, array);
- if (field->type[len - 1] == ';')
- field->type[len - 1] = '\0';
- }
-
- field->size = synth_field_size(field->type);
- if (!field->size) {
- ret = -EINVAL;
- goto free;
- }
-
- if (synth_field_is_string(field->type))
- field->is_string = true;
-
- field->is_signed = synth_field_signed(field->type);
-
- out:
- return field;
- free:
- free_synth_field(field);
- field = ERR_PTR(ret);
- goto out;
-}
-
-static void free_synth_tracepoint(struct tracepoint *tp)
-{
- if (!tp)
- return;
-
- kfree(tp->name);
- kfree(tp);
-}
-
-static struct tracepoint *alloc_synth_tracepoint(char *name)
-{
- struct tracepoint *tp;
-
- tp = kzalloc(sizeof(*tp), GFP_KERNEL);
- if (!tp)
- return ERR_PTR(-ENOMEM);
-
- tp->name = kstrdup(name, GFP_KERNEL);
- if (!tp->name) {
- kfree(tp);
- return ERR_PTR(-ENOMEM);
- }
-
- return tp;
-}
-
typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
unsigned int *var_ref_idx);
@@ -1177,145 +604,6 @@ static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
}
}
-static struct synth_event *find_synth_event(const char *name)
-{
- struct dyn_event *pos;
- struct synth_event *event;
-
- for_each_dyn_event(pos) {
- if (!is_synth_event(pos))
- continue;
- event = to_synth_event(pos);
- if (strcmp(event->name, name) == 0)
- return event;
- }
-
- return NULL;
-}
-
-static struct trace_event_fields synth_event_fields_array[] = {
- { .type = TRACE_FUNCTION_TYPE,
- .define_fields = synth_event_define_fields },
- {}
-};
-
-static int register_synth_event(struct synth_event *event)
-{
- struct trace_event_call *call = &event->call;
- int ret = 0;
-
- event->call.class = &event->class;
- event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
- if (!event->class.system) {
- ret = -ENOMEM;
- goto out;
- }
-
- event->tp = alloc_synth_tracepoint(event->name);
- if (IS_ERR(event->tp)) {
- ret = PTR_ERR(event->tp);
- event->tp = NULL;
- goto out;
- }
-
- INIT_LIST_HEAD(&call->class->fields);
- call->event.funcs = &synth_event_funcs;
- call->class->fields_array = synth_event_fields_array;
-
- ret = register_trace_event(&call->event);
- if (!ret) {
- ret = -ENODEV;
- goto out;
- }
- call->flags = TRACE_EVENT_FL_TRACEPOINT;
- call->class->reg = trace_event_reg;
- call->class->probe = trace_event_raw_event_synth;
- call->data = event;
- call->tp = event->tp;
-
- ret = trace_add_event_call(call);
- if (ret) {
- pr_warn("Failed to register synthetic event: %s\n",
- trace_event_name(call));
- goto err;
- }
-
- ret = set_synth_event_print_fmt(call);
- if (ret < 0) {
- trace_remove_event_call(call);
- goto err;
- }
- out:
- return ret;
- err:
- unregister_trace_event(&call->event);
- goto out;
-}
-
-static int unregister_synth_event(struct synth_event *event)
-{
- struct trace_event_call *call = &event->call;
- int ret;
-
- ret = trace_remove_event_call(call);
-
- return ret;
-}
-
-static void free_synth_event(struct synth_event *event)
-{
- unsigned int i;
-
- if (!event)
- return;
-
- for (i = 0; i < event->n_fields; i++)
- free_synth_field(event->fields[i]);
-
- kfree(event->fields);
- kfree(event->name);
- kfree(event->class.system);
- free_synth_tracepoint(event->tp);
- free_synth_event_print_fmt(&event->call);
- kfree(event);
-}
-
-static struct synth_event *alloc_synth_event(const char *name, int n_fields,
- struct synth_field **fields)
-{
- struct synth_event *event;
- unsigned int i;
-
- event = kzalloc(sizeof(*event), GFP_KERNEL);
- if (!event) {
- event = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- event->name = kstrdup(name, GFP_KERNEL);
- if (!event->name) {
- kfree(event);
- event = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
- if (!event->fields) {
- free_synth_event(event);
- event = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- dyn_event_init(&event->devent, &synth_event_ops);
-
- for (i = 0; i < n_fields; i++)
- event->fields[i] = fields[i];
-
- event->n_fields = n_fields;
- out:
- return event;
-}
-
static void action_trace(struct hist_trigger_data *hist_data,
struct tracing_map_elt *elt, void *rec,
struct ring_buffer_event *rbe, void *key,
@@ -1331,1056 +619,6 @@ struct hist_var_data {
struct hist_trigger_data *hist_data;
};
-static int synth_event_check_arg_fn(void *data)
-{
- struct dynevent_arg_pair *arg_pair = data;
- int size;
-
- size = synth_field_size((char *)arg_pair->lhs);
-
- return size ? 0 : -EINVAL;
-}
-
-/**
- * synth_event_add_field - Add a new field to a synthetic event cmd
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @type: The type of the new field to add
- * @name: The name of the new field to add
- *
- * Add a new field to a synthetic event cmd object. Field ordering is in
- * the same order the fields are added.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_add_field(struct dynevent_cmd *cmd, const char *type,
- const char *name)
-{
- struct dynevent_arg_pair arg_pair;
- int ret;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- if (!type || !name)
- return -EINVAL;
-
- dynevent_arg_pair_init(&arg_pair, 0, ';');
-
- arg_pair.lhs = type;
- arg_pair.rhs = name;
-
- ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn);
- if (ret)
- return ret;
-
- if (++cmd->n_fields > SYNTH_FIELDS_MAX)
- ret = -EINVAL;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_add_field);
-
-/**
- * synth_event_add_field_str - Add a new field to a synthetic event cmd
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @type_name: The type and name of the new field to add, as a single string
- *
- * Add a new field to a synthetic event cmd object, as a single
- * string. The @type_name string is expected to be of the form 'type
- * name', which will be appended by ';'. No sanity checking is done -
- * what's passed in is assumed to already be well-formed. Field
- * ordering is in the same order the fields are added.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name)
-{
- struct dynevent_arg arg;
- int ret;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- if (!type_name)
- return -EINVAL;
-
- dynevent_arg_init(&arg, ';');
-
- arg.str = type_name;
-
- ret = dynevent_arg_add(cmd, &arg, NULL);
- if (ret)
- return ret;
-
- if (++cmd->n_fields > SYNTH_FIELDS_MAX)
- ret = -EINVAL;
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_add_field_str);
-
-/**
- * synth_event_add_fields - Add multiple fields to a synthetic event cmd
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @fields: An array of type/name field descriptions
- * @n_fields: The number of field descriptions contained in the fields array
- *
- * Add a new set of fields to a synthetic event cmd object. The event
- * fields that will be defined for the event should be passed in as an
- * array of struct synth_field_desc, and the number of elements in the
- * array passed in as n_fields. Field ordering will retain the
- * ordering given in the fields array.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_add_fields(struct dynevent_cmd *cmd,
- struct synth_field_desc *fields,
- unsigned int n_fields)
-{
- unsigned int i;
- int ret = 0;
-
- for (i = 0; i < n_fields; i++) {
- if (fields[i].type == NULL || fields[i].name == NULL) {
- ret = -EINVAL;
- break;
- }
-
- ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
- if (ret)
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_add_fields);
-
-/**
- * __synth_event_gen_cmd_start - Start a synthetic event command from arg list
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @name: The name of the synthetic event
- * @mod: The module creating the event, NULL if not created from a module
- * @args: Variable number of arg (pairs), one pair for each field
- *
- * NOTE: Users normally won't want to call this function directly, but
- * rather use the synth_event_gen_cmd_start() wrapper, which
- * automatically adds a NULL to the end of the arg list. If this
- * function is used directly, make sure the last arg in the variable
- * arg list is NULL.
- *
- * Generate a synthetic event command to be executed by
- * synth_event_gen_cmd_end(). This function can be used to generate
- * the complete command or only the first part of it; in the latter
- * case, synth_event_add_field(), synth_event_add_field_str(), or
- * synth_event_add_fields() can be used to add more fields following
- * this.
- *
- * There should be an even number variable args, each pair consisting
- * of a type followed by a field name.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name,
- struct module *mod, ...)
-{
- struct dynevent_arg arg;
- va_list args;
- int ret;
-
- cmd->event_name = name;
- cmd->private_data = mod;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- dynevent_arg_init(&arg, 0);
- arg.str = name;
- ret = dynevent_arg_add(cmd, &arg, NULL);
- if (ret)
- return ret;
-
- va_start(args, mod);
- for (;;) {
- const char *type, *name;
-
- type = va_arg(args, const char *);
- if (!type)
- break;
- name = va_arg(args, const char *);
- if (!name)
- break;
-
- if (++cmd->n_fields > SYNTH_FIELDS_MAX) {
- ret = -EINVAL;
- break;
- }
-
- ret = synth_event_add_field(cmd, type, name);
- if (ret)
- break;
- }
- va_end(args);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
-
-/**
- * synth_event_gen_cmd_array_start - Start synthetic event command from an array
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @name: The name of the synthetic event
- * @fields: An array of type/name field descriptions
- * @n_fields: The number of field descriptions contained in the fields array
- *
- * Generate a synthetic event command to be executed by
- * synth_event_gen_cmd_end(). This function can be used to generate
- * the complete command or only the first part of it; in the latter
- * case, synth_event_add_field(), synth_event_add_field_str(), or
- * synth_event_add_fields() can be used to add more fields following
- * this.
- *
- * The event fields that will be defined for the event should be
- * passed in as an array of struct synth_field_desc, and the number of
- * elements in the array passed in as n_fields. Field ordering will
- * retain the ordering given in the fields array.
- *
- * See synth_field_size() for available types. If field_name contains
- * [n] the field is considered to be an array.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
- struct module *mod,
- struct synth_field_desc *fields,
- unsigned int n_fields)
-{
- struct dynevent_arg arg;
- unsigned int i;
- int ret = 0;
-
- cmd->event_name = name;
- cmd->private_data = mod;
-
- if (cmd->type != DYNEVENT_TYPE_SYNTH)
- return -EINVAL;
-
- if (n_fields > SYNTH_FIELDS_MAX)
- return -EINVAL;
-
- dynevent_arg_init(&arg, 0);
- arg.str = name;
- ret = dynevent_arg_add(cmd, &arg, NULL);
- if (ret)
- return ret;
-
- for (i = 0; i < n_fields; i++) {
- if (fields[i].type == NULL || fields[i].name == NULL)
- return -EINVAL;
-
- ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
- if (ret)
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
-
-static int __create_synth_event(int argc, const char *name, const char **argv)
-{
- struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
- struct synth_event *event = NULL;
- int i, consumed = 0, n_fields = 0, ret = 0;
-
- /*
- * Argument syntax:
- * - Add synthetic event: <event_name> field[;field] ...
- * - Remove synthetic event: !<event_name> field[;field] ...
- * where 'field' = type field_name
- */
-
- if (name[0] == '\0' || argc < 1)
- return -EINVAL;
-
- mutex_lock(&event_mutex);
-
- event = find_synth_event(name);
- if (event) {
- ret = -EEXIST;
- goto out;
- }
-
- for (i = 0; i < argc - 1; i++) {
- if (strcmp(argv[i], ";") == 0)
- continue;
- if (n_fields == SYNTH_FIELDS_MAX) {
- ret = -EINVAL;
- goto err;
- }
-
- field = parse_synth_field(argc - i, &argv[i], &consumed);
- if (IS_ERR(field)) {
- ret = PTR_ERR(field);
- goto err;
- }
- fields[n_fields++] = field;
- i += consumed - 1;
- }
-
- if (i < argc && strcmp(argv[i], ";") != 0) {
- ret = -EINVAL;
- goto err;
- }
-
- event = alloc_synth_event(name, n_fields, fields);
- if (IS_ERR(event)) {
- ret = PTR_ERR(event);
- event = NULL;
- goto err;
- }
- ret = register_synth_event(event);
- if (!ret)
- dyn_event_add(&event->devent);
- else
- free_synth_event(event);
- out:
- mutex_unlock(&event_mutex);
-
- return ret;
- err:
- for (i = 0; i < n_fields; i++)
- free_synth_field(fields[i]);
-
- goto out;
-}
-
-/**
- * synth_event_create - Create a new synthetic event
- * @name: The name of the new sythetic event
- * @fields: An array of type/name field descriptions
- * @n_fields: The number of field descriptions contained in the fields array
- * @mod: The module creating the event, NULL if not created from a module
- *
- * Create a new synthetic event with the given name under the
- * trace/events/synthetic/ directory. The event fields that will be
- * defined for the event should be passed in as an array of struct
- * synth_field_desc, and the number elements in the array passed in as
- * n_fields. Field ordering will retain the ordering given in the
- * fields array.
- *
- * If the new synthetic event is being created from a module, the mod
- * param must be non-NULL. This will ensure that the trace buffer
- * won't contain unreadable events.
- *
- * The new synth event should be deleted using synth_event_delete()
- * function. The new synthetic event can be generated from modules or
- * other kernel code using trace_synth_event() and related functions.
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_create(const char *name, struct synth_field_desc *fields,
- unsigned int n_fields, struct module *mod)
-{
- struct dynevent_cmd cmd;
- char *buf;
- int ret;
-
- buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
-
- ret = synth_event_gen_cmd_array_start(&cmd, name, mod,
- fields, n_fields);
- if (ret)
- goto out;
-
- ret = synth_event_gen_cmd_end(&cmd);
- out:
- kfree(buf);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_create);
-
-static int destroy_synth_event(struct synth_event *se)
-{
- int ret;
-
- if (se->ref)
- ret = -EBUSY;
- else {
- ret = unregister_synth_event(se);
- if (!ret) {
- dyn_event_remove(&se->devent);
- free_synth_event(se);
- }
- }
-
- return ret;
-}
-
-/**
- * synth_event_delete - Delete a synthetic event
- * @event_name: The name of the new sythetic event
- *
- * Delete a synthetic event that was created with synth_event_create().
- *
- * Return: 0 if successful, error otherwise.
- */
-int synth_event_delete(const char *event_name)
-{
- struct synth_event *se = NULL;
- struct module *mod = NULL;
- int ret = -ENOENT;
-
- mutex_lock(&event_mutex);
- se = find_synth_event(event_name);
- if (se) {
- mod = se->mod;
- ret = destroy_synth_event(se);
- }
- mutex_unlock(&event_mutex);
-
- if (mod) {
- mutex_lock(&trace_types_lock);
- /*
- * It is safest to reset the ring buffer if the module
- * being unloaded registered any events that were
- * used. The only worry is if a new module gets
- * loaded, and takes on the same id as the events of
- * this module. When printing out the buffer, traced
- * events left over from this module may be passed to
- * the new module events and unexpected results may
- * occur.
- */
- tracing_reset_all_online_cpus();
- mutex_unlock(&trace_types_lock);
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_delete);
-
-static int create_or_delete_synth_event(int argc, char **argv)
-{
- const char *name = argv[0];
- int ret;
-
- /* trace_run_command() ensures argc != 0 */
- if (name[0] == '!') {
- ret = synth_event_delete(name + 1);
- return ret;
- }
-
- ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
- return ret == -ECANCELED ? -EINVAL : ret;
-}
-
-static int synth_event_run_command(struct dynevent_cmd *cmd)
-{
- struct synth_event *se;
- int ret;
-
- ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
- if (ret)
- return ret;
-
- se = find_synth_event(cmd->event_name);
- if (WARN_ON(!se))
- return -ENOENT;
-
- se->mod = cmd->private_data;
-
- return ret;
-}
-
-/**
- * synth_event_cmd_init - Initialize a synthetic event command object
- * @cmd: A pointer to the dynevent_cmd struct representing the new event
- * @buf: A pointer to the buffer used to build the command
- * @maxlen: The length of the buffer passed in @buf
- *
- * Initialize a synthetic event command object. Use this before
- * calling any of the other dyenvent_cmd functions.
- */
-void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
-{
- dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH,
- synth_event_run_command);
-}
-EXPORT_SYMBOL_GPL(synth_event_cmd_init);
-
-static inline int
-__synth_event_trace_start(struct trace_event_file *file,
- struct synth_event_trace_state *trace_state)
-{
- int entry_size, fields_size = 0;
- int ret = 0;
-
- memset(trace_state, '\0', sizeof(*trace_state));
-
- /*
- * Normal event tracing doesn't get called at all unless the
- * ENABLED bit is set (which attaches the probe thus allowing
- * this code to be called, etc). Because this is called
- * directly by the user, we don't have that but we still need
- * to honor not logging when disabled. For the the iterated
- * trace case, we save the enabed state upon start and just
- * ignore the following data calls.
- */
- if (!(file->flags & EVENT_FILE_FL_ENABLED) ||
- trace_trigger_soft_disabled(file)) {
- trace_state->disabled = true;
- ret = -ENOENT;
- goto out;
- }
-
- trace_state->event = file->event_call->data;
-
- fields_size = trace_state->event->n_u64 * sizeof(u64);
-
- /*
- * Avoid ring buffer recursion detection, as this event
- * is being performed within another event.
- */
- trace_state->buffer = file->tr->array_buffer.buffer;
- ring_buffer_nest_start(trace_state->buffer);
-
- entry_size = sizeof(*trace_state->entry) + fields_size;
- trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer,
- file,
- entry_size);
- if (!trace_state->entry) {
- ring_buffer_nest_end(trace_state->buffer);
- ret = -EINVAL;
- }
-out:
- return ret;
-}
-
-static inline void
-__synth_event_trace_end(struct synth_event_trace_state *trace_state)
-{
- trace_event_buffer_commit(&trace_state->fbuffer);
-
- ring_buffer_nest_end(trace_state->buffer);
-}
-
-/**
- * synth_event_trace - Trace a synthetic event
- * @file: The trace_event_file representing the synthetic event
- * @n_vals: The number of values in vals
- * @args: Variable number of args containing the event values
- *
- * Trace a synthetic event using the values passed in the variable
- * argument list.
- *
- * The argument list should be a list 'n_vals' u64 values. The number
- * of vals must match the number of field in the synthetic event, and
- * must be in the same order as the synthetic event fields.
- *
- * All vals should be cast to u64, and string vals are just pointers
- * to strings, cast to u64. Strings will be copied into space
- * reserved in the event for the string, using these pointers.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
-{
- struct synth_event_trace_state state;
- unsigned int i, n_u64;
- va_list args;
- int ret;
-
- ret = __synth_event_trace_start(file, &state);
- if (ret) {
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
- return ret;
- }
-
- if (n_vals != state.event->n_fields) {
- ret = -EINVAL;
- goto out;
- }
-
- va_start(args, n_vals);
- for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
- u64 val;
-
- val = va_arg(args, u64);
-
- if (state.event->fields[i]->is_string) {
- char *str_val = (char *)(long)val;
- char *str_field = (char *)&state.entry->fields[n_u64];
-
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct synth_field *field = state.event->fields[i];
-
- switch (field->size) {
- case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
- break;
-
- default:
- state.entry->fields[n_u64] = val;
- break;
- }
- n_u64++;
- }
- }
- va_end(args);
-out:
- __synth_event_trace_end(&state);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace);
-
-/**
- * synth_event_trace_array - Trace a synthetic event from an array
- * @file: The trace_event_file representing the synthetic event
- * @vals: Array of values
- * @n_vals: The number of values in vals
- *
- * Trace a synthetic event using the values passed in as 'vals'.
- *
- * The 'vals' array is just an array of 'n_vals' u64. The number of
- * vals must match the number of field in the synthetic event, and
- * must be in the same order as the synthetic event fields.
- *
- * All vals should be cast to u64, and string vals are just pointers
- * to strings, cast to u64. Strings will be copied into space
- * reserved in the event for the string, using these pointers.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
- unsigned int n_vals)
-{
- struct synth_event_trace_state state;
- unsigned int i, n_u64;
- int ret;
-
- ret = __synth_event_trace_start(file, &state);
- if (ret) {
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
- return ret;
- }
-
- if (n_vals != state.event->n_fields) {
- ret = -EINVAL;
- goto out;
- }
-
- for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
- if (state.event->fields[i]->is_string) {
- char *str_val = (char *)(long)vals[i];
- char *str_field = (char *)&state.entry->fields[n_u64];
-
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
- } else {
- struct synth_field *field = state.event->fields[i];
- u64 val = vals[i];
-
- switch (field->size) {
- case 1:
- *(u8 *)&state.entry->fields[n_u64] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&state.entry->fields[n_u64] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&state.entry->fields[n_u64] = (u32)val;
- break;
-
- default:
- state.entry->fields[n_u64] = val;
- break;
- }
- n_u64++;
- }
- }
-out:
- __synth_event_trace_end(&state);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace_array);
-
-/**
- * synth_event_trace_start - Start piecewise synthetic event trace
- * @file: The trace_event_file representing the synthetic event
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * Start the trace of a synthetic event field-by-field rather than all
- * at once.
- *
- * This function 'opens' an event trace, which means space is reserved
- * for the event in the trace buffer, after which the event's
- * individual field values can be set through either
- * synth_event_add_next_val() or synth_event_add_val().
- *
- * A pointer to a trace_state object is passed in, which will keep
- * track of the current event trace state until the event trace is
- * closed (and the event finally traced) using
- * synth_event_trace_end().
- *
- * Note that synth_event_trace_end() must be called after all values
- * have been added for each event trace, regardless of whether adding
- * all field values succeeded or not.
- *
- * Note also that for a given event trace, all fields must be added
- * using either synth_event_add_next_val() or synth_event_add_val()
- * but not both together or interleaved.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace_start(struct trace_event_file *file,
- struct synth_event_trace_state *trace_state)
-{
- int ret;
-
- if (!trace_state)
- return -EINVAL;
-
- ret = __synth_event_trace_start(file, trace_state);
- if (ret == -ENOENT)
- ret = 0; /* just disabled, not really an error */
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace_start);
-
-static int __synth_event_add_val(const char *field_name, u64 val,
- struct synth_event_trace_state *trace_state)
-{
- struct synth_field *field = NULL;
- struct synth_trace_event *entry;
- struct synth_event *event;
- int i, ret = 0;
-
- if (!trace_state) {
- ret = -EINVAL;
- goto out;
- }
-
- /* can't mix add_next_synth_val() with add_synth_val() */
- if (field_name) {
- if (trace_state->add_next) {
- ret = -EINVAL;
- goto out;
- }
- trace_state->add_name = true;
- } else {
- if (trace_state->add_name) {
- ret = -EINVAL;
- goto out;
- }
- trace_state->add_next = true;
- }
-
- if (trace_state->disabled)
- goto out;
-
- event = trace_state->event;
- if (trace_state->add_name) {
- for (i = 0; i < event->n_fields; i++) {
- field = event->fields[i];
- if (strcmp(field->name, field_name) == 0)
- break;
- }
- if (!field) {
- ret = -EINVAL;
- goto out;
- }
- } else {
- if (trace_state->cur_field >= event->n_fields) {
- ret = -EINVAL;
- goto out;
- }
- field = event->fields[trace_state->cur_field++];
- }
-
- entry = trace_state->entry;
- if (field->is_string) {
- char *str_val = (char *)(long)val;
- char *str_field;
-
- if (!str_val) {
- ret = -EINVAL;
- goto out;
- }
-
- str_field = (char *)&entry->fields[field->offset];
- strscpy(str_field, str_val, STR_VAR_LEN_MAX);
- } else {
- switch (field->size) {
- case 1:
- *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
- break;
-
- case 2:
- *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
- break;
-
- case 4:
- *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
- break;
-
- default:
- trace_state->entry->fields[field->offset] = val;
- break;
- }
- }
- out:
- return ret;
-}
-
-/**
- * synth_event_add_next_val - Add the next field's value to an open synth trace
- * @val: The value to set the next field to
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * Set the value of the next field in an event that's been opened by
- * synth_event_trace_start().
- *
- * The val param should be the value cast to u64. If the value points
- * to a string, the val param should be a char * cast to u64.
- *
- * This function assumes all the fields in an event are to be set one
- * after another - successive calls to this function are made, one for
- * each field, in the order of the fields in the event, until all
- * fields have been set. If you'd rather set each field individually
- * without regard to ordering, synth_event_add_val() can be used
- * instead.
- *
- * Note however that synth_event_add_next_val() and
- * synth_event_add_val() can't be intermixed for a given event trace -
- * one or the other but not both can be used at the same time.
- *
- * Note also that synth_event_trace_end() must be called after all
- * values have been added for each event trace, regardless of whether
- * adding all field values succeeded or not.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_add_next_val(u64 val,
- struct synth_event_trace_state *trace_state)
-{
- return __synth_event_add_val(NULL, val, trace_state);
-}
-EXPORT_SYMBOL_GPL(synth_event_add_next_val);
-
-/**
- * synth_event_add_val - Add a named field's value to an open synth trace
- * @field_name: The name of the synthetic event field value to set
- * @val: The value to set the next field to
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * Set the value of the named field in an event that's been opened by
- * synth_event_trace_start().
- *
- * The val param should be the value cast to u64. If the value points
- * to a string, the val param should be a char * cast to u64.
- *
- * This function looks up the field name, and if found, sets the field
- * to the specified value. This lookup makes this function more
- * expensive than synth_event_add_next_val(), so use that or the
- * none-piecewise synth_event_trace() instead if efficiency is more
- * important.
- *
- * Note however that synth_event_add_next_val() and
- * synth_event_add_val() can't be intermixed for a given event trace -
- * one or the other but not both can be used at the same time.
- *
- * Note also that synth_event_trace_end() must be called after all
- * values have been added for each event trace, regardless of whether
- * adding all field values succeeded or not.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_add_val(const char *field_name, u64 val,
- struct synth_event_trace_state *trace_state)
-{
- return __synth_event_add_val(field_name, val, trace_state);
-}
-EXPORT_SYMBOL_GPL(synth_event_add_val);
-
-/**
- * synth_event_trace_end - End piecewise synthetic event trace
- * @trace_state: A pointer to object tracking the piecewise trace state
- *
- * End the trace of a synthetic event opened by
- * synth_event_trace__start().
- *
- * This function 'closes' an event trace, which basically means that
- * it commits the reserved event and cleans up other loose ends.
- *
- * A pointer to a trace_state object is passed in, which will keep
- * track of the current event trace state opened with
- * synth_event_trace_start().
- *
- * Note that this function must be called after all values have been
- * added for each event trace, regardless of whether adding all field
- * values succeeded or not.
- *
- * Return: 0 on success, err otherwise.
- */
-int synth_event_trace_end(struct synth_event_trace_state *trace_state)
-{
- if (!trace_state)
- return -EINVAL;
-
- __synth_event_trace_end(trace_state);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(synth_event_trace_end);
-
-static int create_synth_event(int argc, const char **argv)
-{
- const char *name = argv[0];
- int len;
-
- if (name[0] != 's' || name[1] != ':')
- return -ECANCELED;
- name += 2;
-
- /* This interface accepts group name prefix */
- if (strchr(name, '/')) {
- len = str_has_prefix(name, SYNTH_SYSTEM "/");
- if (len == 0)
- return -EINVAL;
- name += len;
- }
- return __create_synth_event(argc - 1, name, argv + 1);
-}
-
-static int synth_event_release(struct dyn_event *ev)
-{
- struct synth_event *event = to_synth_event(ev);
- int ret;
-
- if (event->ref)
- return -EBUSY;
-
- ret = unregister_synth_event(event);
- if (ret)
- return ret;
-
- dyn_event_remove(ev);
- free_synth_event(event);
- return 0;
-}
-
-static int __synth_event_show(struct seq_file *m, struct synth_event *event)
-{
- struct synth_field *field;
- unsigned int i;
-
- seq_printf(m, "%s\t", event->name);
-
- for (i = 0; i < event->n_fields; i++) {
- field = event->fields[i];
-
- /* parameter values */
- seq_printf(m, "%s %s%s", field->type, field->name,
- i == event->n_fields - 1 ? "" : "; ");
- }
-
- seq_putc(m, '\n');
-
- return 0;
-}
-
-static int synth_event_show(struct seq_file *m, struct dyn_event *ev)
-{
- struct synth_event *event = to_synth_event(ev);
-
- seq_printf(m, "s:%s/", event->class.system);
-
- return __synth_event_show(m, event);
-}
-
-static int synth_events_seq_show(struct seq_file *m, void *v)
-{
- struct dyn_event *ev = v;
-
- if (!is_synth_event(ev))
- return 0;
-
- return __synth_event_show(m, to_synth_event(ev));
-}
-
-static const struct seq_operations synth_events_seq_op = {
- .start = dyn_event_seq_start,
- .next = dyn_event_seq_next,
- .stop = dyn_event_seq_stop,
- .show = synth_events_seq_show,
-};
-
-static int synth_events_open(struct inode *inode, struct file *file)
-{
- int ret;
-
- ret = security_locked_down(LOCKDOWN_TRACEFS);
- if (ret)
- return ret;
-
- if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
- ret = dyn_events_release_all(&synth_event_ops);
- if (ret < 0)
- return ret;
- }
-
- return seq_open(file, &synth_events_seq_op);
-}
-
-static ssize_t synth_events_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *ppos)
-{
- return trace_parse_run_command(file, buffer, count, ppos,
- create_or_delete_synth_event);
-}
-
-static const struct file_operations synth_events_fops = {
- .open = synth_events_open,
- .write = synth_events_write,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
static u64 hist_field_timestamp(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct ring_buffer_event *rbe,
@@ -6491,6 +4729,279 @@ const struct file_operations event_hist_fops = {
.release = single_release,
};
+#ifdef CONFIG_HIST_TRIGGERS_DEBUG
+static void hist_field_debug_show_flags(struct seq_file *m,
+ unsigned long flags)
+{
+ seq_puts(m, " flags:\n");
+
+ if (flags & HIST_FIELD_FL_KEY)
+ seq_puts(m, " HIST_FIELD_FL_KEY\n");
+ else if (flags & HIST_FIELD_FL_HITCOUNT)
+ seq_puts(m, " VAL: HIST_FIELD_FL_HITCOUNT\n");
+ else if (flags & HIST_FIELD_FL_VAR)
+ seq_puts(m, " HIST_FIELD_FL_VAR\n");
+ else if (flags & HIST_FIELD_FL_VAR_REF)
+ seq_puts(m, " HIST_FIELD_FL_VAR_REF\n");
+ else
+ seq_puts(m, " VAL: normal u64 value\n");
+
+ if (flags & HIST_FIELD_FL_ALIAS)
+ seq_puts(m, " HIST_FIELD_FL_ALIAS\n");
+}
+
+static int hist_field_debug_show(struct seq_file *m,
+ struct hist_field *field, unsigned long flags)
+{
+ if ((field->flags & flags) != flags) {
+ seq_printf(m, "ERROR: bad flags - %lx\n", flags);
+ return -EINVAL;
+ }
+
+ hist_field_debug_show_flags(m, field->flags);
+ if (field->field)
+ seq_printf(m, " ftrace_event_field name: %s\n",
+ field->field->name);
+
+ if (field->flags & HIST_FIELD_FL_VAR) {
+ seq_printf(m, " var.name: %s\n", field->var.name);
+ seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n",
+ field->var.idx);
+ }
+
+ if (field->flags & HIST_FIELD_FL_ALIAS)
+ seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n",
+ field->var_ref_idx);
+
+ if (field->flags & HIST_FIELD_FL_VAR_REF) {
+ seq_printf(m, " name: %s\n", field->name);
+ seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n",
+ field->var.idx);
+ seq_printf(m, " var.hist_data: %p\n", field->var.hist_data);
+ seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n",
+ field->var_ref_idx);
+ if (field->system)
+ seq_printf(m, " system: %s\n", field->system);
+ if (field->event_name)
+ seq_printf(m, " event_name: %s\n", field->event_name);
+ }
+
+ seq_printf(m, " type: %s\n", field->type);
+ seq_printf(m, " size: %u\n", field->size);
+ seq_printf(m, " is_signed: %u\n", field->is_signed);
+
+ return 0;
+}
+
+static int field_var_debug_show(struct seq_file *m,
+ struct field_var *field_var, unsigned int i,
+ bool save_vars)
+{
+ const char *vars_name = save_vars ? "save_vars" : "field_vars";
+ struct hist_field *field;
+ int ret = 0;
+
+ seq_printf(m, "\n hist_data->%s[%d]:\n", vars_name, i);
+
+ field = field_var->var;
+
+ seq_printf(m, "\n %s[%d].var:\n", vars_name, i);
+
+ hist_field_debug_show_flags(m, field->flags);
+ seq_printf(m, " var.name: %s\n", field->var.name);
+ seq_printf(m, " var.idx (into tracing_map_elt.vars[]): %u\n",
+ field->var.idx);
+
+ field = field_var->val;
+
+ seq_printf(m, "\n %s[%d].val:\n", vars_name, i);
+ if (field->field)
+ seq_printf(m, " ftrace_event_field name: %s\n",
+ field->field->name);
+ else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ seq_printf(m, " type: %s\n", field->type);
+ seq_printf(m, " size: %u\n", field->size);
+ seq_printf(m, " is_signed: %u\n", field->is_signed);
+out:
+ return ret;
+}
+
+static int hist_action_debug_show(struct seq_file *m,
+ struct action_data *data, int i)
+{
+ int ret = 0;
+
+ if (data->handler == HANDLER_ONMAX ||
+ data->handler == HANDLER_ONCHANGE) {
+ seq_printf(m, "\n hist_data->actions[%d].track_data.var_ref:\n", i);
+ ret = hist_field_debug_show(m, data->track_data.var_ref,
+ HIST_FIELD_FL_VAR_REF);
+ if (ret)
+ goto out;
+
+ seq_printf(m, "\n hist_data->actions[%d].track_data.track_var:\n", i);
+ ret = hist_field_debug_show(m, data->track_data.track_var,
+ HIST_FIELD_FL_VAR);
+ if (ret)
+ goto out;
+ }
+
+ if (data->handler == HANDLER_ONMATCH) {
+ seq_printf(m, "\n hist_data->actions[%d].match_data.event_system: %s\n",
+ i, data->match_data.event_system);
+ seq_printf(m, " hist_data->actions[%d].match_data.event: %s\n",
+ i, data->match_data.event);
+ }
+out:
+ return ret;
+}
+
+static int hist_actions_debug_show(struct seq_file *m,
+ struct hist_trigger_data *hist_data)
+{
+ int i, ret = 0;
+
+ if (hist_data->n_actions)
+ seq_puts(m, "\n action tracking variables (for onmax()/onchange()/onmatch()):\n");
+
+ for (i = 0; i < hist_data->n_actions; i++) {
+ struct action_data *action = hist_data->actions[i];
+
+ ret = hist_action_debug_show(m, action, i);
+ if (ret)
+ goto out;
+ }
+
+ if (hist_data->n_save_vars)
+ seq_puts(m, "\n save action variables (save() params):\n");
+
+ for (i = 0; i < hist_data->n_save_vars; i++) {
+ ret = field_var_debug_show(m, hist_data->save_vars[i], i, true);
+ if (ret)
+ goto out;
+ }
+out:
+ return ret;
+}
+
+static void hist_trigger_debug_show(struct seq_file *m,
+ struct event_trigger_data *data, int n)
+{
+ struct hist_trigger_data *hist_data;
+ int i, ret;
+
+ if (n > 0)
+ seq_puts(m, "\n\n");
+
+ seq_puts(m, "# event histogram\n#\n# trigger info: ");
+ data->ops->print(m, data->ops, data);
+ seq_puts(m, "#\n\n");
+
+ hist_data = data->private_data;
+
+ seq_printf(m, "hist_data: %p\n\n", hist_data);
+ seq_printf(m, " n_vals: %u\n", hist_data->n_vals);
+ seq_printf(m, " n_keys: %u\n", hist_data->n_keys);
+ seq_printf(m, " n_fields: %u\n", hist_data->n_fields);
+
+ seq_puts(m, "\n val fields:\n\n");
+
+ seq_puts(m, " hist_data->fields[0]:\n");
+ ret = hist_field_debug_show(m, hist_data->fields[0],
+ HIST_FIELD_FL_HITCOUNT);
+ if (ret)
+ return;
+
+ for (i = 1; i < hist_data->n_vals; i++) {
+ seq_printf(m, "\n hist_data->fields[%d]:\n", i);
+ ret = hist_field_debug_show(m, hist_data->fields[i], 0);
+ if (ret)
+ return;
+ }
+
+ seq_puts(m, "\n key fields:\n");
+
+ for (i = hist_data->n_vals; i < hist_data->n_fields; i++) {
+ seq_printf(m, "\n hist_data->fields[%d]:\n", i);
+ ret = hist_field_debug_show(m, hist_data->fields[i],
+ HIST_FIELD_FL_KEY);
+ if (ret)
+ return;
+ }
+
+ if (hist_data->n_var_refs)
+ seq_puts(m, "\n variable reference fields:\n");
+
+ for (i = 0; i < hist_data->n_var_refs; i++) {
+ seq_printf(m, "\n hist_data->var_refs[%d]:\n", i);
+ ret = hist_field_debug_show(m, hist_data->var_refs[i],
+ HIST_FIELD_FL_VAR_REF);
+ if (ret)
+ return;
+ }
+
+ if (hist_data->n_field_vars)
+ seq_puts(m, "\n field variables:\n");
+
+ for (i = 0; i < hist_data->n_field_vars; i++) {
+ ret = field_var_debug_show(m, hist_data->field_vars[i], i, false);
+ if (ret)
+ return;
+ }
+
+ ret = hist_actions_debug_show(m, hist_data);
+ if (ret)
+ return;
+}
+
+static int hist_debug_show(struct seq_file *m, void *v)
+{
+ struct event_trigger_data *data;
+ struct trace_event_file *event_file;
+ int n = 0, ret = 0;
+
+ mutex_lock(&event_mutex);
+
+ event_file = event_file_data(m->private);
+ if (unlikely(!event_file)) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ list_for_each_entry(data, &event_file->triggers, list) {
+ if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
+ hist_trigger_debug_show(m, data, n++);
+ }
+
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+static int event_hist_debug_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
+ return single_open(file, hist_debug_show, file);
+}
+
+const struct file_operations event_hist_debug_fops = {
+ .open = event_hist_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif
+
static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
{
const char *field_name = hist_field_name(hist_field, 0);
@@ -7393,37 +5904,3 @@ __init int register_trigger_hist_enable_disable_cmds(void)
return ret;
}
-
-static __init int trace_events_hist_init(void)
-{
- struct dentry *entry = NULL;
- struct dentry *d_tracer;
- int err = 0;
-
- err = dyn_event_register(&synth_event_ops);
- if (err) {
- pr_warn("Could not register synth_event_ops\n");
- return err;
- }
-
- d_tracer = tracing_init_dentry();
- if (IS_ERR(d_tracer)) {
- err = PTR_ERR(d_tracer);
- goto err;
- }
-
- entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
- NULL, &synth_events_fops);
- if (!entry) {
- err = -ENODEV;
- goto err;
- }
-
- return err;
- err:
- pr_warn("Could not create tracefs 'synthetic_events' entry\n");
-
- return err;
-}
-
-fs_initcall(trace_events_hist_init);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
new file mode 100644
index 000000000000..c6cca0d1d584
--- /dev/null
+++ b/kernel/trace/trace_events_synth.c
@@ -0,0 +1,1789 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * trace_events_synth - synthetic trace events
+ *
+ * Copyright (C) 2015, 2020 Tom Zanussi <tom.zanussi@linux.intel.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/security.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/rculist.h>
+#include <linux/tracefs.h>
+
+/* for gfp flag names */
+#include <linux/trace_events.h>
+#include <trace/events/mmflags.h>
+
+#include "trace_synth.h"
+
+static int create_synth_event(int argc, const char **argv);
+static int synth_event_show(struct seq_file *m, struct dyn_event *ev);
+static int synth_event_release(struct dyn_event *ev);
+static bool synth_event_is_busy(struct dyn_event *ev);
+static bool synth_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev);
+
+static struct dyn_event_operations synth_event_ops = {
+ .create = create_synth_event,
+ .show = synth_event_show,
+ .is_busy = synth_event_is_busy,
+ .free = synth_event_release,
+ .match = synth_event_match,
+};
+
+static bool is_synth_event(struct dyn_event *ev)
+{
+ return ev->ops == &synth_event_ops;
+}
+
+static struct synth_event *to_synth_event(struct dyn_event *ev)
+{
+ return container_of(ev, struct synth_event, devent);
+}
+
+static bool synth_event_is_busy(struct dyn_event *ev)
+{
+ struct synth_event *event = to_synth_event(ev);
+
+ return event->ref != 0;
+}
+
+static bool synth_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct synth_event *sev = to_synth_event(ev);
+
+ return strcmp(sev->name, event) == 0 &&
+ (!system || strcmp(system, SYNTH_SYSTEM) == 0);
+}
+
+struct synth_trace_event {
+ struct trace_entry ent;
+ u64 fields[];
+};
+
+static int synth_event_define_fields(struct trace_event_call *call)
+{
+ struct synth_trace_event trace;
+ int offset = offsetof(typeof(trace), fields);
+ struct synth_event *event = call->data;
+ unsigned int i, size, n_u64;
+ char *name, *type;
+ bool is_signed;
+ int ret = 0;
+
+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ size = event->fields[i]->size;
+ is_signed = event->fields[i]->is_signed;
+ type = event->fields[i]->type;
+ name = event->fields[i]->name;
+ ret = trace_define_field(call, type, name, offset, size,
+ is_signed, FILTER_OTHER);
+ if (ret)
+ break;
+
+ event->fields[i]->offset = n_u64;
+
+ if (event->fields[i]->is_string) {
+ offset += STR_VAR_LEN_MAX;
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ offset += sizeof(u64);
+ n_u64++;
+ }
+ }
+
+ event->n_u64 = n_u64;
+
+ return ret;
+}
+
+static bool synth_field_signed(char *type)
+{
+ if (str_has_prefix(type, "u"))
+ return false;
+ if (strcmp(type, "gfp_t") == 0)
+ return false;
+
+ return true;
+}
+
+static int synth_field_is_string(char *type)
+{
+ if (strstr(type, "char[") != NULL)
+ return true;
+
+ return false;
+}
+
+static int synth_field_string_size(char *type)
+{
+ char buf[4], *end, *start;
+ unsigned int len;
+ int size, err;
+
+ start = strstr(type, "char[");
+ if (start == NULL)
+ return -EINVAL;
+ start += sizeof("char[") - 1;
+
+ end = strchr(type, ']');
+ if (!end || end < start)
+ return -EINVAL;
+
+ len = end - start;
+ if (len > 3)
+ return -EINVAL;
+
+ strncpy(buf, start, len);
+ buf[len] = '\0';
+
+ err = kstrtouint(buf, 0, &size);
+ if (err)
+ return err;
+
+ if (size > STR_VAR_LEN_MAX)
+ return -EINVAL;
+
+ return size;
+}
+
+static int synth_field_size(char *type)
+{
+ int size = 0;
+
+ if (strcmp(type, "s64") == 0)
+ size = sizeof(s64);
+ else if (strcmp(type, "u64") == 0)
+ size = sizeof(u64);
+ else if (strcmp(type, "s32") == 0)
+ size = sizeof(s32);
+ else if (strcmp(type, "u32") == 0)
+ size = sizeof(u32);
+ else if (strcmp(type, "s16") == 0)
+ size = sizeof(s16);
+ else if (strcmp(type, "u16") == 0)
+ size = sizeof(u16);
+ else if (strcmp(type, "s8") == 0)
+ size = sizeof(s8);
+ else if (strcmp(type, "u8") == 0)
+ size = sizeof(u8);
+ else if (strcmp(type, "char") == 0)
+ size = sizeof(char);
+ else if (strcmp(type, "unsigned char") == 0)
+ size = sizeof(unsigned char);
+ else if (strcmp(type, "int") == 0)
+ size = sizeof(int);
+ else if (strcmp(type, "unsigned int") == 0)
+ size = sizeof(unsigned int);
+ else if (strcmp(type, "long") == 0)
+ size = sizeof(long);
+ else if (strcmp(type, "unsigned long") == 0)
+ size = sizeof(unsigned long);
+ else if (strcmp(type, "pid_t") == 0)
+ size = sizeof(pid_t);
+ else if (strcmp(type, "gfp_t") == 0)
+ size = sizeof(gfp_t);
+ else if (synth_field_is_string(type))
+ size = synth_field_string_size(type);
+
+ return size;
+}
+
+static const char *synth_field_fmt(char *type)
+{
+ const char *fmt = "%llu";
+
+ if (strcmp(type, "s64") == 0)
+ fmt = "%lld";
+ else if (strcmp(type, "u64") == 0)
+ fmt = "%llu";
+ else if (strcmp(type, "s32") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u32") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "s16") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u16") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "s8") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "u8") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "char") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "unsigned char") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "int") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "unsigned int") == 0)
+ fmt = "%u";
+ else if (strcmp(type, "long") == 0)
+ fmt = "%ld";
+ else if (strcmp(type, "unsigned long") == 0)
+ fmt = "%lu";
+ else if (strcmp(type, "pid_t") == 0)
+ fmt = "%d";
+ else if (strcmp(type, "gfp_t") == 0)
+ fmt = "%x";
+ else if (synth_field_is_string(type))
+ fmt = "%s";
+
+ return fmt;
+}
+
+static void print_synth_event_num_val(struct trace_seq *s,
+ char *print_fmt, char *name,
+ int size, u64 val, char *space)
+{
+ switch (size) {
+ case 1:
+ trace_seq_printf(s, print_fmt, name, (u8)val, space);
+ break;
+
+ case 2:
+ trace_seq_printf(s, print_fmt, name, (u16)val, space);
+ break;
+
+ case 4:
+ trace_seq_printf(s, print_fmt, name, (u32)val, space);
+ break;
+
+ default:
+ trace_seq_printf(s, print_fmt, name, val, space);
+ break;
+ }
+}
+
+static enum print_line_t print_synth_event(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ struct trace_array *tr = iter->tr;
+ struct trace_seq *s = &iter->seq;
+ struct synth_trace_event *entry;
+ struct synth_event *se;
+ unsigned int i, n_u64;
+ char print_fmt[32];
+ const char *fmt;
+
+ entry = (struct synth_trace_event *)iter->ent;
+ se = container_of(event, struct synth_event, call.event);
+
+ trace_seq_printf(s, "%s: ", se->name);
+
+ for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
+ if (trace_seq_has_overflowed(s))
+ goto end;
+
+ fmt = synth_field_fmt(se->fields[i]->type);
+
+ /* parameter types */
+ if (tr && tr->trace_flags & TRACE_ITER_VERBOSE)
+ trace_seq_printf(s, "%s ", fmt);
+
+ snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);
+
+ /* parameter values */
+ if (se->fields[i]->is_string) {
+ trace_seq_printf(s, print_fmt, se->fields[i]->name,
+ (char *)&entry->fields[n_u64],
+ i == se->n_fields - 1 ? "" : " ");
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ struct trace_print_flags __flags[] = {
+ __def_gfpflag_names, {-1, NULL} };
+ char *space = (i == se->n_fields - 1 ? "" : " ");
+
+ print_synth_event_num_val(s, print_fmt,
+ se->fields[i]->name,
+ se->fields[i]->size,
+ entry->fields[n_u64],
+ space);
+
+ if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
+ trace_seq_puts(s, " (");
+ trace_print_flags_seq(s, "|",
+ entry->fields[n_u64],
+ __flags);
+ trace_seq_putc(s, ')');
+ }
+ n_u64++;
+ }
+ }
+end:
+ trace_seq_putc(s, '\n');
+
+ return trace_handle_return(s);
+}
+
+static struct trace_event_functions synth_event_funcs = {
+ .trace = print_synth_event
+};
+
+static notrace void trace_event_raw_event_synth(void *__data,
+ u64 *var_ref_vals,
+ unsigned int *var_ref_idx)
+{
+ struct trace_event_file *trace_file = __data;
+ struct synth_trace_event *entry;
+ struct trace_event_buffer fbuffer;
+ struct trace_buffer *buffer;
+ struct synth_event *event;
+ unsigned int i, n_u64, val_idx;
+ int fields_size = 0;
+
+ event = trace_file->event_call->data;
+
+ if (trace_trigger_soft_disabled(trace_file))
+ return;
+
+ fields_size = event->n_u64 * sizeof(u64);
+
+ /*
+ * Avoid ring buffer recursion detection, as this event
+ * is being performed within another event.
+ */
+ buffer = trace_file->tr->array_buffer.buffer;
+ ring_buffer_nest_start(buffer);
+
+ entry = trace_event_buffer_reserve(&fbuffer, trace_file,
+ sizeof(*entry) + fields_size);
+ if (!entry)
+ goto out;
+
+ for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
+ val_idx = var_ref_idx[i];
+ if (event->fields[i]->is_string) {
+ char *str_val = (char *)(long)var_ref_vals[val_idx];
+ char *str_field = (char *)&entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ struct synth_field *field = event->fields[i];
+ u64 val = var_ref_vals[val_idx];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ entry->fields[n_u64] = val;
+ break;
+ }
+ n_u64++;
+ }
+ }
+
+ trace_event_buffer_commit(&fbuffer);
+out:
+ ring_buffer_nest_end(buffer);
+}
+
+static void free_synth_event_print_fmt(struct trace_event_call *call)
+{
+ if (call) {
+ kfree(call->print_fmt);
+ call->print_fmt = NULL;
+ }
+}
+
+static int __set_synth_event_print_fmt(struct synth_event *event,
+ char *buf, int len)
+{
+ const char *fmt;
+ int pos = 0;
+ int i;
+
+ /* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+ for (i = 0; i < event->n_fields; i++) {
+ fmt = synth_field_fmt(event->fields[i]->type);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
+ event->fields[i]->name, fmt,
+ i == event->n_fields - 1 ? "" : ", ");
+ }
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ for (i = 0; i < event->n_fields; i++) {
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", event->fields[i]->name);
+ }
+
+#undef LEN_OR_ZERO
+
+ /* return the length of print_fmt */
+ return pos;
+}
+
+static int set_synth_event_print_fmt(struct trace_event_call *call)
+{
+ struct synth_event *event = call->data;
+ char *print_fmt;
+ int len;
+
+ /* First: called with 0 length to calculate the needed length */
+ len = __set_synth_event_print_fmt(event, NULL, 0);
+
+ print_fmt = kmalloc(len + 1, GFP_KERNEL);
+ if (!print_fmt)
+ return -ENOMEM;
+
+ /* Second: actually write the @print_fmt */
+ __set_synth_event_print_fmt(event, print_fmt, len + 1);
+ call->print_fmt = print_fmt;
+
+ return 0;
+}
+
+static void free_synth_field(struct synth_field *field)
+{
+ kfree(field->type);
+ kfree(field->name);
+ kfree(field);
+}
+
+static struct synth_field *parse_synth_field(int argc, const char **argv,
+ int *consumed)
+{
+ struct synth_field *field;
+ const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
+ int len, ret = 0;
+
+ if (field_type[0] == ';')
+ field_type++;
+
+ if (!strcmp(field_type, "unsigned")) {
+ if (argc < 3)
+ return ERR_PTR(-EINVAL);
+ prefix = "unsigned ";
+ field_type = argv[1];
+ field_name = argv[2];
+ *consumed = 3;
+ } else {
+ field_name = argv[1];
+ *consumed = 2;
+ }
+
+ field = kzalloc(sizeof(*field), GFP_KERNEL);
+ if (!field)
+ return ERR_PTR(-ENOMEM);
+
+ len = strlen(field_name);
+ array = strchr(field_name, '[');
+ if (array)
+ len -= strlen(array);
+ else if (field_name[len - 1] == ';')
+ len--;
+
+ field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
+ if (!field->name) {
+ ret = -ENOMEM;
+ goto free;
+ }
+
+ if (field_type[0] == ';')
+ field_type++;
+ len = strlen(field_type) + 1;
+ if (array)
+ len += strlen(array);
+ if (prefix)
+ len += strlen(prefix);
+
+ field->type = kzalloc(len, GFP_KERNEL);
+ if (!field->type) {
+ ret = -ENOMEM;
+ goto free;
+ }
+ if (prefix)
+ strcat(field->type, prefix);
+ strcat(field->type, field_type);
+ if (array) {
+ strcat(field->type, array);
+ if (field->type[len - 1] == ';')
+ field->type[len - 1] = '\0';
+ }
+
+ field->size = synth_field_size(field->type);
+ if (!field->size) {
+ ret = -EINVAL;
+ goto free;
+ }
+
+ if (synth_field_is_string(field->type))
+ field->is_string = true;
+
+ field->is_signed = synth_field_signed(field->type);
+
+ out:
+ return field;
+ free:
+ free_synth_field(field);
+ field = ERR_PTR(ret);
+ goto out;
+}
+
+static void free_synth_tracepoint(struct tracepoint *tp)
+{
+ if (!tp)
+ return;
+
+ kfree(tp->name);
+ kfree(tp);
+}
+
+static struct tracepoint *alloc_synth_tracepoint(char *name)
+{
+ struct tracepoint *tp;
+
+ tp = kzalloc(sizeof(*tp), GFP_KERNEL);
+ if (!tp)
+ return ERR_PTR(-ENOMEM);
+
+ tp->name = kstrdup(name, GFP_KERNEL);
+ if (!tp->name) {
+ kfree(tp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return tp;
+}
+
+struct synth_event *find_synth_event(const char *name)
+{
+ struct dyn_event *pos;
+ struct synth_event *event;
+
+ for_each_dyn_event(pos) {
+ if (!is_synth_event(pos))
+ continue;
+ event = to_synth_event(pos);
+ if (strcmp(event->name, name) == 0)
+ return event;
+ }
+
+ return NULL;
+}
+
+static struct trace_event_fields synth_event_fields_array[] = {
+ { .type = TRACE_FUNCTION_TYPE,
+ .define_fields = synth_event_define_fields },
+ {}
+};
+
+static int register_synth_event(struct synth_event *event)
+{
+ struct trace_event_call *call = &event->call;
+ int ret = 0;
+
+ event->call.class = &event->class;
+ event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
+ if (!event->class.system) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ event->tp = alloc_synth_tracepoint(event->name);
+ if (IS_ERR(event->tp)) {
+ ret = PTR_ERR(event->tp);
+ event->tp = NULL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&call->class->fields);
+ call->event.funcs = &synth_event_funcs;
+ call->class->fields_array = synth_event_fields_array;
+
+ ret = register_trace_event(&call->event);
+ if (!ret) {
+ ret = -ENODEV;
+ goto out;
+ }
+ call->flags = TRACE_EVENT_FL_TRACEPOINT;
+ call->class->reg = trace_event_reg;
+ call->class->probe = trace_event_raw_event_synth;
+ call->data = event;
+ call->tp = event->tp;
+
+ ret = trace_add_event_call(call);
+ if (ret) {
+ pr_warn("Failed to register synthetic event: %s\n",
+ trace_event_name(call));
+ goto err;
+ }
+
+ ret = set_synth_event_print_fmt(call);
+ if (ret < 0) {
+ trace_remove_event_call(call);
+ goto err;
+ }
+ out:
+ return ret;
+ err:
+ unregister_trace_event(&call->event);
+ goto out;
+}
+
+static int unregister_synth_event(struct synth_event *event)
+{
+ struct trace_event_call *call = &event->call;
+ int ret;
+
+ ret = trace_remove_event_call(call);
+
+ return ret;
+}
+
+static void free_synth_event(struct synth_event *event)
+{
+ unsigned int i;
+
+ if (!event)
+ return;
+
+ for (i = 0; i < event->n_fields; i++)
+ free_synth_field(event->fields[i]);
+
+ kfree(event->fields);
+ kfree(event->name);
+ kfree(event->class.system);
+ free_synth_tracepoint(event->tp);
+ free_synth_event_print_fmt(&event->call);
+ kfree(event);
+}
+
+static struct synth_event *alloc_synth_event(const char *name, int n_fields,
+ struct synth_field **fields)
+{
+ struct synth_event *event;
+ unsigned int i;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event) {
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ event->name = kstrdup(name, GFP_KERNEL);
+ if (!event->name) {
+ kfree(event);
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
+ if (!event->fields) {
+ free_synth_event(event);
+ event = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ dyn_event_init(&event->devent, &synth_event_ops);
+
+ for (i = 0; i < n_fields; i++)
+ event->fields[i] = fields[i];
+
+ event->n_fields = n_fields;
+ out:
+ return event;
+}
+
+static int synth_event_check_arg_fn(void *data)
+{
+ struct dynevent_arg_pair *arg_pair = data;
+ int size;
+
+ size = synth_field_size((char *)arg_pair->lhs);
+
+ return size ? 0 : -EINVAL;
+}
+
+/**
+ * synth_event_add_field - Add a new field to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @type: The type of the new field to add
+ * @name: The name of the new field to add
+ *
+ * Add a new field to a synthetic event cmd object. Field ordering is in
+ * the same order the fields are added.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_field(struct dynevent_cmd *cmd, const char *type,
+ const char *name)
+{
+ struct dynevent_arg_pair arg_pair;
+ int ret;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (!type || !name)
+ return -EINVAL;
+
+ dynevent_arg_pair_init(&arg_pair, 0, ';');
+
+ arg_pair.lhs = type;
+ arg_pair.rhs = name;
+
+ ret = dynevent_arg_pair_add(cmd, &arg_pair, synth_event_check_arg_fn);
+ if (ret)
+ return ret;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX)
+ ret = -EINVAL;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_field);
+
+/**
+ * synth_event_add_field_str - Add a new field to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @type_name: The type and name of the new field to add, as a single string
+ *
+ * Add a new field to a synthetic event cmd object, as a single
+ * string. The @type_name string is expected to be of the form 'type
+ * name', which will be appended by ';'. No sanity checking is done -
+ * what's passed in is assumed to already be well-formed. Field
+ * ordering is in the same order the fields are added.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_field_str(struct dynevent_cmd *cmd, const char *type_name)
+{
+ struct dynevent_arg arg;
+ int ret;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (!type_name)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, ';');
+
+ arg.str = type_name;
+
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX)
+ ret = -EINVAL;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_field_str);
+
+/**
+ * synth_event_add_fields - Add multiple fields to a synthetic event cmd
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ *
+ * Add a new set of fields to a synthetic event cmd object. The event
+ * fields that will be defined for the event should be passed in as an
+ * array of struct synth_field_desc, and the number of elements in the
+ * array passed in as n_fields. Field ordering will retain the
+ * ordering given in the fields array.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_add_fields(struct dynevent_cmd *cmd,
+ struct synth_field_desc *fields,
+ unsigned int n_fields)
+{
+ unsigned int i;
+ int ret = 0;
+
+ for (i = 0; i < n_fields; i++) {
+ if (fields[i].type == NULL || fields[i].name == NULL) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_add_fields);
+
+/**
+ * __synth_event_gen_cmd_start - Start a synthetic event command from arg list
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @name: The name of the synthetic event
+ * @mod: The module creating the event, NULL if not created from a module
+ * @args: Variable number of arg (pairs), one pair for each field
+ *
+ * NOTE: Users normally won't want to call this function directly, but
+ * rather use the synth_event_gen_cmd_start() wrapper, which
+ * automatically adds a NULL to the end of the arg list. If this
+ * function is used directly, make sure the last arg in the variable
+ * arg list is NULL.
+ *
+ * Generate a synthetic event command to be executed by
+ * synth_event_gen_cmd_end(). This function can be used to generate
+ * the complete command or only the first part of it; in the latter
+ * case, synth_event_add_field(), synth_event_add_field_str(), or
+ * synth_event_add_fields() can be used to add more fields following
+ * this.
+ *
+ * There should be an even number variable args, each pair consisting
+ * of a type followed by a field name.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int __synth_event_gen_cmd_start(struct dynevent_cmd *cmd, const char *name,
+ struct module *mod, ...)
+{
+ struct dynevent_arg arg;
+ va_list args;
+ int ret;
+
+ cmd->event_name = name;
+ cmd->private_data = mod;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, 0);
+ arg.str = name;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ va_start(args, mod);
+ for (;;) {
+ const char *type, *name;
+
+ type = va_arg(args, const char *);
+ if (!type)
+ break;
+ name = va_arg(args, const char *);
+ if (!name)
+ break;
+
+ if (++cmd->n_fields > SYNTH_FIELDS_MAX) {
+ ret = -EINVAL;
+ break;
+ }
+
+ ret = synth_event_add_field(cmd, type, name);
+ if (ret)
+ break;
+ }
+ va_end(args);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__synth_event_gen_cmd_start);
+
+/**
+ * synth_event_gen_cmd_array_start - Start synthetic event command from an array
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @name: The name of the synthetic event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ *
+ * Generate a synthetic event command to be executed by
+ * synth_event_gen_cmd_end(). This function can be used to generate
+ * the complete command or only the first part of it; in the latter
+ * case, synth_event_add_field(), synth_event_add_field_str(), or
+ * synth_event_add_fields() can be used to add more fields following
+ * this.
+ *
+ * The event fields that will be defined for the event should be
+ * passed in as an array of struct synth_field_desc, and the number of
+ * elements in the array passed in as n_fields. Field ordering will
+ * retain the ordering given in the fields array.
+ *
+ * See synth_field_size() for available types. If field_name contains
+ * [n] the field is considered to be an array.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_gen_cmd_array_start(struct dynevent_cmd *cmd, const char *name,
+ struct module *mod,
+ struct synth_field_desc *fields,
+ unsigned int n_fields)
+{
+ struct dynevent_arg arg;
+ unsigned int i;
+ int ret = 0;
+
+ cmd->event_name = name;
+ cmd->private_data = mod;
+
+ if (cmd->type != DYNEVENT_TYPE_SYNTH)
+ return -EINVAL;
+
+ if (n_fields > SYNTH_FIELDS_MAX)
+ return -EINVAL;
+
+ dynevent_arg_init(&arg, 0);
+ arg.str = name;
+ ret = dynevent_arg_add(cmd, &arg, NULL);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < n_fields; i++) {
+ if (fields[i].type == NULL || fields[i].name == NULL)
+ return -EINVAL;
+
+ ret = synth_event_add_field(cmd, fields[i].type, fields[i].name);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_gen_cmd_array_start);
+
+static int __create_synth_event(int argc, const char *name, const char **argv)
+{
+ struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
+ struct synth_event *event = NULL;
+ int i, consumed = 0, n_fields = 0, ret = 0;
+
+ /*
+ * Argument syntax:
+ * - Add synthetic event: <event_name> field[;field] ...
+ * - Remove synthetic event: !<event_name> field[;field] ...
+ * where 'field' = type field_name
+ */
+
+ if (name[0] == '\0' || argc < 1)
+ return -EINVAL;
+
+ mutex_lock(&event_mutex);
+
+ event = find_synth_event(name);
+ if (event) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ for (i = 0; i < argc - 1; i++) {
+ if (strcmp(argv[i], ";") == 0)
+ continue;
+ if (n_fields == SYNTH_FIELDS_MAX) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ field = parse_synth_field(argc - i, &argv[i], &consumed);
+ if (IS_ERR(field)) {
+ ret = PTR_ERR(field);
+ goto err;
+ }
+ fields[n_fields++] = field;
+ i += consumed - 1;
+ }
+
+ if (i < argc && strcmp(argv[i], ";") != 0) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ event = alloc_synth_event(name, n_fields, fields);
+ if (IS_ERR(event)) {
+ ret = PTR_ERR(event);
+ event = NULL;
+ goto err;
+ }
+ ret = register_synth_event(event);
+ if (!ret)
+ dyn_event_add(&event->devent);
+ else
+ free_synth_event(event);
+ out:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+ err:
+ for (i = 0; i < n_fields; i++)
+ free_synth_field(fields[i]);
+
+ goto out;
+}
+
+/**
+ * synth_event_create - Create a new synthetic event
+ * @name: The name of the new sythetic event
+ * @fields: An array of type/name field descriptions
+ * @n_fields: The number of field descriptions contained in the fields array
+ * @mod: The module creating the event, NULL if not created from a module
+ *
+ * Create a new synthetic event with the given name under the
+ * trace/events/synthetic/ directory. The event fields that will be
+ * defined for the event should be passed in as an array of struct
+ * synth_field_desc, and the number elements in the array passed in as
+ * n_fields. Field ordering will retain the ordering given in the
+ * fields array.
+ *
+ * If the new synthetic event is being created from a module, the mod
+ * param must be non-NULL. This will ensure that the trace buffer
+ * won't contain unreadable events.
+ *
+ * The new synth event should be deleted using synth_event_delete()
+ * function. The new synthetic event can be generated from modules or
+ * other kernel code using trace_synth_event() and related functions.
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_create(const char *name, struct synth_field_desc *fields,
+ unsigned int n_fields, struct module *mod)
+{
+ struct dynevent_cmd cmd;
+ char *buf;
+ int ret;
+
+ buf = kzalloc(MAX_DYNEVENT_CMD_LEN, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ synth_event_cmd_init(&cmd, buf, MAX_DYNEVENT_CMD_LEN);
+
+ ret = synth_event_gen_cmd_array_start(&cmd, name, mod,
+ fields, n_fields);
+ if (ret)
+ goto out;
+
+ ret = synth_event_gen_cmd_end(&cmd);
+ out:
+ kfree(buf);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_create);
+
+static int destroy_synth_event(struct synth_event *se)
+{
+ int ret;
+
+ if (se->ref)
+ ret = -EBUSY;
+ else {
+ ret = unregister_synth_event(se);
+ if (!ret) {
+ dyn_event_remove(&se->devent);
+ free_synth_event(se);
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * synth_event_delete - Delete a synthetic event
+ * @event_name: The name of the new sythetic event
+ *
+ * Delete a synthetic event that was created with synth_event_create().
+ *
+ * Return: 0 if successful, error otherwise.
+ */
+int synth_event_delete(const char *event_name)
+{
+ struct synth_event *se = NULL;
+ struct module *mod = NULL;
+ int ret = -ENOENT;
+
+ mutex_lock(&event_mutex);
+ se = find_synth_event(event_name);
+ if (se) {
+ mod = se->mod;
+ ret = destroy_synth_event(se);
+ }
+ mutex_unlock(&event_mutex);
+
+ if (mod) {
+ mutex_lock(&trace_types_lock);
+ /*
+ * It is safest to reset the ring buffer if the module
+ * being unloaded registered any events that were
+ * used. The only worry is if a new module gets
+ * loaded, and takes on the same id as the events of
+ * this module. When printing out the buffer, traced
+ * events left over from this module may be passed to
+ * the new module events and unexpected results may
+ * occur.
+ */
+ tracing_reset_all_online_cpus();
+ mutex_unlock(&trace_types_lock);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_delete);
+
+static int create_or_delete_synth_event(int argc, char **argv)
+{
+ const char *name = argv[0];
+ int ret;
+
+ /* trace_run_command() ensures argc != 0 */
+ if (name[0] == '!') {
+ ret = synth_event_delete(name + 1);
+ return ret;
+ }
+
+ ret = __create_synth_event(argc - 1, name, (const char **)argv + 1);
+ return ret == -ECANCELED ? -EINVAL : ret;
+}
+
+static int synth_event_run_command(struct dynevent_cmd *cmd)
+{
+ struct synth_event *se;
+ int ret;
+
+ ret = trace_run_command(cmd->seq.buffer, create_or_delete_synth_event);
+ if (ret)
+ return ret;
+
+ se = find_synth_event(cmd->event_name);
+ if (WARN_ON(!se))
+ return -ENOENT;
+
+ se->mod = cmd->private_data;
+
+ return ret;
+}
+
+/**
+ * synth_event_cmd_init - Initialize a synthetic event command object
+ * @cmd: A pointer to the dynevent_cmd struct representing the new event
+ * @buf: A pointer to the buffer used to build the command
+ * @maxlen: The length of the buffer passed in @buf
+ *
+ * Initialize a synthetic event command object. Use this before
+ * calling any of the other dyenvent_cmd functions.
+ */
+void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
+{
+ dynevent_cmd_init(cmd, buf, maxlen, DYNEVENT_TYPE_SYNTH,
+ synth_event_run_command);
+}
+EXPORT_SYMBOL_GPL(synth_event_cmd_init);
+
+static inline int
+__synth_event_trace_start(struct trace_event_file *file,
+ struct synth_event_trace_state *trace_state)
+{
+ int entry_size, fields_size = 0;
+ int ret = 0;
+
+ memset(trace_state, '\0', sizeof(*trace_state));
+
+ /*
+ * Normal event tracing doesn't get called at all unless the
+ * ENABLED bit is set (which attaches the probe thus allowing
+ * this code to be called, etc). Because this is called
+ * directly by the user, we don't have that but we still need
+ * to honor not logging when disabled. For the the iterated
+ * trace case, we save the enabed state upon start and just
+ * ignore the following data calls.
+ */
+ if (!(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file)) {
+ trace_state->disabled = true;
+ ret = -ENOENT;
+ goto out;
+ }
+
+ trace_state->event = file->event_call->data;
+
+ fields_size = trace_state->event->n_u64 * sizeof(u64);
+
+ /*
+ * Avoid ring buffer recursion detection, as this event
+ * is being performed within another event.
+ */
+ trace_state->buffer = file->tr->array_buffer.buffer;
+ ring_buffer_nest_start(trace_state->buffer);
+
+ entry_size = sizeof(*trace_state->entry) + fields_size;
+ trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer,
+ file,
+ entry_size);
+ if (!trace_state->entry) {
+ ring_buffer_nest_end(trace_state->buffer);
+ ret = -EINVAL;
+ }
+out:
+ return ret;
+}
+
+static inline void
+__synth_event_trace_end(struct synth_event_trace_state *trace_state)
+{
+ trace_event_buffer_commit(&trace_state->fbuffer);
+
+ ring_buffer_nest_end(trace_state->buffer);
+}
+
+/**
+ * synth_event_trace - Trace a synthetic event
+ * @file: The trace_event_file representing the synthetic event
+ * @n_vals: The number of values in vals
+ * @args: Variable number of args containing the event values
+ *
+ * Trace a synthetic event using the values passed in the variable
+ * argument list.
+ *
+ * The argument list should be a list 'n_vals' u64 values. The number
+ * of vals must match the number of field in the synthetic event, and
+ * must be in the same order as the synthetic event fields.
+ *
+ * All vals should be cast to u64, and string vals are just pointers
+ * to strings, cast to u64. Strings will be copied into space
+ * reserved in the event for the string, using these pointers.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
+{
+ struct synth_event_trace_state state;
+ unsigned int i, n_u64;
+ va_list args;
+ int ret;
+
+ ret = __synth_event_trace_start(file, &state);
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+ return ret;
+ }
+
+ if (n_vals != state.event->n_fields) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ va_start(args, n_vals);
+ for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
+ u64 val;
+
+ val = va_arg(args, u64);
+
+ if (state.event->fields[i]->is_string) {
+ char *str_val = (char *)(long)val;
+ char *str_field = (char *)&state.entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ struct synth_field *field = state.event->fields[i];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ state.entry->fields[n_u64] = val;
+ break;
+ }
+ n_u64++;
+ }
+ }
+ va_end(args);
+out:
+ __synth_event_trace_end(&state);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace);
+
+/**
+ * synth_event_trace_array - Trace a synthetic event from an array
+ * @file: The trace_event_file representing the synthetic event
+ * @vals: Array of values
+ * @n_vals: The number of values in vals
+ *
+ * Trace a synthetic event using the values passed in as 'vals'.
+ *
+ * The 'vals' array is just an array of 'n_vals' u64. The number of
+ * vals must match the number of field in the synthetic event, and
+ * must be in the same order as the synthetic event fields.
+ *
+ * All vals should be cast to u64, and string vals are just pointers
+ * to strings, cast to u64. Strings will be copied into space
+ * reserved in the event for the string, using these pointers.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
+ unsigned int n_vals)
+{
+ struct synth_event_trace_state state;
+ unsigned int i, n_u64;
+ int ret;
+
+ ret = __synth_event_trace_start(file, &state);
+ if (ret) {
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+ return ret;
+ }
+
+ if (n_vals != state.event->n_fields) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
+ if (state.event->fields[i]->is_string) {
+ char *str_val = (char *)(long)vals[i];
+ char *str_field = (char *)&state.entry->fields[n_u64];
+
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
+ } else {
+ struct synth_field *field = state.event->fields[i];
+ u64 val = vals[i];
+
+ switch (field->size) {
+ case 1:
+ *(u8 *)&state.entry->fields[n_u64] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&state.entry->fields[n_u64] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&state.entry->fields[n_u64] = (u32)val;
+ break;
+
+ default:
+ state.entry->fields[n_u64] = val;
+ break;
+ }
+ n_u64++;
+ }
+ }
+out:
+ __synth_event_trace_end(&state);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace_array);
+
+/**
+ * synth_event_trace_start - Start piecewise synthetic event trace
+ * @file: The trace_event_file representing the synthetic event
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Start the trace of a synthetic event field-by-field rather than all
+ * at once.
+ *
+ * This function 'opens' an event trace, which means space is reserved
+ * for the event in the trace buffer, after which the event's
+ * individual field values can be set through either
+ * synth_event_add_next_val() or synth_event_add_val().
+ *
+ * A pointer to a trace_state object is passed in, which will keep
+ * track of the current event trace state until the event trace is
+ * closed (and the event finally traced) using
+ * synth_event_trace_end().
+ *
+ * Note that synth_event_trace_end() must be called after all values
+ * have been added for each event trace, regardless of whether adding
+ * all field values succeeded or not.
+ *
+ * Note also that for a given event trace, all fields must be added
+ * using either synth_event_add_next_val() or synth_event_add_val()
+ * but not both together or interleaved.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace_start(struct trace_event_file *file,
+ struct synth_event_trace_state *trace_state)
+{
+ int ret;
+
+ if (!trace_state)
+ return -EINVAL;
+
+ ret = __synth_event_trace_start(file, trace_state);
+ if (ret == -ENOENT)
+ ret = 0; /* just disabled, not really an error */
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace_start);
+
+static int __synth_event_add_val(const char *field_name, u64 val,
+ struct synth_event_trace_state *trace_state)
+{
+ struct synth_field *field = NULL;
+ struct synth_trace_event *entry;
+ struct synth_event *event;
+ int i, ret = 0;
+
+ if (!trace_state) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* can't mix add_next_synth_val() with add_synth_val() */
+ if (field_name) {
+ if (trace_state->add_next) {
+ ret = -EINVAL;
+ goto out;
+ }
+ trace_state->add_name = true;
+ } else {
+ if (trace_state->add_name) {
+ ret = -EINVAL;
+ goto out;
+ }
+ trace_state->add_next = true;
+ }
+
+ if (trace_state->disabled)
+ goto out;
+
+ event = trace_state->event;
+ if (trace_state->add_name) {
+ for (i = 0; i < event->n_fields; i++) {
+ field = event->fields[i];
+ if (strcmp(field->name, field_name) == 0)
+ break;
+ }
+ if (!field) {
+ ret = -EINVAL;
+ goto out;
+ }
+ } else {
+ if (trace_state->cur_field >= event->n_fields) {
+ ret = -EINVAL;
+ goto out;
+ }
+ field = event->fields[trace_state->cur_field++];
+ }
+
+ entry = trace_state->entry;
+ if (field->is_string) {
+ char *str_val = (char *)(long)val;
+ char *str_field;
+
+ if (!str_val) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ str_field = (char *)&entry->fields[field->offset];
+ strscpy(str_field, str_val, STR_VAR_LEN_MAX);
+ } else {
+ switch (field->size) {
+ case 1:
+ *(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
+ break;
+
+ case 2:
+ *(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
+ break;
+
+ case 4:
+ *(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
+ break;
+
+ default:
+ trace_state->entry->fields[field->offset] = val;
+ break;
+ }
+ }
+ out:
+ return ret;
+}
+
+/**
+ * synth_event_add_next_val - Add the next field's value to an open synth trace
+ * @val: The value to set the next field to
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Set the value of the next field in an event that's been opened by
+ * synth_event_trace_start().
+ *
+ * The val param should be the value cast to u64. If the value points
+ * to a string, the val param should be a char * cast to u64.
+ *
+ * This function assumes all the fields in an event are to be set one
+ * after another - successive calls to this function are made, one for
+ * each field, in the order of the fields in the event, until all
+ * fields have been set. If you'd rather set each field individually
+ * without regard to ordering, synth_event_add_val() can be used
+ * instead.
+ *
+ * Note however that synth_event_add_next_val() and
+ * synth_event_add_val() can't be intermixed for a given event trace -
+ * one or the other but not both can be used at the same time.
+ *
+ * Note also that synth_event_trace_end() must be called after all
+ * values have been added for each event trace, regardless of whether
+ * adding all field values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_add_next_val(u64 val,
+ struct synth_event_trace_state *trace_state)
+{
+ return __synth_event_add_val(NULL, val, trace_state);
+}
+EXPORT_SYMBOL_GPL(synth_event_add_next_val);
+
+/**
+ * synth_event_add_val - Add a named field's value to an open synth trace
+ * @field_name: The name of the synthetic event field value to set
+ * @val: The value to set the next field to
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * Set the value of the named field in an event that's been opened by
+ * synth_event_trace_start().
+ *
+ * The val param should be the value cast to u64. If the value points
+ * to a string, the val param should be a char * cast to u64.
+ *
+ * This function looks up the field name, and if found, sets the field
+ * to the specified value. This lookup makes this function more
+ * expensive than synth_event_add_next_val(), so use that or the
+ * none-piecewise synth_event_trace() instead if efficiency is more
+ * important.
+ *
+ * Note however that synth_event_add_next_val() and
+ * synth_event_add_val() can't be intermixed for a given event trace -
+ * one or the other but not both can be used at the same time.
+ *
+ * Note also that synth_event_trace_end() must be called after all
+ * values have been added for each event trace, regardless of whether
+ * adding all field values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_add_val(const char *field_name, u64 val,
+ struct synth_event_trace_state *trace_state)
+{
+ return __synth_event_add_val(field_name, val, trace_state);
+}
+EXPORT_SYMBOL_GPL(synth_event_add_val);
+
+/**
+ * synth_event_trace_end - End piecewise synthetic event trace
+ * @trace_state: A pointer to object tracking the piecewise trace state
+ *
+ * End the trace of a synthetic event opened by
+ * synth_event_trace__start().
+ *
+ * This function 'closes' an event trace, which basically means that
+ * it commits the reserved event and cleans up other loose ends.
+ *
+ * A pointer to a trace_state object is passed in, which will keep
+ * track of the current event trace state opened with
+ * synth_event_trace_start().
+ *
+ * Note that this function must be called after all values have been
+ * added for each event trace, regardless of whether adding all field
+ * values succeeded or not.
+ *
+ * Return: 0 on success, err otherwise.
+ */
+int synth_event_trace_end(struct synth_event_trace_state *trace_state)
+{
+ if (!trace_state)
+ return -EINVAL;
+
+ __synth_event_trace_end(trace_state);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(synth_event_trace_end);
+
+static int create_synth_event(int argc, const char **argv)
+{
+ const char *name = argv[0];
+ int len;
+
+ if (name[0] != 's' || name[1] != ':')
+ return -ECANCELED;
+ name += 2;
+
+ /* This interface accepts group name prefix */
+ if (strchr(name, '/')) {
+ len = str_has_prefix(name, SYNTH_SYSTEM "/");
+ if (len == 0)
+ return -EINVAL;
+ name += len;
+ }
+ return __create_synth_event(argc - 1, name, argv + 1);
+}
+
+static int synth_event_release(struct dyn_event *ev)
+{
+ struct synth_event *event = to_synth_event(ev);
+ int ret;
+
+ if (event->ref)
+ return -EBUSY;
+
+ ret = unregister_synth_event(event);
+ if (ret)
+ return ret;
+
+ dyn_event_remove(ev);
+ free_synth_event(event);
+ return 0;
+}
+
+static int __synth_event_show(struct seq_file *m, struct synth_event *event)
+{
+ struct synth_field *field;
+ unsigned int i;
+
+ seq_printf(m, "%s\t", event->name);
+
+ for (i = 0; i < event->n_fields; i++) {
+ field = event->fields[i];
+
+ /* parameter values */
+ seq_printf(m, "%s %s%s", field->type, field->name,
+ i == event->n_fields - 1 ? "" : "; ");
+ }
+
+ seq_putc(m, '\n');
+
+ return 0;
+}
+
+static int synth_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct synth_event *event = to_synth_event(ev);
+
+ seq_printf(m, "s:%s/", event->class.system);
+
+ return __synth_event_show(m, event);
+}
+
+static int synth_events_seq_show(struct seq_file *m, void *v)
+{
+ struct dyn_event *ev = v;
+
+ if (!is_synth_event(ev))
+ return 0;
+
+ return __synth_event_show(m, to_synth_event(ev));
+}
+
+static const struct seq_operations synth_events_seq_op = {
+ .start = dyn_event_seq_start,
+ .next = dyn_event_seq_next,
+ .stop = dyn_event_seq_stop,
+ .show = synth_events_seq_show,
+};
+
+static int synth_events_open(struct inode *inode, struct file *file)
+{
+ int ret;
+
+ ret = security_locked_down(LOCKDOWN_TRACEFS);
+ if (ret)
+ return ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = dyn_events_release_all(&synth_event_ops);
+ if (ret < 0)
+ return ret;
+ }
+
+ return seq_open(file, &synth_events_seq_op);
+}
+
+static ssize_t synth_events_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ return trace_parse_run_command(file, buffer, count, ppos,
+ create_or_delete_synth_event);
+}
+
+static const struct file_operations synth_events_fops = {
+ .open = synth_events_open,
+ .write = synth_events_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static __init int trace_events_synth_init(void)
+{
+ struct dentry *entry = NULL;
+ struct dentry *d_tracer;
+ int err = 0;
+
+ err = dyn_event_register(&synth_event_ops);
+ if (err) {
+ pr_warn("Could not register synth_event_ops\n");
+ return err;
+ }
+
+ d_tracer = tracing_init_dentry();
+ if (IS_ERR(d_tracer)) {
+ err = PTR_ERR(d_tracer);
+ goto err;
+ }
+
+ entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
+ NULL, &synth_events_fops);
+ if (!entry) {
+ err = -ENODEV;
+ goto err;
+ }
+
+ return err;
+ err:
+ pr_warn("Could not create tracefs 'synthetic_events' entry\n");
+
+ return err;
+}
+
+fs_initcall(trace_events_synth_init);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 3a74736da363..f725802160c0 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -216,11 +216,17 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
int trigger_process_regex(struct trace_event_file *file, char *buff)
{
- char *command, *next = buff;
+ char *command, *next;
struct event_command *p;
int ret = -EINVAL;
+ next = buff = skip_spaces(buff);
command = strsep(&next, ": \t");
+ if (next) {
+ next = skip_spaces(next);
+ if (!*next)
+ next = NULL;
+ }
command = (command[0] != '!') ? command : command + 1;
mutex_lock(&trigger_cmd_mutex);
@@ -630,8 +636,14 @@ event_trigger_callback(struct event_command *cmd_ops,
int ret;
/* separate the trigger from the filter (t:n [if filter]) */
- if (param && isdigit(param[0]))
+ if (param && isdigit(param[0])) {
trigger = strsep(&param, " \t");
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
+ }
trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
@@ -1368,6 +1380,11 @@ int event_enable_trigger_func(struct event_command *cmd_ops,
trigger = strsep(&param, " \t");
if (!trigger)
return -EINVAL;
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
system = strsep(&trigger, ":");
if (!trigger)
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 77ce5a3b6773..70d3d0a09053 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -45,6 +45,9 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef __field_desc
#define __field_desc(type, container, item) type item;
+#undef __field_packed
+#define __field_packed(type, container, item) type item;
+
#undef __array
#define __array(type, item, size) type item[size];
@@ -85,6 +88,13 @@ static void __always_unused ____ftrace_check_##name(void) \
.size = sizeof(_type), .align = __alignof__(_type), \
is_signed_type(_type), .filter_type = _filter_type },
+
+#undef __field_ext_packed
+#define __field_ext_packed(_type, _item, _filter_type) { \
+ .type = #_type, .name = #_item, \
+ .size = sizeof(_type), .align = 1, \
+ is_signed_type(_type), .filter_type = _filter_type },
+
#undef __field
#define __field(_type, _item) __field_ext(_type, _item, FILTER_OTHER)
@@ -94,6 +104,9 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __field_desc
#define __field_desc(_type, _container, _item) __field_ext(_type, _item, FILTER_OTHER)
+#undef __field_packed
+#define __field_packed(_type, _container, _item) __field_ext_packed(_type, _item, FILTER_OTHER)
+
#undef __array
#define __array(_type, _item, _len) { \
.type = #_type"["__stringify(_len)"]", .name = #_item, \
@@ -129,6 +142,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \
#undef __field_desc
#define __field_desc(type, container, item)
+#undef __field_packed
+#define __field_packed(type, container, item)
+
#undef __array
#define __array(type, item, len)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8a4c8d5c2c98..dd4dff71d89a 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -42,7 +42,7 @@ static int allocate_ftrace_ops(struct trace_array *tr)
if (!ops)
return -ENOMEM;
- /* Currently only the non stack verision is supported */
+ /* Currently only the non stack version is supported */
ops->func = function_trace_call;
ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 35989383ae11..aefb6065b508 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1202,35 +1202,41 @@ static const struct file_operations kprobe_profile_ops = {
/* Return the length of string -- including null terminal byte */
static nokprobe_inline int
+fetch_store_strlen_user(unsigned long addr)
+{
+ const void __user *uaddr = (__force const void __user *)addr;
+
+ return strnlen_user_nofault(uaddr, MAX_STRING_SIZE);
+}
+
+/* Return the length of string -- including null terminal byte */
+static nokprobe_inline int
fetch_store_strlen(unsigned long addr)
{
int ret, len = 0;
u8 c;
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if (addr < TASK_SIZE)
+ return fetch_store_strlen_user(addr);
+#endif
+
do {
- ret = probe_kernel_read(&c, (u8 *)addr + len, 1);
+ ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1);
len++;
} while (c && ret == 0 && len < MAX_STRING_SIZE);
return (ret < 0) ? ret : len;
}
-/* Return the length of string -- including null terminal byte */
-static nokprobe_inline int
-fetch_store_strlen_user(unsigned long addr)
-{
- const void __user *uaddr = (__force const void __user *)addr;
-
- return strnlen_unsafe_user(uaddr, MAX_STRING_SIZE);
-}
-
/*
- * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
- * length and relative data location.
+ * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
+ * with max length and relative data location.
*/
static nokprobe_inline int
-fetch_store_string(unsigned long addr, void *dest, void *base)
+fetch_store_string_user(unsigned long addr, void *dest, void *base)
{
+ const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
@@ -1240,11 +1246,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
__dest = get_loc_data(dest, base);
- /*
- * Try to get string again, since the string can be changed while
- * probing.
- */
- ret = strncpy_from_unsafe(__dest, (void *)addr, maxlen);
+ ret = strncpy_from_user_nofault(__dest, uaddr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
@@ -1252,23 +1254,31 @@ fetch_store_string(unsigned long addr, void *dest, void *base)
}
/*
- * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf
- * with max length and relative data location.
+ * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max
+ * length and relative data location.
*/
static nokprobe_inline int
-fetch_store_string_user(unsigned long addr, void *dest, void *base)
+fetch_store_string(unsigned long addr, void *dest, void *base)
{
- const void __user *uaddr = (__force const void __user *)addr;
int maxlen = get_loc_len(*(u32 *)dest);
void *__dest;
long ret;
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)addr < TASK_SIZE)
+ return fetch_store_string_user(addr, dest, base);
+#endif
+
if (unlikely(!maxlen))
return -ENOMEM;
__dest = get_loc_data(dest, base);
- ret = strncpy_from_unsafe_user(__dest, uaddr, maxlen);
+ /*
+ * Try to get string again, since the string can be changed while
+ * probing.
+ */
+ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen);
if (ret >= 0)
*(u32 *)dest = make_data_loc(ret, __dest - base);
@@ -1276,17 +1286,21 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base)
}
static nokprobe_inline int
-probe_mem_read(void *dest, void *src, size_t size)
+probe_mem_read_user(void *dest, void *src, size_t size)
{
- return probe_kernel_read(dest, src, size);
+ const void __user *uaddr = (__force const void __user *)src;
+
+ return copy_from_user_nofault(dest, uaddr, size);
}
static nokprobe_inline int
-probe_mem_read_user(void *dest, void *src, size_t size)
+probe_mem_read(void *dest, void *src, size_t size)
{
- const void __user *uaddr = (__force const void __user *)src;
-
- return probe_user_read(dest, uaddr, size);
+#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
+ if ((unsigned long)src < TASK_SIZE)
+ return probe_mem_read_user(dest, src, size);
+#endif
+ return copy_from_kernel_nofault(dest, src, size);
}
/* Note that we don't verify it, since the code does not come from user space */
@@ -1629,7 +1643,7 @@ int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
if (perf_type_tracepoint)
tk = find_trace_kprobe(pevent, group);
else
- tk = event->tp_event->data;
+ tk = trace_kprobe_primary_from_call(event->tp_event);
if (!tk)
return -EINVAL;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 9a121e147102..73976de7f8cc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -393,7 +393,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
if (mm) {
const struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, ip);
if (vma) {
file = vma->vm_file;
@@ -405,7 +405,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
trace_seq_printf(s, "[+0x%lx]",
ip - vmstart);
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
trace_seq_printf(s, " <" IP_FMT ">", ip);
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index fb0691b8a88d..f10073e62603 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -58,7 +58,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on);
* and lockdep uses a staged approach which splits the lockdep hardirq
* tracking into a RCU on and a RCU off section.
*/
-void trace_hardirqs_off_prepare(void)
+void trace_hardirqs_off_finish(void)
{
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
@@ -68,19 +68,19 @@ void trace_hardirqs_off_prepare(void)
}
}
-EXPORT_SYMBOL(trace_hardirqs_off_prepare);
-NOKPROBE_SYMBOL(trace_hardirqs_off_prepare);
+EXPORT_SYMBOL(trace_hardirqs_off_finish);
+NOKPROBE_SYMBOL(trace_hardirqs_off_finish);
void trace_hardirqs_off(void)
{
+ lockdep_hardirqs_off(CALLER_ADDR0);
+
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1);
if (!in_nmi())
trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
}
-
- lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off);
NOKPROBE_SYMBOL(trace_hardirqs_off);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index ab8b6436d53f..d2867ccc6aca 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -639,8 +639,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size,
ret = -EINVAL;
goto fail;
}
- if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM) ||
- parg->count) {
+ if ((code->op == FETCH_OP_IMM || code->op == FETCH_OP_COMM ||
+ code->op == FETCH_OP_DATA) || parg->count) {
/*
* IMM, DATA and COMM is pointing actual address, those
* must be kept, and if parg->count != 0, this is an
@@ -1006,7 +1006,7 @@ int trace_probe_init(struct trace_probe *tp, const char *event,
INIT_LIST_HEAD(&tp->event->class.fields);
INIT_LIST_HEAD(&tp->event->probes);
INIT_LIST_HEAD(&tp->list);
- list_add(&tp->event->probes, &tp->list);
+ list_add(&tp->list, &tp->event->probes);
call = trace_probe_event_call(tp);
call->class = &tp->event->class;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index a0ff9e200ef6..a22b62813f8c 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -236,7 +236,7 @@ struct trace_probe_event {
struct trace_event_call call;
struct list_head files;
struct list_head probes;
- struct trace_uprobe_filter filter[0];
+ struct trace_uprobe_filter filter[];
};
struct trace_probe {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c557f42a9397..98bba4764c52 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -515,9 +515,8 @@ static const struct file_operations stack_trace_filter_fops = {
#endif /* CONFIG_DYNAMIC_FTRACE */
int
-stack_trace_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+stack_trace_sysctl(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int was_enabled;
int ret;
diff --git a/kernel/trace/trace_synth.h b/kernel/trace/trace_synth.h
new file mode 100644
index 000000000000..ac35c45207c4
--- /dev/null
+++ b/kernel/trace/trace_synth.h
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __TRACE_SYNTH_H
+#define __TRACE_SYNTH_H
+
+#include "trace_dynevent.h"
+
+#define SYNTH_SYSTEM "synthetic"
+#define SYNTH_FIELDS_MAX 32
+
+#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
+
+struct synth_field {
+ char *type;
+ char *name;
+ size_t size;
+ unsigned int offset;
+ bool is_signed;
+ bool is_string;
+};
+
+struct synth_event {
+ struct dyn_event devent;
+ int ref;
+ char *name;
+ struct synth_field **fields;
+ unsigned int n_fields;
+ unsigned int n_u64;
+ struct trace_event_class class;
+ struct trace_event_call call;
+ struct tracepoint *tp;
+ struct module *mod;
+};
+
+extern struct synth_event *find_synth_event(const char *name);
+
+#endif /* __TRACE_SYNTH_H */
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2a8e8e9c1c75..fdd47f99b18f 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1412,7 +1412,7 @@ int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
if (perf_type_tracepoint)
tu = find_probe_event(pevent, group);
else
- tu = event->tp_event->data;
+ tu = trace_uprobe_primary_from_call(event->tp_event);
if (!tu)
return -EINVAL;
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
new file mode 100644
index 000000000000..f74020f6bd9d
--- /dev/null
+++ b/kernel/watch_queue.c
@@ -0,0 +1,655 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Watch queue and general notification mechanism, built on pipes
+ *
+ * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * See Documentation/watch_queue.rst
+ */
+
+#define pr_fmt(fmt) "watchq: " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/printk.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/file.h>
+#include <linux/security.h>
+#include <linux/cred.h>
+#include <linux/sched/signal.h>
+#include <linux/watch_queue.h>
+#include <linux/pipe_fs_i.h>
+
+MODULE_DESCRIPTION("Watch queue");
+MODULE_AUTHOR("Red Hat, Inc.");
+MODULE_LICENSE("GPL");
+
+#define WATCH_QUEUE_NOTE_SIZE 128
+#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
+
+static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ struct watch_queue *wqueue = (struct watch_queue *)buf->private;
+ struct page *page;
+ unsigned int bit;
+
+ /* We need to work out which note within the page this refers to, but
+ * the note might have been maximum size, so merely ANDing the offset
+ * off doesn't work. OTOH, the note must've been more than zero size.
+ */
+ bit = buf->offset + buf->len;
+ if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0)
+ bit -= WATCH_QUEUE_NOTE_SIZE;
+ bit /= WATCH_QUEUE_NOTE_SIZE;
+
+ page = buf->page;
+ bit += page->index;
+
+ set_bit(bit, wqueue->notes_bitmap);
+}
+
+// No try_steal function => no stealing
+#define watch_queue_pipe_buf_try_steal NULL
+
+/* New data written to a pipe may be appended to a buffer with this type. */
+static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
+ .release = watch_queue_pipe_buf_release,
+ .try_steal = watch_queue_pipe_buf_try_steal,
+ .get = generic_pipe_buf_get,
+};
+
+/*
+ * Post a notification to a watch queue.
+ */
+static bool post_one_notification(struct watch_queue *wqueue,
+ struct watch_notification *n)
+{
+ void *p;
+ struct pipe_inode_info *pipe = wqueue->pipe;
+ struct pipe_buffer *buf;
+ struct page *page;
+ unsigned int head, tail, mask, note, offset, len;
+ bool done = false;
+
+ if (!pipe)
+ return false;
+
+ spin_lock_irq(&pipe->rd_wait.lock);
+
+ if (wqueue->defunct)
+ goto out;
+
+ mask = pipe->ring_size - 1;
+ head = pipe->head;
+ tail = pipe->tail;
+ if (pipe_full(head, tail, pipe->ring_size))
+ goto lost;
+
+ note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
+ if (note >= wqueue->nr_notes)
+ goto lost;
+
+ page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE];
+ offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
+ get_page(page);
+ len = n->info & WATCH_INFO_LENGTH;
+ p = kmap_atomic(page);
+ memcpy(p + offset, n, len);
+ kunmap_atomic(p);
+
+ buf = &pipe->bufs[head & mask];
+ buf->page = page;
+ buf->private = (unsigned long)wqueue;
+ buf->ops = &watch_queue_pipe_buf_ops;
+ buf->offset = offset;
+ buf->len = len;
+ buf->flags = PIPE_BUF_FLAG_WHOLE;
+ pipe->head = head + 1;
+
+ if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ BUG();
+ }
+ wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
+ done = true;
+
+out:
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ if (done)
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ return done;
+
+lost:
+ buf = &pipe->bufs[(head - 1) & mask];
+ buf->flags |= PIPE_BUF_FLAG_LOSS;
+ goto out;
+}
+
+/*
+ * Apply filter rules to a notification.
+ */
+static bool filter_watch_notification(const struct watch_filter *wf,
+ const struct watch_notification *n)
+{
+ const struct watch_type_filter *wt;
+ unsigned int st_bits = sizeof(wt->subtype_filter[0]) * 8;
+ unsigned int st_index = n->subtype / st_bits;
+ unsigned int st_bit = 1U << (n->subtype % st_bits);
+ int i;
+
+ if (!test_bit(n->type, wf->type_filter))
+ return false;
+
+ for (i = 0; i < wf->nr_filters; i++) {
+ wt = &wf->filters[i];
+ if (n->type == wt->type &&
+ (wt->subtype_filter[st_index] & st_bit) &&
+ (n->info & wt->info_mask) == wt->info_filter)
+ return true;
+ }
+
+ return false; /* If there is a filter, the default is to reject. */
+}
+
+/**
+ * __post_watch_notification - Post an event notification
+ * @wlist: The watch list to post the event to.
+ * @n: The notification record to post.
+ * @cred: The creds of the process that triggered the notification.
+ * @id: The ID to match on the watch.
+ *
+ * Post a notification of an event into a set of watch queues and let the users
+ * know.
+ *
+ * The size of the notification should be set in n->info & WATCH_INFO_LENGTH and
+ * should be in units of sizeof(*n).
+ */
+void __post_watch_notification(struct watch_list *wlist,
+ struct watch_notification *n,
+ const struct cred *cred,
+ u64 id)
+{
+ const struct watch_filter *wf;
+ struct watch_queue *wqueue;
+ struct watch *watch;
+
+ if (((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT) == 0) {
+ WARN_ON(1);
+ return;
+ }
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(watch, &wlist->watchers, list_node) {
+ if (watch->id != id)
+ continue;
+ n->info &= ~WATCH_INFO_ID;
+ n->info |= watch->info_id;
+
+ wqueue = rcu_dereference(watch->queue);
+ wf = rcu_dereference(wqueue->filter);
+ if (wf && !filter_watch_notification(wf, n))
+ continue;
+
+ if (security_post_notification(watch->cred, cred, n) < 0)
+ continue;
+
+ post_one_notification(wqueue, n);
+ }
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(__post_watch_notification);
+
+/*
+ * Allocate sufficient pages to preallocation for the requested number of
+ * notifications.
+ */
+long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
+{
+ struct watch_queue *wqueue = pipe->watch_queue;
+ struct page **pages;
+ unsigned long *bitmap;
+ unsigned long user_bufs;
+ unsigned int bmsize;
+ int ret, i, nr_pages;
+
+ if (!wqueue)
+ return -ENODEV;
+ if (wqueue->notes)
+ return -EBUSY;
+
+ if (nr_notes < 1 ||
+ nr_notes > 512) /* TODO: choose a better hard limit */
+ return -EINVAL;
+
+ nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1);
+ nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE;
+ user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages);
+
+ if (nr_pages > pipe->max_usage &&
+ (too_many_pipe_buffers_hard(user_bufs) ||
+ too_many_pipe_buffers_soft(user_bufs)) &&
+ pipe_is_unprivileged_user()) {
+ ret = -EPERM;
+ goto error;
+ }
+
+ ret = pipe_resize_ring(pipe, nr_notes);
+ if (ret < 0)
+ goto error;
+
+ pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL);
+ if (!pages)
+ goto error;
+
+ for (i = 0; i < nr_pages; i++) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i])
+ goto error_p;
+ pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
+ }
+
+ bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG;
+ bmsize *= sizeof(unsigned long);
+ bitmap = kmalloc(bmsize, GFP_KERNEL);
+ if (!bitmap)
+ goto error_p;
+
+ memset(bitmap, 0xff, bmsize);
+ wqueue->notes = pages;
+ wqueue->notes_bitmap = bitmap;
+ wqueue->nr_pages = nr_pages;
+ wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ return 0;
+
+error_p:
+ for (i = 0; i < nr_pages; i++)
+ __free_page(pages[i]);
+ kfree(pages);
+error:
+ (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted);
+ return ret;
+}
+
+/*
+ * Set the filter on a watch queue.
+ */
+long watch_queue_set_filter(struct pipe_inode_info *pipe,
+ struct watch_notification_filter __user *_filter)
+{
+ struct watch_notification_type_filter *tf;
+ struct watch_notification_filter filter;
+ struct watch_type_filter *q;
+ struct watch_filter *wfilter;
+ struct watch_queue *wqueue = pipe->watch_queue;
+ int ret, nr_filter = 0, i;
+
+ if (!wqueue)
+ return -ENODEV;
+
+ if (!_filter) {
+ /* Remove the old filter */
+ wfilter = NULL;
+ goto set;
+ }
+
+ /* Grab the user's filter specification */
+ if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
+ return -EFAULT;
+ if (filter.nr_filters == 0 ||
+ filter.nr_filters > 16 ||
+ filter.__reserved != 0)
+ return -EINVAL;
+
+ tf = memdup_user(_filter->filters, filter.nr_filters * sizeof(*tf));
+ if (IS_ERR(tf))
+ return PTR_ERR(tf);
+
+ ret = -EINVAL;
+ for (i = 0; i < filter.nr_filters; i++) {
+ if ((tf[i].info_filter & ~tf[i].info_mask) ||
+ tf[i].info_mask & WATCH_INFO_LENGTH)
+ goto err_filter;
+ /* Ignore any unknown types */
+ if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
+ continue;
+ nr_filter++;
+ }
+
+ /* Now we need to build the internal filter from only the relevant
+ * user-specified filters.
+ */
+ ret = -ENOMEM;
+ wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);
+ if (!wfilter)
+ goto err_filter;
+ wfilter->nr_filters = nr_filter;
+
+ q = wfilter->filters;
+ for (i = 0; i < filter.nr_filters; i++) {
+ if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
+ continue;
+
+ q->type = tf[i].type;
+ q->info_filter = tf[i].info_filter;
+ q->info_mask = tf[i].info_mask;
+ q->subtype_filter[0] = tf[i].subtype_filter[0];
+ __set_bit(q->type, wfilter->type_filter);
+ q++;
+ }
+
+ kfree(tf);
+set:
+ pipe_lock(pipe);
+ wfilter = rcu_replace_pointer(wqueue->filter, wfilter,
+ lockdep_is_held(&pipe->mutex));
+ pipe_unlock(pipe);
+ if (wfilter)
+ kfree_rcu(wfilter, rcu);
+ return 0;
+
+err_filter:
+ kfree(tf);
+ return ret;
+}
+
+static void __put_watch_queue(struct kref *kref)
+{
+ struct watch_queue *wqueue =
+ container_of(kref, struct watch_queue, usage);
+ struct watch_filter *wfilter;
+ int i;
+
+ for (i = 0; i < wqueue->nr_pages; i++)
+ __free_page(wqueue->notes[i]);
+
+ wfilter = rcu_access_pointer(wqueue->filter);
+ if (wfilter)
+ kfree_rcu(wfilter, rcu);
+ kfree_rcu(wqueue, rcu);
+}
+
+/**
+ * put_watch_queue - Dispose of a ref on a watchqueue.
+ * @wqueue: The watch queue to unref.
+ */
+void put_watch_queue(struct watch_queue *wqueue)
+{
+ kref_put(&wqueue->usage, __put_watch_queue);
+}
+EXPORT_SYMBOL(put_watch_queue);
+
+static void free_watch(struct rcu_head *rcu)
+{
+ struct watch *watch = container_of(rcu, struct watch, rcu);
+
+ put_watch_queue(rcu_access_pointer(watch->queue));
+ put_cred(watch->cred);
+}
+
+static void __put_watch(struct kref *kref)
+{
+ struct watch *watch = container_of(kref, struct watch, usage);
+
+ call_rcu(&watch->rcu, free_watch);
+}
+
+/*
+ * Discard a watch.
+ */
+static void put_watch(struct watch *watch)
+{
+ kref_put(&watch->usage, __put_watch);
+}
+
+/**
+ * init_watch_queue - Initialise a watch
+ * @watch: The watch to initialise.
+ * @wqueue: The queue to assign.
+ *
+ * Initialise a watch and set the watch queue.
+ */
+void init_watch(struct watch *watch, struct watch_queue *wqueue)
+{
+ kref_init(&watch->usage);
+ INIT_HLIST_NODE(&watch->list_node);
+ INIT_HLIST_NODE(&watch->queue_node);
+ rcu_assign_pointer(watch->queue, wqueue);
+}
+
+/**
+ * add_watch_to_object - Add a watch on an object to a watch list
+ * @watch: The watch to add
+ * @wlist: The watch list to add to
+ *
+ * @watch->queue must have been set to point to the queue to post notifications
+ * to and the watch list of the object to be watched. @watch->cred must also
+ * have been set to the appropriate credentials and a ref taken on them.
+ *
+ * The caller must pin the queue and the list both and must hold the list
+ * locked against racing watch additions/removals.
+ */
+int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
+{
+ struct watch_queue *wqueue = rcu_access_pointer(watch->queue);
+ struct watch *w;
+
+ hlist_for_each_entry(w, &wlist->watchers, list_node) {
+ struct watch_queue *wq = rcu_access_pointer(w->queue);
+ if (wqueue == wq && watch->id == w->id)
+ return -EBUSY;
+ }
+
+ watch->cred = get_current_cred();
+ rcu_assign_pointer(watch->watch_list, wlist);
+
+ spin_lock_bh(&wqueue->lock);
+ kref_get(&wqueue->usage);
+ kref_get(&watch->usage);
+ hlist_add_head(&watch->queue_node, &wqueue->watches);
+ spin_unlock_bh(&wqueue->lock);
+
+ hlist_add_head(&watch->list_node, &wlist->watchers);
+ return 0;
+}
+EXPORT_SYMBOL(add_watch_to_object);
+
+/**
+ * remove_watch_from_object - Remove a watch or all watches from an object.
+ * @wlist: The watch list to remove from
+ * @wq: The watch queue of interest (ignored if @all is true)
+ * @id: The ID of the watch to remove (ignored if @all is true)
+ * @all: True to remove all objects
+ *
+ * Remove a specific watch or all watches from an object. A notification is
+ * sent to the watcher to tell them that this happened.
+ */
+int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq,
+ u64 id, bool all)
+{
+ struct watch_notification_removal n;
+ struct watch_queue *wqueue;
+ struct watch *watch;
+ int ret = -EBADSLT;
+
+ rcu_read_lock();
+
+again:
+ spin_lock(&wlist->lock);
+ hlist_for_each_entry(watch, &wlist->watchers, list_node) {
+ if (all ||
+ (watch->id == id && rcu_access_pointer(watch->queue) == wq))
+ goto found;
+ }
+ spin_unlock(&wlist->lock);
+ goto out;
+
+found:
+ ret = 0;
+ hlist_del_init_rcu(&watch->list_node);
+ rcu_assign_pointer(watch->watch_list, NULL);
+ spin_unlock(&wlist->lock);
+
+ /* We now own the reference on watch that used to belong to wlist. */
+
+ n.watch.type = WATCH_TYPE_META;
+ n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION;
+ n.watch.info = watch->info_id | watch_sizeof(n.watch);
+ n.id = id;
+ if (id != 0)
+ n.watch.info = watch->info_id | watch_sizeof(n);
+
+ wqueue = rcu_dereference(watch->queue);
+
+ /* We don't need the watch list lock for the next bit as RCU is
+ * protecting *wqueue from deallocation.
+ */
+ if (wqueue) {
+ post_one_notification(wqueue, &n.watch);
+
+ spin_lock_bh(&wqueue->lock);
+
+ if (!hlist_unhashed(&watch->queue_node)) {
+ hlist_del_init_rcu(&watch->queue_node);
+ put_watch(watch);
+ }
+
+ spin_unlock_bh(&wqueue->lock);
+ }
+
+ if (wlist->release_watch) {
+ void (*release_watch)(struct watch *);
+
+ release_watch = wlist->release_watch;
+ rcu_read_unlock();
+ (*release_watch)(watch);
+ rcu_read_lock();
+ }
+ put_watch(watch);
+
+ if (all && !hlist_empty(&wlist->watchers))
+ goto again;
+out:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(remove_watch_from_object);
+
+/*
+ * Remove all the watches that are contributory to a queue. This has the
+ * potential to race with removal of the watches by the destruction of the
+ * objects being watched or with the distribution of notifications.
+ */
+void watch_queue_clear(struct watch_queue *wqueue)
+{
+ struct watch_list *wlist;
+ struct watch *watch;
+ bool release;
+
+ rcu_read_lock();
+ spin_lock_bh(&wqueue->lock);
+
+ /* Prevent new additions and prevent notifications from happening */
+ wqueue->defunct = true;
+
+ while (!hlist_empty(&wqueue->watches)) {
+ watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
+ hlist_del_init_rcu(&watch->queue_node);
+ /* We now own a ref on the watch. */
+ spin_unlock_bh(&wqueue->lock);
+
+ /* We can't do the next bit under the queue lock as we need to
+ * get the list lock - which would cause a deadlock if someone
+ * was removing from the opposite direction at the same time or
+ * posting a notification.
+ */
+ wlist = rcu_dereference(watch->watch_list);
+ if (wlist) {
+ void (*release_watch)(struct watch *);
+
+ spin_lock(&wlist->lock);
+
+ release = !hlist_unhashed(&watch->list_node);
+ if (release) {
+ hlist_del_init_rcu(&watch->list_node);
+ rcu_assign_pointer(watch->watch_list, NULL);
+
+ /* We now own a second ref on the watch. */
+ }
+
+ release_watch = wlist->release_watch;
+ spin_unlock(&wlist->lock);
+
+ if (release) {
+ if (release_watch) {
+ rcu_read_unlock();
+ /* This might need to call dput(), so
+ * we have to drop all the locks.
+ */
+ (*release_watch)(watch);
+ rcu_read_lock();
+ }
+ put_watch(watch);
+ }
+ }
+
+ put_watch(watch);
+ spin_lock_bh(&wqueue->lock);
+ }
+
+ spin_unlock_bh(&wqueue->lock);
+ rcu_read_unlock();
+}
+
+/**
+ * get_watch_queue - Get a watch queue from its file descriptor.
+ * @fd: The fd to query.
+ */
+struct watch_queue *get_watch_queue(int fd)
+{
+ struct pipe_inode_info *pipe;
+ struct watch_queue *wqueue = ERR_PTR(-EINVAL);
+ struct fd f;
+
+ f = fdget(fd);
+ if (f.file) {
+ pipe = get_pipe_info(f.file, false);
+ if (pipe && pipe->watch_queue) {
+ wqueue = pipe->watch_queue;
+ kref_get(&wqueue->usage);
+ }
+ fdput(f);
+ }
+
+ return wqueue;
+}
+EXPORT_SYMBOL(get_watch_queue);
+
+/*
+ * Initialise a watch queue
+ */
+int watch_queue_init(struct pipe_inode_info *pipe)
+{
+ struct watch_queue *wqueue;
+
+ wqueue = kzalloc(sizeof(*wqueue), GFP_KERNEL);
+ if (!wqueue)
+ return -ENOMEM;
+
+ wqueue->pipe = pipe;
+ kref_init(&wqueue->usage);
+ spin_lock_init(&wqueue->lock);
+ INIT_HLIST_HEAD(&wqueue->watches);
+
+ pipe->watch_queue = wqueue;
+ return 0;
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9fbe1e237563..c41c3c17b86a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4638,11 +4638,11 @@ void print_worker_info(const char *log_lvl, struct task_struct *task)
* Carefully copy the associated workqueue's workfn, name and desc.
* Keep the original last '\0' in case the original is garbage.
*/
- probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
- probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
- probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
- probe_kernel_read(name, wq->name, sizeof(name) - 1);
- probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
+ copy_from_kernel_nofault(&fn, &worker->current_func, sizeof(fn));
+ copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
+ copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
+ copy_from_kernel_nofault(name, wq->name, sizeof(name) - 1);
+ copy_from_kernel_nofault(desc, worker->desc, sizeof(desc) - 1);
if (fn || name[0] || desc[0]) {
printk("%sWorkqueue: %s %ps", log_lvl, name, fn);