Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/bpf/core.c | 50
-rw-r--r--  kernel/bpf/syscall.c | 19
-rw-r--r--  kernel/bpf/verifier.c | 2
-rw-r--r--  kernel/cfi.c | 15
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 14
-rw-r--r--  kernel/cgroup/cpuset.c | 2
-rw-r--r--  kernel/cgroup/rstat.c | 195
-rw-r--r--  kernel/entry/common.c | 3
-rw-r--r--  kernel/events/uprobes.c | 5
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/fork.c | 9
-rw-r--r--  kernel/irq/chip.c | 8
-rw-r--r--  kernel/kexec_handover.c | 4
-rw-r--r--  kernel/kprobes.c | 8
-rw-r--r--  kernel/module/main.c | 2
-rw-r--r--  kernel/padata.c | 154
-rw-r--r--  kernel/panic.c | 18
-rw-r--r--  kernel/power/hibernate.c | 26
-rw-r--r--  kernel/printk/internal.h | 1
-rw-r--r--  kernel/rcu/tree_stall.h | 7
-rw-r--r--  kernel/sched/core.c | 42
-rw-r--r--  kernel/sched/ext.c | 250
-rw-r--r--  kernel/sched/ext.h | 20
-rw-r--r--  kernel/sched/ext_idle.c | 45
-rw-r--r--  kernel/sched/ext_idle.h | 12
-rw-r--r--  kernel/sched/sched.h | 9
-rw-r--r--  kernel/sched/wait.c | 22
-rw-r--r--  kernel/sys.c | 50
-rw-r--r--  kernel/time/clocksource.c | 5
-rw-r--r--  kernel/trace/Kconfig | 53
-rw-r--r--  kernel/trace/Makefile | 2
-rw-r--r--  kernel/trace/fgraph.c | 16
-rw-r--r--  kernel/trace/fprobe.c | 9
-rw-r--r--  kernel/trace/ftrace.c | 4
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/preemptirq_delay_test.c | 13
-rw-r--r--  kernel/trace/ring_buffer.c | 168
-rw-r--r--  kernel/trace/rv/Kconfig | 43
-rw-r--r--  kernel/trace/rv/Makefile | 9
-rw-r--r--  kernel/trace/rv/monitors/nrp/Kconfig (renamed from kernel/trace/rv/monitors/tss/Kconfig) | 12
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp.c | 138
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp.h | 75
-rw-r--r--  kernel/trace/rv/monitors/nrp/nrp_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/opid/Kconfig | 19
-rw-r--r--  kernel/trace/rv/monitors/opid/opid.c | 168
-rw-r--r--  kernel/trace/rv/monitors/opid/opid.h | 104
-rw-r--r--  kernel/trace/rv/monitors/opid/opid_trace.h (renamed from kernel/trace/rv/monitors/sncid/sncid_trace.h) | 8
-rw-r--r--  kernel/trace/rv/monitors/pagefault/Kconfig | 20
-rw-r--r--  kernel/trace/rv/monitors/pagefault/pagefault.c | 88
-rw-r--r--  kernel/trace/rv/monitors/pagefault/pagefault.h | 64
-rw-r--r--  kernel/trace/rv/monitors/pagefault/pagefault_trace.h | 14
-rw-r--r--  kernel/trace/rv/monitors/rtapp/Kconfig | 11
-rw-r--r--  kernel/trace/rv/monitors/rtapp/rtapp.c | 33
-rw-r--r--  kernel/trace/rv/monitors/rtapp/rtapp.h | 3
-rw-r--r--  kernel/trace/rv/monitors/sched/Kconfig | 1
-rw-r--r--  kernel/trace/rv/monitors/sched/sched.c | 3
-rw-r--r--  kernel/trace/rv/monitors/sco/sco.c | 7
-rw-r--r--  kernel/trace/rv/monitors/scpd/Kconfig | 2
-rw-r--r--  kernel/trace/rv/monitors/scpd/scpd.c | 7
-rw-r--r--  kernel/trace/rv/monitors/sleep/Kconfig | 22
-rw-r--r--  kernel/trace/rv/monitors/sleep/sleep.c | 237
-rw-r--r--  kernel/trace/rv/monitors/sleep/sleep.h | 257
-rw-r--r--  kernel/trace/rv/monitors/sleep/sleep_trace.h | 14
-rw-r--r--  kernel/trace/rv/monitors/sncid/sncid.c | 96
-rw-r--r--  kernel/trace/rv/monitors/sncid/sncid.h | 49
-rw-r--r--  kernel/trace/rv/monitors/snep/Kconfig | 2
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.c | 7
-rw-r--r--  kernel/trace/rv/monitors/snep/snep.h | 14
-rw-r--r--  kernel/trace/rv/monitors/snroc/snroc.c | 3
-rw-r--r--  kernel/trace/rv/monitors/sssw/Kconfig (renamed from kernel/trace/rv/monitors/sncid/Kconfig) | 10
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw.c | 116
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw.h | 105
-rw-r--r--  kernel/trace/rv/monitors/sssw/sssw_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/sts/Kconfig | 19
-rw-r--r--  kernel/trace/rv/monitors/sts/sts.c | 156
-rw-r--r--  kernel/trace/rv/monitors/sts/sts.h | 117
-rw-r--r--  kernel/trace/rv/monitors/sts/sts_trace.h (renamed from kernel/trace/rv/monitors/tss/tss_trace.h) | 8
-rw-r--r--  kernel/trace/rv/monitors/tss/tss.c | 91
-rw-r--r--  kernel/trace/rv/monitors/tss/tss.h | 47
-rw-r--r--  kernel/trace/rv/monitors/wip/Kconfig | 2
-rw-r--r--  kernel/trace/rv/monitors/wip/wip.c | 3
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr.c | 3
-rw-r--r--  kernel/trace/rv/reactor_panic.c | 8
-rw-r--r--  kernel/trace/rv/reactor_printk.c | 8
-rw-r--r--  kernel/trace/rv/rv.c | 226
-rw-r--r--  kernel/trace/rv/rv.h | 39
-rw-r--r--  kernel/trace/rv/rv_reactors.c | 138
-rw-r--r--  kernel/trace/rv/rv_trace.h | 166
-rw-r--r--  kernel/trace/trace.c | 63
-rw-r--r--  kernel/trace/trace.h | 4
-rw-r--r--  kernel/trace/trace_eprobe.c | 53
-rw-r--r--  kernel/trace/trace_events.c | 154
-rw-r--r--  kernel/trace/trace_events_filter.c | 32
-rw-r--r--  kernel/trace/trace_fprobe.c | 614
-rw-r--r--  kernel/trace/trace_functions_graph.c | 2
-rw-r--r--  kernel/trace/trace_hwlat.c | 5
-rw-r--r--  kernel/trace/trace_kdb.c | 8
-rw-r--r--  kernel/trace/trace_kprobe.c | 57
-rw-r--r--  kernel/trace/trace_probe.c | 150
-rw-r--r--  kernel/trace/trace_probe.h | 26
-rw-r--r--  kernel/trace/trace_uprobe.c | 53
-rw-r--r--  kernel/unwind/Makefile | 1
-rw-r--r--  kernel/unwind/deferred.c | 362
-rw-r--r--  kernel/unwind/user.c | 128
-rw-r--r--  kernel/vhost_task.c | 2
-rw-r--r--  kernel/watchdog_buddy.c | 5
-rw-r--r--  kernel/workqueue.c | 74
108 files changed, 4173 insertions, 1708 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a89602a2a0ab..366987d9914a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,6 +54,7 @@ obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
obj-y += entry/
+obj-y += unwind/
obj-$(CONFIG_MODULES) += module/
obj-$(CONFIG_KCMP) += kcmp.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 09dde5b00d0c..5d1650af899d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2377,28 +2377,44 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
const struct bpf_prog *fp)
{
enum bpf_prog_type prog_type = resolve_prog_type(fp);
- bool ret;
struct bpf_prog_aux *aux = fp->aux;
+ enum bpf_cgroup_storage_type i;
+ bool ret = false;
+ u64 cookie;
if (fp->kprobe_override)
- return false;
+ return ret;
- spin_lock(&map->owner.lock);
- if (!map->owner.type) {
- /* There's no owner yet where we could check for
- * compatibility.
- */
- map->owner.type = prog_type;
- map->owner.jited = fp->jited;
- map->owner.xdp_has_frags = aux->xdp_has_frags;
- map->owner.attach_func_proto = aux->attach_func_proto;
+ spin_lock(&map->owner_lock);
+ /* There's no owner yet where we could check for compatibility. */
+ if (!map->owner) {
+ map->owner = bpf_map_owner_alloc(map);
+ if (!map->owner)
+ goto err;
+ map->owner->type = prog_type;
+ map->owner->jited = fp->jited;
+ map->owner->xdp_has_frags = aux->xdp_has_frags;
+ map->owner->attach_func_proto = aux->attach_func_proto;
+ for_each_cgroup_storage_type(i) {
+ map->owner->storage_cookie[i] =
+ aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ }
ret = true;
} else {
- ret = map->owner.type == prog_type &&
- map->owner.jited == fp->jited &&
- map->owner.xdp_has_frags == aux->xdp_has_frags;
+ ret = map->owner->type == prog_type &&
+ map->owner->jited == fp->jited &&
+ map->owner->xdp_has_frags == aux->xdp_has_frags;
+ for_each_cgroup_storage_type(i) {
+ if (!ret)
+ break;
+ cookie = aux->cgroup_storage[i] ?
+ aux->cgroup_storage[i]->cookie : 0;
+ ret = map->owner->storage_cookie[i] == cookie ||
+ !cookie;
+ }
if (ret &&
- map->owner.attach_func_proto != aux->attach_func_proto) {
+ map->owner->attach_func_proto != aux->attach_func_proto) {
switch (prog_type) {
case BPF_PROG_TYPE_TRACING:
case BPF_PROG_TYPE_LSM:
@@ -2411,8 +2427,8 @@ static bool __bpf_prog_map_compatible(struct bpf_map *map,
}
}
}
- spin_unlock(&map->owner.lock);
-
+err:
+ spin_unlock(&map->owner_lock);
return ret;
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index e63039817af3..0fbfa8532c39 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -37,6 +37,7 @@
#include <linux/trace_events.h>
#include <linux/tracepoint.h>
#include <linux/overflow.h>
+#include <linux/cookie.h>
#include <net/netfilter/nf_bpf_link.h>
#include <net/netkit.h>
@@ -53,6 +54,7 @@
#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY)
DEFINE_PER_CPU(int, bpf_prog_active);
+DEFINE_COOKIE(bpf_map_cookie);
static DEFINE_IDR(prog_idr);
static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
@@ -885,6 +887,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
security_bpf_map_free(map);
bpf_map_release_memcg(map);
+ bpf_map_owner_free(map);
bpf_map_free(map);
}
@@ -979,12 +982,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
- if (map_type_contains_progs(map)) {
- spin_lock(&map->owner.lock);
- type = map->owner.type;
- jited = map->owner.jited;
- spin_unlock(&map->owner.lock);
+ spin_lock(&map->owner_lock);
+ if (map->owner) {
+ type = map->owner->type;
+ jited = map->owner->jited;
}
+ spin_unlock(&map->owner_lock);
seq_printf(m,
"map_type:\t%u\n"
@@ -1487,10 +1490,14 @@ static int map_create(union bpf_attr *attr, bool kernel)
if (err < 0)
goto free_map;
+ preempt_disable();
+ map->cookie = gen_cookie_next(&bpf_map_cookie);
+ preempt_enable();
+
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
- spin_lock_init(&map->owner.lock);
+ spin_lock_init(&map->owner_lock);
if (attr->btf_key_type_id || attr->btf_value_type_id ||
/* Even the map's value is a kernel's struct,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 399f03e62508..0806295945e4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21445,7 +21445,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
&target_size);
if (cnt == 0 || cnt >= INSN_BUF_SIZE ||
(ctx_field_size && !target_size)) {
- verifier_bug(env, "error during ctx access conversion");
+ verifier_bug(env, "error during ctx access conversion (%d)", cnt);
return -EFAULT;
}
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 422fa4f958ae..4dad04ead06c 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -5,6 +5,8 @@
* Copyright (C) 2022 Google LLC
*/
+#include <linux/bpf.h>
+#include <linux/cfi_types.h>
#include <linux/cfi.h>
bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE);
@@ -27,6 +29,19 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
return BUG_TRAP_TYPE_BUG;
}
+/*
+ * Declare two non-existent functions with types that match bpf_func_t and
+ * bpf_callback_t pointers, and use DEFINE_CFI_TYPE to define type hash
+ * variables for each function type. The cfi_bpf_* variables are used by
+ * arch-specific BPF JIT implementations to ensure indirectly callable JIT
+ * code has matching CFI type hashes.
+ */
+extern typeof(*(bpf_func_t)0) __bpf_prog_runX;
+DEFINE_CFI_TYPE(cfi_bpf_hash, __bpf_prog_runX);
+
+extern typeof(*(bpf_callback_t)0) __bpf_callback_fn;
+DEFINE_CFI_TYPE(cfi_bpf_subprog_hash, __bpf_callback_fn);
+
#ifdef CONFIG_ARCH_USES_CFI_TRAPS
static inline unsigned long trap_address(s32 *p)
{
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index fa24c032ed6f..2a4a387f867a 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -32,6 +32,9 @@ static u16 cgroup_no_v1_mask;
/* disable named v1 mounts */
static bool cgroup_no_v1_named;
+/* Show unavailable controllers in /proc/cgroups */
+static bool proc_show_all;
+
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
@@ -683,10 +686,11 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
*/
for_each_subsys(ss, i) {
- if (cgroup1_subsys_absent(ss))
- continue;
cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+ if (!proc_show_all && cgroup1_subsys_absent(ss))
+ continue;
+
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
@@ -1359,3 +1363,9 @@ static int __init cgroup_no_v1(char *str)
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
+
+static int __init cgroup_v1_proc(char *str)
+{
+ return (kstrtobool(str, &proc_show_all) == 0);
+}
+__setup("cgroup_v1_proc=", cgroup_v1_proc);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 3bc4301466f3..f74d04429a29 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4051,7 +4051,7 @@ void __init cpuset_init_smp(void)
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
- hotplug_memory_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
+ hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index cbeaa499a96a..981e2f77ad4e 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -10,7 +10,7 @@
#include <trace/events/cgroup.h>
static DEFINE_SPINLOCK(rstat_base_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
@@ -45,84 +45,11 @@ static spinlock_t *ss_rstat_lock(struct cgroup_subsys *ss)
return &rstat_base_lock;
}
-static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
{
- if (ss) {
- /*
- * Depending on config, the subsystem per-cpu lock type may be an
- * empty struct. In enviromnents where this is the case, allocation
- * of this field is not performed in ss_rstat_init(). Avoid a
- * cpu-based offset relative to NULL by returning early. When the
- * lock type is zero in size, the corresponding lock functions are
- * no-ops so passing them NULL is acceptable.
- */
- if (sizeof(*ss->rstat_ss_cpu_lock) == 0)
- return NULL;
-
- return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
- }
-
- return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
-}
-
-/*
- * Helper functions for rstat per CPU locks.
- *
- * This makes it easier to diagnose locking issues and contention in
- * production environments. The parameter @fast_path determine the
- * tracepoints being added, allowing us to diagnose "flush" related
- * operations without handling high-frequency fast-path "update" events.
- */
-static __always_inline
-unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
- const bool fast_path)
-{
- struct cgroup *cgrp = css->cgroup;
- raw_spinlock_t *cpu_lock;
- unsigned long flags;
- bool contended;
-
- /*
- * The _irqsave() is needed because the locks used for flushing are
- * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
- * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
- * kernel. The raw_spinlock_t below disables interrupts on both
- * configurations. The _irqsave() ensures that interrupts are always
- * disabled and later restored.
- */
- cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
- contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
- if (contended) {
- if (fast_path)
- trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
- else
- trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
-
- raw_spin_lock_irqsave(cpu_lock, flags);
- }
-
- if (fast_path)
- trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
- else
- trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
-
- return flags;
-}
-
-static __always_inline
-void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
- unsigned long flags, const bool fast_path)
-{
- struct cgroup *cgrp = css->cgroup;
- raw_spinlock_t *cpu_lock;
-
- if (fast_path)
- trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
- else
- trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
-
- cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
- raw_spin_unlock_irqrestore(cpu_lock, flags);
+ if (ss)
+ return per_cpu_ptr(ss->lhead, cpu);
+ return per_cpu_ptr(&rstat_backlog_list, cpu);
}
/**
@@ -130,13 +57,22 @@ void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
* @css: target cgroup subsystem state
* @cpu: cpu on which rstat_cpu was updated
*
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either adds itself to
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
*/
__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
{
- unsigned long flags;
+ struct llist_head *lhead;
+ struct css_rstat_cpu *rstatc;
+ struct css_rstat_cpu __percpu *rstatc_pcpu;
+ struct llist_node *self;
/*
* Since bpf programs can call this function, prevent access to
@@ -145,19 +81,49 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
if (!css_uses_rstat(css))
return;
+ lockdep_assert_preemption_disabled();
+
+ /*
+ * For archs without NMI-safe cmpxchg or percpu ops support, ignore
+ * the requests from nmi context.
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
+ return;
+
+ rstatc = css_rstat_cpu(css, cpu);
/*
- * Speculative already-on-list test. This may race leading to
- * temporary inaccuracies, which is fine.
+ * If already on the list, return. This check is racy and smp_mb() is needed
+ * to pair it with the smp_mb() in css_process_update_tree() if the
+ * guarantee that the updated stats are visible to a concurrent flusher is
+ * needed.
+ */
+ if (llist_on_list(&rstatc->lnode))
+ return;
+
+ /*
+ * This function can be re-entered by irqs and nmis for the same cgroup
+ * and may try to insert the same per-cpu lnode into the llist. Note
+ * that llist_add() does not protect against such scenarios.
*
- * Because @parent's updated_children is terminated with @parent
- * instead of NULL, we can tell whether @css is on the list by
- * testing the next pointer for NULL.
+ * To protect against such stacked contexts of irqs/nmis, we use the
+ * fact that lnode points to itself when not on a list and then use
+ * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+ * which will call llist_add(). The losers can assume the insertion is
+ * successful and the winner will eventually add the per-cpu lnode to
+ * the llist.
*/
- if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+ self = &rstatc->lnode;
+ rstatc_pcpu = css->rstat_cpu;
+ if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
return;
- flags = _css_rstat_cpu_lock(css, cpu, true);
+ lhead = ss_lhead_cpu(css->ss, cpu);
+ llist_add(&rstatc->lnode, lhead);
+}
+static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
+{
/* put @css and all ancestors on the corresponding updated lists */
while (true) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
@@ -183,8 +149,34 @@ __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
css = parent;
}
+}
+
+static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
+{
+ struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+ struct llist_node *lnode;
+
+ while ((lnode = llist_del_first_init(lhead))) {
+ struct css_rstat_cpu *rstatc;
+
+ /*
+ * smp_mb() is needed here (more specifically in between
+ * init_llist_node() and per-cpu stats flushing) if the
+ * guarantee is required by a rstat user where either the
+ * updater should add itself to the lockless list or the
+ * flusher should flush the stats updated by an updater that has
+ * observed it is already on the list. The
+ * corresponding barrier pair for this one should be before
+ * css_rstat_updated() by the user.
+ *
+ * For now, there is no such user, so the barrier is not added
+ * here, but if such a use case arises, please add
+ * smp_mb() here.
+ */
- _css_rstat_cpu_unlock(css, cpu, flags, true);
+ rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+ __css_process_update_tree(rstatc->owner, cpu);
+ }
}
/**
@@ -288,13 +280,12 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
{
struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
struct cgroup_subsys_state *head = NULL, *parent, *child;
- unsigned long flags;
- flags = _css_rstat_cpu_lock(root, cpu, false);
+ css_process_update_tree(root->ss, cpu);
/* Return NULL if this subtree is not on-list */
if (!rstatc->updated_next)
- goto unlock_ret;
+ return NULL;
/*
* Unlink @root from its parent. As the updated_children list is
@@ -326,8 +317,7 @@ static struct cgroup_subsys_state *css_rstat_updated_list(
rstatc->updated_children = root;
if (child != root)
head = css_rstat_push_children(head, child, cpu);
-unlock_ret:
- _css_rstat_cpu_unlock(root, cpu, flags, false);
+
return head;
}
@@ -468,7 +458,8 @@ int css_rstat_init(struct cgroup_subsys_state *css)
for_each_possible_cpu(cpu) {
struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
- rstatc->updated_children = css;
+ rstatc->owner = rstatc->updated_children = css;
+ init_llist_node(&rstatc->lnode);
if (is_self) {
struct cgroup_rstat_base_cpu *rstatbc;
@@ -522,19 +513,15 @@ int __init ss_rstat_init(struct cgroup_subsys *ss)
{
int cpu;
- /*
- * Depending on config, the subsystem per-cpu lock type may be an empty
- * struct. Avoid allocating a size of zero in this case.
- */
- if (ss && sizeof(*ss->rstat_ss_cpu_lock)) {
- ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
- if (!ss->rstat_ss_cpu_lock)
+ if (ss) {
+ ss->lhead = alloc_percpu(struct llist_head);
+ if (!ss->lhead)
return -ENOMEM;
}
spin_lock_init(ss_rstat_lock(ss));
for_each_possible_cpu(cpu)
- raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+ init_llist_head(ss_lhead_cpu(ss, cpu));
return 0;
}
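[Illustration, not part of the patch: a minimal sketch of the ordering contract described
in the css_rstat_updated() kernel-doc above. The per-cpu update helper is hypothetical;
only css_rstat_updated() and the barrier placement are taken from the comments in the
hunks above.]

	int cpu;

	preempt_disable();			/* css_rstat_updated() asserts preemption is off */
	cpu = smp_processor_id();
	update_my_percpu_stats(css, cpu);	/* hypothetical: the subsystem's per-cpu stat update */
	smp_mb();				/* order the update before the on-list check; pairs with a
						 * barrier that would sit between init_llist_node() and
						 * the flush in css_process_update_tree() */
	css_rstat_updated(css, cpu);
	preempt_enable();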
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index b82032777310..408d28b5179d 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -109,7 +109,8 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* TINY_RCU does not support EQS, so let the compiler eliminate
* this part when enabled.
*/
- if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+ if (!IS_ENABLED(CONFIG_TINY_RCU) &&
+ (is_idle_task(current) || arch_in_rcu_eqs())) {
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 4c965ba77f9f..f774367c8e71 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -436,8 +436,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma,
* there are no unexpected folio references ...
*/
if (is_register || userfaultfd_missing(vma) ||
- (folio_ref_count(folio) != folio_mapcount(folio) + 1 +
- folio_test_swapcache(folio) * folio_nr_pages(folio)))
+ (folio_ref_count(folio) != folio_expected_ref_count(folio) + 1))
goto remap;
/*
@@ -540,7 +539,7 @@ retry:
}
ret = 0;
- if (unlikely(!folio_test_anon(folio))) {
+ if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) {
VM_WARN_ON_ONCE(is_register);
folio_put(folio);
goto out;
diff --git a/kernel/exit.c b/kernel/exit.c
index bb184a67ac73..1d8c8ac33c4f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,6 +68,7 @@
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
+#include <linux/unwind_deferred.h>
#include <linux/uaccess.h>
#include <linux/pidfs.h>
@@ -938,6 +939,7 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
+ unwind_deferred_task_exit(tsk);
trace_sched_process_exit(tsk, group_dead);
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index aef41211c72c..e45354cc7cac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
#include <linux/tick.h>
+#include <linux/unwind_deferred.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -732,6 +733,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ unwind_task_free(tsk);
sched_ext_free(tsk);
io_uring_free(tsk);
cgroup_free(tsk);
@@ -1890,10 +1892,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
#ifdef CONFIG_RV
static void rv_task_fork(struct task_struct *p)
{
- int i;
-
- for (i = 0; i < RV_PER_TASK_MONITORS; i++)
- p->rv[i].da_mon.monitoring = false;
+ memset(&p->rv, 0, sizeof(p->rv));
}
#else
#define rv_task_fork(p) do {} while (0)
@@ -2138,6 +2137,8 @@ __latent_entropy struct task_struct *copy_process(
p->bpf_ctx = NULL;
#endif
+ unwind_task_init(p);
+
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
if (retval)
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 624106e886ad..0d0276378c70 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -611,7 +611,13 @@ void handle_simple_irq(struct irq_desc *desc)
{
guard(raw_spinlock)(&desc->lock);
- if (!irq_can_handle(desc))
+ if (!irq_can_handle_pm(desc)) {
+ if (irqd_needs_resend_when_in_progress(&desc->irq_data))
+ desc->istate |= IRQS_PENDING;
+ return;
+ }
+
+ if (!irq_can_handle_actions(desc))
return;
kstat_incr_irqs_this_cpu(desc);
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index d3b13a909913..e49743ae52c5 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -1100,8 +1100,8 @@ static void __init kho_release_scratch(void)
ulong pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages)
- set_pageblock_migratetype(pfn_to_page(pfn),
- MIGRATE_CMA);
+ init_pageblock_migratetype(pfn_to_page(pfn),
+ MIGRATE_CMA, false);
}
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ffe0c3d52306..ab8f9fc1f0d1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -135,8 +135,12 @@ struct kprobe_insn_cache kprobe_insn_slots = {
static int collect_garbage_slots(struct kprobe_insn_cache *c);
/**
- * __get_insn_slot() - Find a slot on an executable page for an instruction.
- * We allocate an executable page if there's no room on existing ones.
+ * __get_insn_slot - Find a slot on an executable page for an instruction.
+ * @c: Pointer to kprobe instruction cache
+ *
+ * Description: Locates an available slot on existing executable pages,
+ * allocates an executable page if there's no room on existing ones.
+ * Return: Pointer to instruction slot on success, NULL on failure.
*/
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
diff --git a/kernel/module/main.c b/kernel/module/main.c
index cdcc50a5353d..81f9df8859dc 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -2673,7 +2673,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->trace_bprintk_fmt_start),
&mod->num_trace_bprintk_fmt);
#endif
-#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+#ifdef CONFIG_DYNAMIC_FTRACE
/* sechdrs[0].sh_size is always zero */
mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
sizeof(*mod->ftrace_callsites),
diff --git a/kernel/padata.c b/kernel/padata.c
index 7eee94166357..f85f8bd788d0 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -63,17 +63,6 @@ static inline void padata_put_pd(struct parallel_data *pd)
padata_put_pd_cnt(pd, 1);
}
-static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
-{
- int cpu, target_cpu;
-
- target_cpu = cpumask_first(pd->cpumask.pcpu);
- for (cpu = 0; cpu < cpu_index; cpu++)
- target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
-
- return target_cpu;
-}
-
static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
{
/*
@@ -82,7 +71,7 @@ static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
*/
int cpu_index = seq_nr % cpumask_weight(pd->cpumask.pcpu);
- return padata_index_to_cpu(pd, cpu_index);
+ return cpumask_nth(cpu_index, pd->cpumask.pcpu);
}
static struct padata_work *padata_work_alloc(void)
@@ -192,9 +181,9 @@ int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu)
{
struct padata_instance *pinst = ps->pinst;
- int i, cpu, cpu_index, err;
struct parallel_data *pd;
struct padata_work *pw;
+ int cpu_index, err;
rcu_read_lock_bh();
@@ -210,12 +199,7 @@ int padata_do_parallel(struct padata_shell *ps,
/* Select an alternate fallback CPU and notify the caller. */
cpu_index = *cb_cpu % cpumask_weight(pd->cpumask.cbcpu);
-
- cpu = cpumask_first(pd->cpumask.cbcpu);
- for (i = 0; i < cpu_index; i++)
- cpu = cpumask_next(cpu, pd->cpumask.cbcpu);
-
- *cb_cpu = cpu;
+ *cb_cpu = cpumask_nth(cpu_index, pd->cpumask.cbcpu);
}
err = -EBUSY;
@@ -261,20 +245,17 @@ EXPORT_SYMBOL(padata_do_parallel);
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
*/
-static struct padata_priv *padata_find_next(struct parallel_data *pd,
- bool remove_object)
+static struct padata_priv *padata_find_next(struct parallel_data *pd, int cpu,
+ unsigned int processed)
{
struct padata_priv *padata;
struct padata_list *reorder;
- int cpu = pd->cpu;
reorder = per_cpu_ptr(pd->reorder_list, cpu);
spin_lock(&reorder->lock);
- if (list_empty(&reorder->list)) {
- spin_unlock(&reorder->lock);
- return NULL;
- }
+ if (list_empty(&reorder->list))
+ goto notfound;
padata = list_entry(reorder->list.next, struct padata_priv, list);
@@ -282,97 +263,52 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
* Checks the rare case where two or more parallel jobs have hashed to
* the same CPU and one of the later ones finishes first.
*/
- if (padata->seq_nr != pd->processed) {
- spin_unlock(&reorder->lock);
- return NULL;
- }
-
- if (remove_object) {
- list_del_init(&padata->list);
- ++pd->processed;
- pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
- }
+ if (padata->seq_nr != processed)
+ goto notfound;
+ list_del_init(&padata->list);
spin_unlock(&reorder->lock);
return padata;
+
+notfound:
+ pd->processed = processed;
+ pd->cpu = cpu;
+ spin_unlock(&reorder->lock);
+ return NULL;
}
-static void padata_reorder(struct parallel_data *pd)
+static void padata_reorder(struct padata_priv *padata)
{
+ struct parallel_data *pd = padata->pd;
struct padata_instance *pinst = pd->ps->pinst;
- int cb_cpu;
- struct padata_priv *padata;
- struct padata_serial_queue *squeue;
- struct padata_list *reorder;
+ unsigned int processed;
+ int cpu;
- /*
- * We need to ensure that only one cpu can work on dequeueing of
- * the reorder queue the time. Calculating in which percpu reorder
- * queue the next object will arrive takes some time. A spinlock
- * would be highly contended. Also it is not clear in which order
- * the objects arrive to the reorder queues. So a cpu could wait to
- * get the lock just to notice that there is nothing to do at the
- * moment. Therefore we use a trylock and let the holder of the lock
- * care for all the objects enqueued during the holdtime of the lock.
- */
- if (!spin_trylock_bh(&pd->lock))
- return;
+ processed = pd->processed;
+ cpu = pd->cpu;
- while (1) {
- padata = padata_find_next(pd, true);
+ do {
+ struct padata_serial_queue *squeue;
+ int cb_cpu;
- /*
- * If the next object that needs serialization is parallel
- * processed by another cpu and is still on it's way to the
- * cpu's reorder queue, nothing to do for now.
- */
- if (!padata)
- break;
+ cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
+ processed++;
cb_cpu = padata->cb_cpu;
squeue = per_cpu_ptr(pd->squeue, cb_cpu);
spin_lock(&squeue->serial.lock);
list_add_tail(&padata->list, &squeue->serial.list);
- spin_unlock(&squeue->serial.lock);
-
queue_work_on(cb_cpu, pinst->serial_wq, &squeue->work);
- }
-
- spin_unlock_bh(&pd->lock);
-
- /*
- * The next object that needs serialization might have arrived to
- * the reorder queues in the meantime.
- *
- * Ensure reorder queue is read after pd->lock is dropped so we see
- * new objects from another task in padata_do_serial. Pairs with
- * smp_mb in padata_do_serial.
- */
- smp_mb();
- reorder = per_cpu_ptr(pd->reorder_list, pd->cpu);
- if (!list_empty(&reorder->list) && padata_find_next(pd, false)) {
/*
- * Other context(eg. the padata_serial_worker) can finish the request.
- * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish.
+ * If the next object that needs serialization is parallel
+ * processed by another cpu and is still on its way to the
+ * cpu's reorder queue, end the loop.
*/
- padata_get_pd(pd);
- if (!queue_work(pinst->serial_wq, &pd->reorder_work))
- padata_put_pd(pd);
- }
-}
-
-static void invoke_padata_reorder(struct work_struct *work)
-{
- struct parallel_data *pd;
-
- local_bh_disable();
- pd = container_of(work, struct parallel_data, reorder_work);
- padata_reorder(pd);
- local_bh_enable();
- /* Pairs with putting the reorder_work in the serial_wq */
- padata_put_pd(pd);
+ padata = padata_find_next(pd, cpu, processed);
+ spin_unlock(&squeue->serial.lock);
+ } while (padata);
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -423,6 +359,7 @@ void padata_do_serial(struct padata_priv *padata)
struct padata_list *reorder = per_cpu_ptr(pd->reorder_list, hashed_cpu);
struct padata_priv *cur;
struct list_head *pos;
+ bool gotit = true;
spin_lock(&reorder->lock);
/* Sort in ascending order of sequence number. */
@@ -432,17 +369,14 @@ void padata_do_serial(struct padata_priv *padata)
if ((signed int)(cur->seq_nr - padata->seq_nr) < 0)
break;
}
- list_add(&padata->list, pos);
+ if (padata->seq_nr != pd->processed) {
+ gotit = false;
+ list_add(&padata->list, pos);
+ }
spin_unlock(&reorder->lock);
- /*
- * Ensure the addition to the reorder list is ordered correctly
- * with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
- * in padata_reorder.
- */
- smp_mb();
-
- padata_reorder(pd);
+ if (gotit)
+ padata_reorder(padata);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -632,9 +566,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
padata_init_squeues(pd);
pd->seq_nr = -1;
refcount_set(&pd->refcnt, 1);
- spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
- INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
@@ -1144,12 +1076,6 @@ void padata_free_shell(struct padata_shell *ps)
if (!ps)
return;
- /*
- * Wait for all _do_serial calls to finish to avoid touching
- * freed pd's and ps's.
- */
- synchronize_rcu();
-
mutex_lock(&ps->pinst->lock);
list_del(&ps->list);
pd = rcu_dereference_protected(ps->pd, 1);
diff --git a/kernel/panic.c b/kernel/panic.c
index 64e58835086d..43817111c979 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -367,15 +367,15 @@ static void panic_other_cpus_shutdown(bool crash_kexec)
}
/**
- * panic - halt the system
+ * vpanic - halt the system
* @fmt: The text string to print
+ * @args: Arguments for the format string
*
* Display a message, then perform cleanups. This function never returns.
*/
-void panic(const char *fmt, ...)
+void vpanic(const char *fmt, va_list args)
{
static char buf[1024];
- va_list args;
long i, i_next = 0, len;
int state = 0;
int old_cpu, this_cpu;
@@ -426,9 +426,7 @@ void panic(const char *fmt, ...)
console_verbose();
bust_spinlocks(1);
- va_start(args, fmt);
len = vscnprintf(buf, sizeof(buf), fmt, args);
- va_end(args);
if (len && buf[len - 1] == '\n')
buf[len - 1] = '\0';
@@ -565,7 +563,17 @@ void panic(const char *fmt, ...)
mdelay(PANIC_TIMER_STEP);
}
}
+EXPORT_SYMBOL(vpanic);
+/* Identical to vpanic(), except it takes variadic arguments instead of va_list */
+void panic(const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vpanic(fmt, args);
+ va_end(args);
+}
EXPORT_SYMBOL(panic);
#define TAINT_FLAG(taint, _c_true, _c_false, _module) \
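[Illustration, not part of the patch: with vpanic() exported as above, a caller that
already holds a va_list can forward it directly instead of re-formatting; the wrapper
below is hypothetical, only vpanic()'s signature comes from the hunk above.]

	static void __noreturn subsys_panic(const char *fmt, ...)
	{
		va_list args;

		va_start(args, fmt);
		vpanic(fmt, args);	/* never returns, so va_end() is unreachable */
	}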
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 9216e3b91d3b..1f1f30cca573 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -381,6 +381,23 @@ static int create_image(int platform_mode)
return error;
}
+static void shrink_shmem_memory(void)
+{
+ struct sysinfo info;
+ unsigned long nr_shmem_pages, nr_freed_pages;
+
+ si_meminfo(&info);
+ nr_shmem_pages = info.sharedram; /* current page count used for shmem */
+ /*
+ * The intent is to reclaim all shmem pages. Though shrink_all_memory() can
+ * only reclaim about half of them, it's enough for creating the hibernation
+ * image.
+ */
+ nr_freed_pages = shrink_all_memory(nr_shmem_pages);
+ pr_debug("requested to reclaim %lu shmem pages, actually freed %lu pages\n",
+ nr_shmem_pages, nr_freed_pages);
+}
+
/**
* hibernation_snapshot - Quiesce devices and create a hibernation image.
* @platform_mode: If set, use platform driver to prepare for the transition.
@@ -422,6 +439,15 @@ int hibernation_snapshot(int platform_mode)
goto Thaw;
}
+ /*
+ * Device drivers may move lots of data to shmem in dpm_prepare(). The shmem
+ * pages will use lots of system memory, causing hibernation image creation
+ * to fail due to insufficient free memory.
+ * This call is to force flush the shmem pages to swap disk and reclaim
+ * the system memory so that image creation can succeed.
+ */
+ shrink_shmem_memory();
+
console_suspend_all();
error = dpm_suspend(PMSG_FREEZE);
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 48a24e7b309d..bbed41ad29cf 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -72,7 +72,6 @@ int vprintk_store(int facility, int level,
const char *fmt, va_list args);
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
-__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
void __printk_safe_enter(void);
void __printk_safe_exit(void);
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2a5a5329322d..d16afeb11506 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,6 +163,13 @@ static void panic_on_rcu_stall(void)
{
static int cpu_stall;
+ /*
+ * Attempt to kick out the BPF scheduler if it's installed and defer
+ * the panic to give the system a chance to recover.
+ */
+ if (scx_rcu_cpu_stall())
+ return;
+
if (++cpu_stall < sysctl_max_rcu_stall_to_panic)
return;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3ec00d08d46a..be00629f0ba4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1113,6 +1113,7 @@ static void __resched_curr(struct rq *rq, int tif)
cpu = cpu_of(rq);
+ trace_sched_set_need_resched_tp(curr, cpu, tif);
if (cpu == smp_processor_id()) {
set_ti_thread_flag(cti, tif);
if (tif == TIF_NEED_RESCHED)
@@ -1128,6 +1129,11 @@ static void __resched_curr(struct rq *rq, int tif)
}
}
+void __trace_set_need_resched(struct task_struct *curr, int tif)
+{
+ trace_sched_set_need_resched_tp(curr, smp_processor_id(), tif);
+}
+
void resched_curr(struct rq *rq)
{
__resched_curr(rq, TIF_NEED_RESCHED);
@@ -5279,7 +5285,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
* switched the context for the first time. It is returning from
* schedule for the first time in this path.
*/
- trace_sched_exit_tp(true, CALLER_ADDR0);
+ trace_sched_exit_tp(true);
preempt_enable();
if (current->set_child_tid)
@@ -6822,7 +6828,8 @@ static void __sched notrace __schedule(int sched_mode)
struct rq *rq;
int cpu;
- trace_sched_entry_tp(preempt, CALLER_ADDR0);
+ /* Trace preemptions consistently with task switches */
+ trace_sched_entry_tp(sched_mode == SM_PREEMPT);
cpu = smp_processor_id();
rq = cpu_rq(cpu);
@@ -6961,7 +6968,7 @@ keep_resched:
__balance_callbacks(rq);
raw_spin_rq_unlock_irq(rq);
}
- trace_sched_exit_tp(is_switch, CALLER_ADDR0);
+ trace_sched_exit_tp(is_switch);
}
void __noreturn do_task_dead(void)
@@ -9808,7 +9815,9 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
return 0;
}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */
static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */
/* More than 203 days if BW_SHIFT equals 20. */
@@ -9817,12 +9826,21 @@ static const u64 max_bw_runtime_us = MAX_BW;
static void tg_bandwidth(struct task_group *tg,
u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p)
{
+#ifdef CONFIG_CFS_BANDWIDTH
if (period_us_p)
*period_us_p = tg_get_cfs_period(tg);
if (quota_us_p)
*quota_us_p = tg_get_cfs_quota(tg);
if (burst_us_p)
*burst_us_p = tg_get_cfs_burst(tg);
+#else /* !CONFIG_CFS_BANDWIDTH */
+ if (period_us_p)
+ *period_us_p = tg->scx.bw_period_us;
+ if (quota_us_p)
+ *quota_us_p = tg->scx.bw_quota_us;
+ if (burst_us_p)
+ *burst_us_p = tg->scx.bw_burst_us;
+#endif /* CONFIG_CFS_BANDWIDTH */
}
static u64 cpu_period_read_u64(struct cgroup_subsys_state *css,
@@ -9838,6 +9856,7 @@ static int tg_set_bandwidth(struct task_group *tg,
u64 period_us, u64 quota_us, u64 burst_us)
{
const u64 max_usec = U64_MAX / NSEC_PER_USEC;
+ int ret = 0;
if (tg == &root_task_group)
return -EINVAL;
@@ -9875,7 +9894,12 @@ static int tg_set_bandwidth(struct task_group *tg,
burst_us + quota_us > max_bw_runtime_us))
return -EINVAL;
- return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#ifdef CONFIG_CFS_BANDWIDTH
+ ret = tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us);
+#endif /* CONFIG_CFS_BANDWIDTH */
+ if (!ret)
+ scx_group_set_bandwidth(tg, period_us, quota_us, burst_us);
+ return ret;
}
static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css,
@@ -9928,7 +9952,7 @@ static int cpu_burst_write_u64(struct cgroup_subsys_state *css,
tg_bandwidth(tg, &period_us, &quota_us, NULL);
return tg_set_bandwidth(tg, period_us, quota_us, burst_us);
}
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9988,7 +10012,7 @@ static struct cftype cpu_legacy_files[] = {
.write_s64 = cpu_idle_write_s64,
},
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
{
.name = "cfs_period_us",
.read_u64 = cpu_period_read_u64,
@@ -10004,6 +10028,8 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_burst_read_u64,
.write_u64 = cpu_burst_write_u64,
},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
{
.name = "stat",
.seq_show = cpu_cfs_stat_show,
@@ -10217,7 +10243,7 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p,
return 0;
}
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
static int cpu_max_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
@@ -10264,7 +10290,7 @@ static struct cftype cpu_files[] = {
.write_s64 = cpu_idle_write_s64,
},
#endif
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
{
.name = "max",
.flags = CFTYPE_NOT_ON_ROOT,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7dd5cbcb7a06..7dedc9a16281 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -203,6 +203,11 @@ struct scx_exit_task_args {
struct scx_cgroup_init_args {
/* the weight of the cgroup [1..10000] */
u32 weight;
+
+ /* bandwidth control parameters from cpu.max and cpu.max.burst */
+ u64 bw_period_us;
+ u64 bw_quota_us;
+ u64 bw_burst_us;
};
enum scx_cpu_preempt_reason {
@@ -664,9 +669,31 @@ struct sched_ext_ops {
* @cgrp: cgroup whose weight is being updated
* @weight: new weight [1..10000]
*
- * Update @tg's weight to @weight.
+ * Update @cgrp's weight to @weight.
*/
void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
+
+ /**
+ * @cgroup_set_bandwidth: A cgroup's bandwidth is being changed
+ * @cgrp: cgroup whose bandwidth is being updated
+ * @period_us: bandwidth control period
+ * @quota_us: bandwidth control quota
+ * @burst_us: bandwidth control burst
+ *
+ * Update @cgrp's bandwidth control parameters. This is from the cpu.max
+ * cgroup interface.
+ *
+ * @quota_us / @period_us determines the CPU bandwidth @cgrp is entitled
+ * to. For example, if @period_us is 1_000_000 and @quota_us is
+ * 2_500_000, @cgrp is entitled to 2.5 CPUs. @burst_us can be
+ * interpreted in the same fashion and specifies how much @cgrp can
+ * burst temporarily. The specific control mechanism and thus the
+ * interpretation of @period_us and burstiness is up to the BPF
+ * scheduler.
+ */
+ void (*cgroup_set_bandwidth)(struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us);
+
#endif /* CONFIG_EXT_GROUP_SCHED */
/*
@@ -884,7 +911,7 @@ enum scx_enq_flags {
/*
* The task being enqueued was previously enqueued on the current CPU's
* %SCX_DSQ_LOCAL, but was removed from it in a call to the
- * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was
+ * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
* invoked in a ->cpu_release() callback, and the task is again
* dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
* task will not be scheduled on the CPU until at least the next invocation
@@ -1247,7 +1274,7 @@ static void scx_kf_disallow(u32 mask)
* This allows kfuncs to safely operate on rq from any scx ops callback,
* knowing which rq is already locked.
*/
-static DEFINE_PER_CPU(struct rq *, locked_rq);
+DEFINE_PER_CPU(struct rq *, scx_locked_rq_state);
static inline void update_locked_rq(struct rq *rq)
{
@@ -1258,16 +1285,7 @@ static inline void update_locked_rq(struct rq *rq)
*/
if (rq)
lockdep_assert_rq_held(rq);
- __this_cpu_write(locked_rq, rq);
-}
-
-/*
- * Return the rq currently locked from an scx callback, or NULL if no rq is
- * locked.
- */
-static inline struct rq *scx_locked_rq(void)
-{
- return __this_cpu_read(locked_rq);
+ __this_cpu_write(scx_locked_rq_state, rq);
}
#define SCX_CALL_OP(sch, mask, op, rq, args...) \
@@ -1641,7 +1659,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* scx_add_event - Increase an event counter for 'name' by 'cnt'
* @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occured
+ * @cnt: the number of the event occurred
*
* This can be used when preemption is not disabled.
*/
@@ -1654,7 +1672,7 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
* __scx_add_event - Increase an event counter for 'name' by 'cnt'
* @sch: scx_sched to account events for
* @name: an event name defined in struct scx_event_stats
- * @cnt: the number of the event occured
+ * @cnt: the number of the event occurred
*
* This should be used only when preemption is disabled.
*/
@@ -1705,11 +1723,6 @@ static bool scx_tryset_enable_state(enum scx_enable_state to,
return atomic_try_cmpxchg(&scx_enable_state_var, &from_v, to);
}
-static bool scx_rq_bypassing(struct rq *rq)
-{
- return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
-}
-
/**
* wait_ops_state - Busy-wait the specified ops state to end
* @p: target task
@@ -1796,12 +1809,10 @@ static void run_deferred(struct rq *rq)
process_ddsp_deferred_locals(rq);
}
-#ifdef CONFIG_SMP
static void deferred_bal_cb_workfn(struct rq *rq)
{
run_deferred(rq);
}
-#endif
static void deferred_irq_workfn(struct irq_work *irq_work)
{
@@ -1824,7 +1835,6 @@ static void schedule_deferred(struct rq *rq)
{
lockdep_assert_rq_held(rq);
-#ifdef CONFIG_SMP
/*
* If in the middle of waking up a task, task_woken_scx() will be called
* afterwards which will then run the deferred actions, no need to
@@ -1842,7 +1852,7 @@ static void schedule_deferred(struct rq *rq)
deferred_bal_cb_workfn);
return;
}
-#endif
+
/*
* No scheduler hooks available. Queue an irq work. They are executed on
* IRQ re-enable which may take a bit longer than the scheduler hooks.
@@ -2546,7 +2556,6 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
p->scx.dsq = dst_dsq;
}
-#ifdef CONFIG_SMP
/**
* move_remote_task_to_local_dsq - Move a task from a foreign rq to a local DSQ
* @p: task to move
@@ -2713,11 +2722,6 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
return false;
}
}
-#else /* CONFIG_SMP */
-static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct scx_sched *sch, struct task_struct *p, struct rq *rq, bool enforce) { return false; }
-static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
-#endif /* CONFIG_SMP */
/**
* move_task_between_dsqs() - Move a task from one DSQ to another
@@ -2890,9 +2894,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
{
struct rq *src_rq = task_rq(p);
struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
-#ifdef CONFIG_SMP
struct rq *locked_rq = rq;
-#endif
/*
* We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2906,7 +2908,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
return;
}
-#ifdef CONFIG_SMP
if (src_rq != dst_rq &&
unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
dispatch_enqueue(sch, find_global_dsq(p), p,
@@ -2966,9 +2967,6 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
raw_spin_rq_unlock(locked_rq);
raw_spin_rq_lock(rq);
}
-#else /* CONFIG_SMP */
- BUG(); /* control can not reach here on UP */
-#endif /* CONFIG_SMP */
}
/**
@@ -3292,10 +3290,8 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
static enum scx_cpu_preempt_reason
preempt_reason_from_class(const struct sched_class *class)
{
-#ifdef CONFIG_SMP
if (class == &stop_sched_class)
return SCX_CPU_PREEMPT_STOP;
-#endif
if (class == &dl_sched_class)
return SCX_CPU_PREEMPT_DL;
if (class == &rt_sched_class)
@@ -3308,14 +3304,12 @@ static void switch_class(struct rq *rq, struct task_struct *next)
struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
-#ifdef CONFIG_SMP
/*
* Pairs with the smp_load_acquire() issued by a CPU in
* kick_cpus_irq_workfn() who is waiting for this CPU to perform a
* resched.
*/
smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
-#endif
if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
@@ -3512,8 +3506,6 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
}
#endif /* CONFIG_SCHED_CORE */
-#ifdef CONFIG_SMP
-
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
struct scx_sched *sch = scx_root;
@@ -3643,7 +3635,6 @@ static void rq_offline_scx(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_ONLINE;
}
-#endif /* CONFIG_SMP */
static bool check_rq_for_timeouts(struct rq *rq)
{
@@ -4098,7 +4089,9 @@ static bool scx_cgroup_enabled;
void scx_tg_init(struct task_group *tg)
{
- tg->scx_weight = CGROUP_WEIGHT_DFL;
+ tg->scx.weight = CGROUP_WEIGHT_DFL;
+ tg->scx.bw_period_us = default_bw_period_us();
+ tg->scx.bw_quota_us = RUNTIME_INF;
}
int scx_tg_online(struct task_group *tg)
@@ -4106,14 +4099,17 @@ int scx_tg_online(struct task_group *tg)
struct scx_sched *sch = scx_root;
int ret = 0;
- WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED));
+ WARN_ON_ONCE(tg->scx.flags & (SCX_TG_ONLINE | SCX_TG_INITED));
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled) {
if (SCX_HAS_OP(sch, cgroup_init)) {
struct scx_cgroup_init_args args =
- { .weight = tg->scx_weight };
+ { .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us };
ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
NULL, tg->css.cgroup, &args);
@@ -4121,9 +4117,9 @@ int scx_tg_online(struct task_group *tg)
ret = ops_sanitize_err(sch, "cgroup_init", ret);
}
if (ret == 0)
- tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_ONLINE | SCX_TG_INITED;
} else {
- tg->scx_flags |= SCX_TG_ONLINE;
+ tg->scx.flags |= SCX_TG_ONLINE;
}
percpu_up_read(&scx_cgroup_rwsem);
@@ -4134,15 +4130,15 @@ void scx_tg_offline(struct task_group *tg)
{
struct scx_sched *sch = scx_root;
- WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE));
+ WARN_ON_ONCE(!(tg->scx.flags & SCX_TG_ONLINE));
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
- (tg->scx_flags & SCX_TG_INITED))
+ (tg->scx.flags & SCX_TG_INITED))
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
tg->css.cgroup);
- tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
+ tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
percpu_up_read(&scx_cgroup_rwsem);
}
@@ -4251,11 +4247,11 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
percpu_down_read(&scx_cgroup_rwsem);
if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
- tg->scx_weight != weight)
+ tg->scx.weight != weight)
SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
tg_cgrp(tg), weight);
- tg->scx_weight = weight;
+ tg->scx.weight = weight;
percpu_up_read(&scx_cgroup_rwsem);
}
@@ -4265,6 +4261,27 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
/* TODO: Implement ops->cgroup_set_idle() */
}
+void scx_group_set_bandwidth(struct task_group *tg,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ struct scx_sched *sch = scx_root;
+
+ percpu_down_read(&scx_cgroup_rwsem);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_bandwidth) &&
+ (tg->scx.bw_period_us != period_us ||
+ tg->scx.bw_quota_us != quota_us ||
+ tg->scx.bw_burst_us != burst_us))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+ tg_cgrp(tg), period_us, quota_us, burst_us);
+
+ tg->scx.bw_period_us = period_us;
+ tg->scx.bw_quota_us = quota_us;
+ tg->scx.bw_burst_us = burst_us;
+
+ percpu_up_read(&scx_cgroup_rwsem);
+}
+
static void scx_cgroup_lock(void)
{
percpu_down_write(&scx_cgroup_rwsem);
@@ -4308,14 +4325,12 @@ DEFINE_SCHED_CLASS(ext) = {
.put_prev_task = put_prev_task_scx,
.set_next_task = set_next_task_scx,
-#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_scx,
.task_woken = task_woken_scx,
.set_cpus_allowed = set_cpus_allowed_scx,
.rq_online = rq_online_scx,
.rq_offline = rq_offline_scx,
-#endif
.task_tick = task_tick_scx,
@@ -4408,9 +4423,9 @@ static void scx_cgroup_exit(struct scx_sched *sch)
css_for_each_descendant_post(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
- if (!(tg->scx_flags & SCX_TG_INITED))
+ if (!(tg->scx.flags & SCX_TG_INITED))
continue;
- tg->scx_flags &= ~SCX_TG_INITED;
+ tg->scx.flags &= ~SCX_TG_INITED;
if (!sch->ops.cgroup_exit)
continue;
@@ -4442,14 +4457,19 @@ static int scx_cgroup_init(struct scx_sched *sch)
rcu_read_lock();
css_for_each_descendant_pre(css, &root_task_group.css) {
struct task_group *tg = css_tg(css);
- struct scx_cgroup_init_args args = { .weight = tg->scx_weight };
+ struct scx_cgroup_init_args args = {
+ .weight = tg->scx.weight,
+ .bw_period_us = tg->scx.bw_period_us,
+ .bw_quota_us = tg->scx.bw_quota_us,
+ .bw_burst_us = tg->scx.bw_burst_us,
+ };
- if ((tg->scx_flags &
+ if ((tg->scx.flags &
(SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE)
continue;
if (!sch->ops.cgroup_init) {
- tg->scx_flags |= SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_INITED;
continue;
}
@@ -4464,7 +4484,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
return ret;
}
- tg->scx_flags |= SCX_TG_INITED;
+ tg->scx.flags |= SCX_TG_INITED;
rcu_read_lock();
css_put(css);
@@ -4657,6 +4677,41 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
}
/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While there are various reasons why RCU CPU stalls can occur on a system
+ * that may not be caused by the current BPF scheduler, try kicking out the
+ * current scheduler in an attempt to recover the system to a good state before
+ * issuing panics.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+ struct scx_sched *sch;
+
+ rcu_read_lock();
+
+ sch = rcu_dereference(scx_root);
+ if (unlikely(!sch)) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ switch (scx_enable_state()) {
+ case SCX_ENABLING:
+ case SCX_ENABLED:
+ break;
+ default:
+ rcu_read_unlock();
+ return false;
+ }
+
+ scx_error(sch, "RCU CPU stall detected!");
+ rcu_read_unlock();
+
+ return true;
+}
+
+/**
* scx_softlockup - sched_ext softlockup handler
* @dur_s: number of seconds of CPU stuck due to soft lockup
*
@@ -5944,6 +5999,7 @@ static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup
static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {}
static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
+static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif
static void sched_ext_ops__cpu_online(s32 cpu) {}
static void sched_ext_ops__cpu_offline(s32 cpu) {}
@@ -5981,6 +6037,7 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
.cgroup_move = sched_ext_ops__cgroup_move,
.cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move,
.cgroup_set_weight = sched_ext_ops__cgroup_set_weight,
+ .cgroup_set_bandwidth = sched_ext_ops__cgroup_set_bandwidth,
#endif
.cpu_online = sched_ext_ops__cpu_online,
.cpu_offline = sched_ext_ops__cpu_offline,
@@ -6338,7 +6395,8 @@ __bpf_kfunc_start_defs();
* When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
* and this function can be called up to ops.dispatch_max_batch times to insert
* multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
- * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
+ * remaining slots. scx_bpf_dsq_move_to_local() flushes the batch and resets the
+ * counter.
*
* This function doesn't have any locking restrictions and may be called under
* BPF locks (in the future when BPF introduces more flexible locking).
@@ -6362,14 +6420,6 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice
scx_dsq_insert_commit(p, dsq_id, enq_flags);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()");
- scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags);
-}
-
/**
* scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
* @p: task_struct to insert
@@ -6407,21 +6457,11 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id,
- u64 slice, u64 vtime, u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()");
- scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags);
-}
-
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
@@ -6594,13 +6634,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
}
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()");
- return scx_bpf_dsq_move_to_local(dsq_id);
-}
-
/**
* scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
* @it__iter: DSQ iterator in progress
@@ -6619,14 +6652,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter,
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE;
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice(
- struct bpf_iter_scx_dsq *it__iter, u64 slice)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()");
- scx_bpf_dsq_move_set_slice(it__iter, slice);
-}
-
/**
* scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs
* @it__iter: DSQ iterator in progress
@@ -6646,14 +6671,6 @@ __bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter,
kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME;
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime(
- struct bpf_iter_scx_dsq *it__iter, u64 vtime)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()");
- scx_bpf_dsq_move_set_vtime(it__iter, vtime);
-}
-
/**
* scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ
* @it__iter: DSQ iterator in progress
@@ -6686,15 +6703,6 @@ __bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()");
- return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags);
-}
-
/**
* scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ
* @it__iter: DSQ iterator in progress
@@ -6720,30 +6728,16 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
}
-/* for backward compatibility, will be removed in v6.15 */
-__bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter,
- struct task_struct *p, u64 dsq_id,
- u64 enq_flags)
-{
- printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()");
- return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags);
-}
-
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
-BTF_ID_FLAGS(func, scx_bpf_consume)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
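
With the compatibility shims removed, BPF schedulers must call the renamed kfuncs directly. The mapping below is taken from the deleted wrappers; the final call is an illustrative use in ops.enqueue():

	/* scx_bpf_dispatch()                    -> scx_bpf_dsq_insert()           */
	/* scx_bpf_dispatch_vtime()              -> scx_bpf_dsq_insert_vtime()     */
	/* scx_bpf_consume()                     -> scx_bpf_dsq_move_to_local()    */
	/* scx_bpf_dispatch_from_dsq_set_slice() -> scx_bpf_dsq_move_set_slice()   */
	/* scx_bpf_dispatch_from_dsq_set_vtime() -> scx_bpf_dsq_move_set_vtime()   */
	/* scx_bpf_dispatch_from_dsq()           -> scx_bpf_dsq_move()             */
	/* scx_bpf_dispatch_vtime_from_dsq()     -> scx_bpf_dsq_move_vtime()       */

	/* e.g. in ops.enqueue(), with an illustrative default slice: */
	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
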
static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
@@ -6874,10 +6868,6 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime)
BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU)
BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index a75835c23f15..292bb41a242e 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -13,8 +13,24 @@ static inline bool scx_kf_allowed_if_unlocked(void)
return !current->scx.kf_mask;
}
+static inline bool scx_rq_bypassing(struct rq *rq)
+{
+ return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+}
+
DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
+
+/*
+ * Return the rq currently locked from an scx callback, or NULL if no rq is
+ * locked.
+ */
+static inline struct rq *scx_locked_rq(void)
+{
+ return __this_cpu_read(scx_locked_rq_state);
+}
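
A small usage sketch for the new helper, mirroring how select_cpu_from_kfunc() below distinguishes the two contexts (p stands for an assumed task_struct pointer):

	struct rq *rq = scx_locked_rq();

	if (rq)
		lockdep_assert_rq_held(rq);		/* callback entered with an rq lock held */
	else
		lockdep_assert_held(&p->pi_lock);	/* unlocked path, e.g. ops.select_cpu() */
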
+
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -65,7 +81,7 @@ static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
-#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
+#ifdef CONFIG_SCHED_CLASS_EXT
void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
@@ -88,6 +104,7 @@ void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
void scx_group_set_idle(struct task_group *tg, bool idle);
+void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us);
#else /* CONFIG_EXT_GROUP_SCHED */
static inline void scx_tg_init(struct task_group *tg) {}
static inline int scx_tg_online(struct task_group *tg) { return 0; }
@@ -98,5 +115,6 @@ static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
+static inline void scx_group_set_bandwidth(struct task_group *tg, u64 period_us, u64 quota_us, u64 burst_us) {}
#endif /* CONFIG_EXT_GROUP_SCHED */
#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 001fb88a8481..7174e1c1a392 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -17,7 +17,6 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
/* Enable/disable per-node idle cpumasks */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
-#ifdef CONFIG_SMP
/* Enable/disable LLC aware optimizations */
static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
@@ -75,7 +74,7 @@ static int scx_cpu_node_if_enabled(int cpu)
return cpu_to_node(cpu);
}
-bool scx_idle_test_and_clear_cpu(int cpu)
+static bool scx_idle_test_and_clear_cpu(int cpu)
{
int node = scx_cpu_node_if_enabled(cpu);
struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
@@ -198,7 +197,7 @@ pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u6
/*
* Find an idle CPU in the system, starting from @node.
*/
-s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
{
s32 cpu;
@@ -250,7 +249,7 @@ static struct cpumask *llc_span(s32 cpu)
sd = rcu_dereference(per_cpu(sd_llc, cpu));
if (!sd)
- return 0;
+ return NULL;
return sched_domain_span(sd);
}
@@ -794,7 +793,6 @@ static void reset_idle_masks(struct sched_ext_ops *ops)
cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
}
}
-#endif /* CONFIG_SMP */
void scx_idle_enable(struct sched_ext_ops *ops)
{
@@ -808,9 +806,7 @@ void scx_idle_enable(struct sched_ext_ops *ops)
else
static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
-#ifdef CONFIG_SMP
reset_idle_masks(ops);
-#endif
}
void scx_idle_disable(void)
@@ -860,8 +856,8 @@ static bool check_builtin_idle_enabled(void)
return false;
}
-s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
- const struct cpumask *allowed, u64 flags)
+static s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
+ const struct cpumask *allowed, u64 flags)
{
struct rq *rq;
struct rq_flags rf;
@@ -896,7 +892,6 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
if (!rq)
lockdep_assert_held(&p->pi_lock);
-#ifdef CONFIG_SMP
/*
* This may also be called from ops.enqueue(), so we need to handle
* per-CPU tasks as well. For these tasks, we can skip all idle CPU
@@ -913,9 +908,7 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
allowed ?: p->cpus_ptr, flags);
}
-#else
- cpu = -EBUSY;
-#endif
+
if (scx_kf_allowed_if_unlocked())
task_rq_unlock(rq, p, &rf);
@@ -929,14 +922,10 @@ s32 select_cpu_from_kfunc(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
*/
__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
{
-#ifdef CONFIG_NUMA
if (!kf_cpu_valid(cpu, NULL))
return NUMA_NO_NODE;
return cpu_to_node(cpu);
-#else
- return 0;
-#endif
}
/**
@@ -1010,11 +999,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
if (node < 0)
return cpu_none_mask;
-#ifdef CONFIG_SMP
return idle_cpumask(node)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1034,11 +1019,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
if (!check_builtin_idle_enabled())
return cpu_none_mask;
-#ifdef CONFIG_SMP
return idle_cpumask(NUMA_NO_NODE)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1057,14 +1038,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
if (node < 0)
return cpu_none_mask;
-#ifdef CONFIG_SMP
if (sched_smt_active())
return idle_cpumask(node)->smt;
else
return idle_cpumask(node)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1085,14 +1062,10 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
if (!check_builtin_idle_enabled())
return cpu_none_mask;
-#ifdef CONFIG_SMP
if (sched_smt_active())
return idle_cpumask(NUMA_NO_NODE)->smt;
else
return idle_cpumask(NUMA_NO_NODE)->cpu;
-#else
- return cpu_none_mask;
-#endif
}
/**
@@ -1125,10 +1098,10 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
if (!check_builtin_idle_enabled())
return false;
- if (kf_cpu_valid(cpu, NULL))
- return scx_idle_test_and_clear_cpu(cpu);
- else
+ if (!kf_cpu_valid(cpu, NULL))
return false;
+
+ return scx_idle_test_and_clear_cpu(cpu);
}
/**
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index 37be78a7502b..fa583f141f35 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,20 +12,8 @@
struct sched_ext_ops;
-#ifdef CONFIG_SMP
void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
void scx_idle_init_masks(void);
-bool scx_idle_test_and_clear_cpu(int cpu);
-s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags);
-#else /* !CONFIG_SMP */
-static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
-static inline void scx_idle_init_masks(void) {}
-static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
-static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
-{
- return -EBUSY;
-}
-#endif /* CONFIG_SMP */
s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *cpus_allowed, u64 flags);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d3f33d10c58c..be9745d104f7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -403,7 +403,7 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se)
extern struct list_head task_groups;
-#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_GROUP_SCHED_BANDWIDTH
extern const u64 max_bw_quota_period_us;
/*
@@ -414,7 +414,7 @@ static inline u64 default_bw_period_us(void)
{
return 100000ULL;
}
-#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_GROUP_SCHED_BANDWIDTH */
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
@@ -472,10 +472,7 @@ struct task_group {
struct rt_bandwidth rt_bandwidth;
#endif
-#ifdef CONFIG_EXT_GROUP_SCHED
- u32 scx_flags; /* SCX_TG_* */
- u32 scx_weight;
-#endif
+ struct scx_task_group scx;
struct rcu_head rcu;
struct list_head list;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index a6f00012c72e..20f27e2cf7ae 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -41,13 +41,31 @@ void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_
{
unsigned long flags;
- wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+ wq_entry->flags |= WQ_FLAG_PRIORITY;
spin_lock_irqsave(&wq_head->lock, flags);
__add_wait_queue(wq_head, wq_entry);
spin_unlock_irqrestore(&wq_head->lock, flags);
}
EXPORT_SYMBOL_GPL(add_wait_queue_priority);
+int add_wait_queue_priority_exclusive(struct wait_queue_head *wq_head,
+ struct wait_queue_entry *wq_entry)
+{
+ struct list_head *head = &wq_head->head;
+
+ wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY;
+
+ guard(spinlock_irqsave)(&wq_head->lock);
+
+ if (!list_empty(head) &&
+ (list_first_entry(head, typeof(*wq_entry), entry)->flags & WQ_FLAG_PRIORITY))
+ return -EBUSY;
+
+ list_add(&wq_entry->entry, head);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(add_wait_queue_priority_exclusive);
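
A hedged usage sketch: unlike add_wait_queue_priority(), the new helper refuses to add a second priority waiter, so callers need to handle -EBUSY (the waiter setup is generic boilerplate, not taken from this patch):

	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int ret;

	ret = add_wait_queue_priority_exclusive(&wq_head, &wait);
	if (ret == -EBUSY)
		return ret;	/* another priority waiter already claimed the head */

	/* ... wait for and exclusively consume the event ... */

	remove_wait_queue(&wq_head, &wait);
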
+
void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
unsigned long flags;
@@ -65,7 +83,7 @@ EXPORT_SYMBOL(remove_wait_queue);
* the non-exclusive tasks. Normally, exclusive tasks will be at the end of
* the list and any non-exclusive tasks will be woken first. A priority task
* may be at the head of the list, and can consume the event without any other
- * tasks being woken.
+ * tasks being woken if it's also an exclusive task.
*
* There are circumstances in which we can try to wake a task which has already
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
diff --git a/kernel/sys.c b/kernel/sys.c
index 18a037cc6f61..1e28b40053ce 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2372,54 +2372,14 @@ int __weak arch_lock_shadow_stack_status(struct task_struct *t, unsigned long st
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
-#ifdef CONFIG_ANON_VMA_NAME
-
-#define ANON_VMA_NAME_MAX_LEN 80
-#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]"
-
-static inline bool is_valid_name_char(char ch)
-{
- /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */
- return ch > 0x1f && ch < 0x7f &&
- !strchr(ANON_VMA_NAME_INVALID_CHARS, ch);
-}
-
static int prctl_set_vma(unsigned long opt, unsigned long addr,
unsigned long size, unsigned long arg)
{
- struct mm_struct *mm = current->mm;
- const char __user *uname;
- struct anon_vma_name *anon_name = NULL;
int error;
switch (opt) {
case PR_SET_VMA_ANON_NAME:
- uname = (const char __user *)arg;
- if (uname) {
- char *name, *pch;
-
- name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
- if (IS_ERR(name))
- return PTR_ERR(name);
-
- for (pch = name; *pch != '\0'; pch++) {
- if (!is_valid_name_char(*pch)) {
- kfree(name);
- return -EINVAL;
- }
- }
- /* anon_vma has its own copy */
- anon_name = anon_vma_name_alloc(name);
- kfree(name);
- if (!anon_name)
- return -ENOMEM;
-
- }
-
- mmap_write_lock(mm);
- error = madvise_set_anon_name(mm, addr, size, anon_name);
- mmap_write_unlock(mm);
- anon_vma_name_put(anon_name);
+ error = set_anon_vma_name(addr, size, (const char __user *)arg);
break;
default:
error = -EINVAL;
@@ -2428,14 +2388,6 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
return error;
}
-#else /* CONFIG_ANON_VMA_NAME */
-static int prctl_set_vma(unsigned long opt, unsigned long start,
- unsigned long size, unsigned long arg)
-{
- return -EINVAL;
-}
-#endif /* CONFIG_ANON_VMA_NAME */
-
static inline unsigned long get_current_mdwe(void)
{
unsigned long ret = 0;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index e400fe150f9d..0aef0e349e49 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -340,10 +340,7 @@ static void clocksource_verify_choose_cpus(void)
* CPUs that are currently online.
*/
for (i = 1; i < n; i++) {
- cpu = get_random_u32_below(nr_cpu_ids);
- cpu = cpumask_next(cpu - 1, cpu_online_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_online_mask);
+ cpu = cpumask_random(cpu_online_mask);
if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
cpumask_set_cpu(cpu, &cpus_chosen);
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a3f35c7d83b6..d2c79da81e4f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -53,6 +53,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
config HAVE_DYNAMIC_FTRACE_WITH_CALL_OPS
bool
+config HAVE_EXTRA_IPI_TRACEPOINTS
+ bool
+ help
+ For architectures that use ipi_raise, ipi_entry and ipi_exit
+ tracepoints.
+
config HAVE_DYNAMIC_FTRACE_WITH_ARGS
bool
help
@@ -74,11 +80,6 @@ config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
If the architecture generates __patchable_function_entries sections
but does not want them included in the ftrace locations.
-config HAVE_FTRACE_MCOUNT_RECORD
- bool
- help
- See Documentation/trace/ftrace-design.rst
-
config HAVE_SYSCALL_TRACEPOINTS
bool
help
@@ -199,6 +200,19 @@ menuconfig FTRACE
if FTRACE
+config TRACEFS_AUTOMOUNT_DEPRECATED
+ bool "Automount tracefs on debugfs [DEPRECATED]"
+ depends on TRACING
+ default y
+ help
+ The tracing interface was moved from /sys/kernel/debug/tracing
+ to /sys/kernel/tracing in 2015, but the tracing file system
+ was still automounted in /sys/kernel/debug for backward
+ compatibility with tooling.
+
+ The new interface has been around for more than 10 years and
+ the old debug mount will soon be removed.
+
config BOOTTIME_TRACING
bool "Boot-time Tracing support"
depends on TRACING
@@ -275,7 +289,7 @@ config FUNCTION_TRACE_ARGS
funcgraph-args (for the function graph tracer)
config DYNAMIC_FTRACE
- bool "enable/disable function tracing dynamically"
+ bool
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
default y
@@ -779,6 +793,20 @@ config UPROBE_EVENTS
This option is required if you plan to use perf-probe subcommand
of perf tools on user space applications.
+config EPROBE_EVENTS
+ bool "Enable event-based dynamic events"
+ depends on TRACING
+ depends on HAVE_REGS_AND_STACK_ACCESS_API
+ select PROBE_EVENTS
+ select DYNAMIC_EVENTS
+ default y
+ help
+ Eprobes are dynamic events that can be placed on other existing
+ events. They can be used to limit what fields are recorded in
+ an event or even dereference a field of an event. They can
+ convert the type of an event field. For example, turn an
+ address into a string.
+
config BPF_EVENTS
depends on BPF_SYSCALL
depends on (KPROBE_EVENTS || UPROBE_EVENTS) && PERF_EVENTS
@@ -803,27 +831,22 @@ config BPF_KPROBE_OVERRIDE
Allows BPF to override the execution of a probed function and
set a different return value. This is used for error injection.
-config FTRACE_MCOUNT_RECORD
- def_bool y
- depends on DYNAMIC_FTRACE
- depends on HAVE_FTRACE_MCOUNT_RECORD
-
config FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
bool
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
config FTRACE_MCOUNT_USE_CC
def_bool y
depends on $(cc-option,-mrecord-mcount)
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
config FTRACE_MCOUNT_USE_OBJTOOL
def_bool y
depends on HAVE_OBJTOOL_MCOUNT
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
depends on !FTRACE_MCOUNT_USE_CC
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
select OBJTOOL
config FTRACE_MCOUNT_USE_RECORDMCOUNT
@@ -831,7 +854,7 @@ config FTRACE_MCOUNT_USE_RECORDMCOUNT
depends on !FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY
depends on !FTRACE_MCOUNT_USE_CC
depends on !FTRACE_MCOUNT_USE_OBJTOOL
- depends on FTRACE_MCOUNT_RECORD
+ depends on DYNAMIC_FTRACE
config TRACING_MAP
bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 057cd975d014..dcb4e02afc5f 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
-obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
+obj-$(CONFIG_EPROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index c5b207992fb4..f4d200f0c610 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -1325,6 +1325,10 @@ int register_ftrace_graph(struct fgraph_ops *gops)
int ret = 0;
int i = -1;
+ if (WARN_ONCE(gops->ops.flags & FTRACE_OPS_FL_GRAPH,
+ "function graph ops registered again"))
+ return -EBUSY;
+
guard(mutex)(&ftrace_lock);
if (!fgraph_stack_cachep) {
@@ -1401,17 +1405,21 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
{
int command = 0;
+ if (WARN_ONCE(!(gops->ops.flags & FTRACE_OPS_FL_GRAPH),
+ "function graph ops unregistered without registering"))
+ return;
+
guard(mutex)(&ftrace_lock);
if (unlikely(!ftrace_graph_active))
- return;
+ goto out;
if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
fgraph_array[gops->idx] != gops))
- return;
+ goto out;
if (fgraph_lru_release_index(gops->idx) < 0)
- return;
+ goto out;
fgraph_array[gops->idx] = &fgraph_stub;
@@ -1434,4 +1442,6 @@ void unregister_ftrace_graph(struct fgraph_ops *gops)
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
}
gops->saved_func = NULL;
+ out:
+ gops->ops.flags &= ~FTRACE_OPS_FL_GRAPH;
}
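
The effect of the new FTRACE_OPS_FL_GRAPH guard, sketched with an assumed, already-populated fgraph_ops (entry/return callbacks omitted):

	static struct fgraph_ops my_gops = {
		/* .entryfunc and .retfunc assumed to be filled in */
	};
	int ret;

	ret = register_ftrace_graph(&my_gops);	/* 0 on success, sets FTRACE_OPS_FL_GRAPH */
	ret = register_ftrace_graph(&my_gops);	/* double registration: WARNs once, returns -EBUSY */

	unregister_ftrace_graph(&my_gops);	/* clears FTRACE_OPS_FL_GRAPH */
	unregister_ftrace_graph(&my_gops);	/* WARNs: unregistered without registering */
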
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index ba7ff14f5339..c8034dfc1070 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -352,7 +352,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
size_words = SIZE_IN_LONG(size);
ret_ip = ftrace_regs_get_instruction_pointer(fregs);
- preempt_disable();
+ preempt_disable_notrace();
curr = 0;
while (size_words > curr) {
@@ -368,7 +368,7 @@ static void fprobe_return(struct ftrace_graph_ret *trace,
}
curr += size;
}
- preempt_enable();
+ preempt_enable_notrace();
}
NOKPROBE_SYMBOL(fprobe_return);
@@ -648,6 +648,11 @@ static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
#define FPROBE_IPS_MAX INT_MAX
+int fprobe_count_ips_from_filter(const char *filter, const char *notfilter)
+{
+ return get_ips_from_filter(filter, notfilter, NULL, NULL, FPROBE_IPS_MAX);
+}
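
A hedged example of the new helper, which only counts how many addresses a filter would resolve to without registering anything (the patterns are illustrative):

	int nr;

	/* How many functions match "vfs_*" but not "vfs_read"? */
	nr = fprobe_count_ips_from_filter("vfs_*", "vfs_read");
	if (nr <= 0)
		pr_info("no probe targets (%d)\n", nr);
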
+
/**
* register_fprobe() - Register fprobe to ftrace by pattern.
* @fp: A fprobe data structure to be registered.
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4203fad56b6c..00b76d450a89 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1042,10 +1042,6 @@ static struct ftrace_ops *removed_ops;
*/
static bool update_all_ops;
-#ifndef CONFIG_FTRACE_MCOUNT_RECORD
-# error Dynamic ftrace depends on MCOUNT_RECORD
-#endif
-
struct ftrace_func_probe {
struct ftrace_probe_ops *probe_ops;
struct ftrace_ops ops;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 21bb161c2316..f2fe33573e54 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -17,5 +17,4 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency);
-EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle);
diff --git a/kernel/trace/preemptirq_delay_test.c b/kernel/trace/preemptirq_delay_test.c
index 314ffc143039..acb0c971a408 100644
--- a/kernel/trace/preemptirq_delay_test.c
+++ b/kernel/trace/preemptirq_delay_test.c
@@ -117,12 +117,15 @@ static int preemptirq_delay_run(void *data)
{
int i;
int s = MIN(burst_size, NR_TEST_FUNCS);
- struct cpumask cpu_mask;
+ cpumask_var_t cpu_mask;
+
+ if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
if (cpu_affinity > -1) {
- cpumask_clear(&cpu_mask);
- cpumask_set_cpu(cpu_affinity, &cpu_mask);
- if (set_cpus_allowed_ptr(current, &cpu_mask))
+ cpumask_clear(cpu_mask);
+ cpumask_set_cpu(cpu_affinity, cpu_mask);
+ if (set_cpus_allowed_ptr(current, cpu_mask))
pr_err("cpu_affinity:%d, failed\n", cpu_affinity);
}
@@ -139,6 +142,8 @@ static int preemptirq_delay_run(void *data)
__set_current_state(TASK_RUNNING);
+ free_cpumask_var(cpu_mask);
+
return 0;
}
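
The pattern adopted above, shown on its own: with CONFIG_CPUMASK_OFFSTACK a struct cpumask can be too large for the stack, so cpumask_var_t with explicit allocation is preferred (generic sketch; 'cpu' is whatever CPU the caller targets):

	cpumask_var_t mask;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(cpu, mask);
	/* ... use mask ... */

	free_cpumask_var(mask);
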
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 00fc38d70e86..5176e0270f07 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1358,6 +1358,13 @@ static inline void rb_inc_page(struct buffer_page **bpage)
*bpage = list_entry(p, struct buffer_page, list);
}
+static inline void rb_dec_page(struct buffer_page **bpage)
+{
+ struct list_head *p = rb_list_head((*bpage)->list.prev);
+
+ *bpage = list_entry(p, struct buffer_page, list);
+}
+
static struct buffer_page *
rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
{
@@ -1866,10 +1873,11 @@ static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
{
struct ring_buffer_cpu_meta *meta = cpu_buffer->ring_meta;
- struct buffer_page *head_page;
+ struct buffer_page *head_page, *orig_head;
unsigned long entry_bytes = 0;
unsigned long entries = 0;
int ret;
+ u64 ts;
int i;
if (!meta || !meta->head_buffer)
@@ -1885,8 +1893,98 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
local_set(&cpu_buffer->reader_page->entries, ret);
- head_page = cpu_buffer->head_page;
+ orig_head = head_page = cpu_buffer->head_page;
+ ts = head_page->page->time_stamp;
+
+ /*
+ * Try to rewind the head so that we can read the pages which were already
+ * read in the previous boot.
+ */
+ if (head_page == cpu_buffer->tail_page)
+ goto skip_rewind;
+
+ rb_dec_page(&head_page);
+ for (i = 0; i < meta->nr_subbufs + 1; i++, rb_dec_page(&head_page)) {
+
+ /* Rewind until tail (writer) page. */
+ if (head_page == cpu_buffer->tail_page)
+ break;
+
+ /* Ensure the page has older data than head. */
+ if (ts < head_page->page->time_stamp)
+ break;
+
+ ts = head_page->page->time_stamp;
+ /* Ensure the page has correct timestamp and some data. */
+ if (!ts || rb_page_commit(head_page) == 0)
+ break;
+
+ /* Stop rewind if the page is invalid. */
+ ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+ if (ret < 0)
+ break;
+
+ /* Recover the number of entries and update stats. */
+ local_set(&head_page->entries, ret);
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+ entries += ret;
+ entry_bytes += rb_page_commit(head_page);
+ }
+ if (i)
+ pr_info("Ring buffer [%d] rewound %d pages\n", cpu_buffer->cpu, i);
+
+ /* The last rewound page must be skipped. */
+ if (head_page != orig_head)
+ rb_inc_page(&head_page);
+
+ /*
+ * If the ring buffer was rewound, then inject the reader page
+ * into the location just before the original head page.
+ */
+ if (head_page != orig_head) {
+ struct buffer_page *bpage = orig_head;
+
+ rb_dec_page(&bpage);
+ /*
+ * Insert the reader_page before the original head page.
+ * Since the list encodes RB_PAGE flags, general list
+ * operations should be avoided.
+ */
+ cpu_buffer->reader_page->list.next = &orig_head->list;
+ cpu_buffer->reader_page->list.prev = orig_head->list.prev;
+ orig_head->list.prev = &cpu_buffer->reader_page->list;
+ bpage->list.next = &cpu_buffer->reader_page->list;
+
+ /* Make the head_page the reader page */
+ cpu_buffer->reader_page = head_page;
+ bpage = head_page;
+ rb_inc_page(&head_page);
+ head_page->list.prev = bpage->list.prev;
+ rb_dec_page(&bpage);
+ bpage->list.next = &head_page->list;
+ rb_set_list_to_head(&bpage->list);
+ cpu_buffer->pages = &head_page->list;
+
+ cpu_buffer->head_page = head_page;
+ meta->head_buffer = (unsigned long)head_page->page;
+
+ /* Reset all the indexes */
+ bpage = cpu_buffer->reader_page;
+ meta->buffers[0] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = 0;
+
+ for (i = 1, bpage = head_page; i < meta->nr_subbufs;
+ i++, rb_inc_page(&bpage)) {
+ meta->buffers[i] = rb_meta_subbuf_idx(meta, bpage->page);
+ bpage->id = i;
+ }
+
+ /* We'll restart verifying from orig_head */
+ head_page = orig_head;
+ }
+ skip_rewind:
/* If the commit_buffer is the reader page, update the commit page */
if (meta->commit_buffer == (unsigned long)cpu_buffer->reader_page->page) {
cpu_buffer->commit_page = cpu_buffer->reader_page;
@@ -4118,7 +4216,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
static const char *show_irq_str(int bits)
{
- const char *type[] = {
+ static const char * type[] = {
".", // 0
"s", // 1
"h", // 2
@@ -5342,7 +5440,6 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
*/
local_set(&cpu_buffer->reader_page->write, 0);
local_set(&cpu_buffer->reader_page->entries, 0);
- local_set(&cpu_buffer->reader_page->page->commit, 0);
cpu_buffer->reader_page->real_end = 0;
spin:
@@ -5846,24 +5943,20 @@ ring_buffer_consume(struct trace_buffer *buffer, int cpu, u64 *ts,
EXPORT_SYMBOL_GPL(ring_buffer_consume);
/**
- * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * ring_buffer_read_start - start a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
* @flags: gfp flags to use for memory allocation
*
- * This performs the initial preparations necessary to iterate
- * through the buffer. Memory is allocated, buffer resizing
- * is disabled, and the iterator pointer is returned to the caller.
+ * This creates an iterator to allow non-consuming iteration through
+ * the buffer. If the buffer is disabled for writing, it will produce
+ * the same information each time, but if the buffer is still writing
+ * then the first hit of a write will cause the iteration to stop.
*
- * After a sequence of ring_buffer_read_prepare calls, the user is
- * expected to make at least one call to ring_buffer_read_prepare_sync.
- * Afterwards, ring_buffer_read_start is invoked to get things going
- * for real.
- *
- * This overall must be paired with ring_buffer_read_finish.
+ * Must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
+ring_buffer_read_start(struct trace_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -5889,51 +5982,12 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
atomic_inc(&cpu_buffer->resize_disabled);
- return iter;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
-
-/**
- * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
- *
- * All previously invoked ring_buffer_read_prepare calls to prepare
- * iterators will be synchronized. Afterwards, read_buffer_read_start
- * calls on those iterators are allowed.
- */
-void
-ring_buffer_read_prepare_sync(void)
-{
- synchronize_rcu();
-}
-EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
-
-/**
- * ring_buffer_read_start - start a non consuming read of the buffer
- * @iter: The iterator returned by ring_buffer_read_prepare
- *
- * This finalizes the startup of an iteration through the buffer.
- * The iterator comes from a call to ring_buffer_read_prepare and
- * an intervening ring_buffer_read_prepare_sync must have been
- * performed.
- *
- * Must be paired with ring_buffer_read_finish.
- */
-void
-ring_buffer_read_start(struct ring_buffer_iter *iter)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- unsigned long flags;
-
- if (!iter)
- return;
-
- cpu_buffer = iter->cpu_buffer;
-
- raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_buffer->reader_lock);
arch_spin_lock(&cpu_buffer->lock);
rb_iter_reset(iter);
arch_spin_unlock(&cpu_buffer->lock);
- raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return iter;
}
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
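
With the prepare/sync/start trio collapsed into one call, a non-consuming read now pairs just two functions; only the start/finish pairing below is taken from this patch, the iteration itself is elided:

	struct ring_buffer_iter *iter;

	iter = ring_buffer_read_start(buffer, cpu, GFP_KERNEL);
	if (!iter)
		return;

	/* ... walk events non-destructively via the iterator ... */

	ring_buffer_read_finish(iter);
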
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index b39f36013ef2..5b4be87ba59d 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -1,19 +1,31 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-config DA_MON_EVENTS
+config RV_MON_EVENTS
+ bool
+
+config RV_MON_MAINTENANCE_EVENTS
bool
config DA_MON_EVENTS_IMPLICIT
- select DA_MON_EVENTS
+ select RV_MON_EVENTS
+ select RV_MON_MAINTENANCE_EVENTS
bool
config DA_MON_EVENTS_ID
- select DA_MON_EVENTS
+ select RV_MON_EVENTS
+ select RV_MON_MAINTENANCE_EVENTS
+ bool
+
+config LTL_MON_EVENTS_ID
+ select RV_MON_EVENTS
+ bool
+
+config RV_LTL_MONITOR
bool
menuconfig RV
bool "Runtime Verification"
- depends on TRACING
+ select TRACING
help
Enable the kernel runtime verification infrastructure. RV is a
lightweight (yet rigorous) method that complements classical
@@ -25,15 +37,34 @@ menuconfig RV
For further information, see:
Documentation/trace/rv/runtime-verification.rst
+config RV_PER_TASK_MONITORS
+ int "Maximum number of per-task monitor"
+ depends on RV
+ range 1 8
+ default 2
+ help
+ This option configures the maximum number of per-task RV monitors that can run
+ simultaneously.
+
source "kernel/trace/rv/monitors/wip/Kconfig"
source "kernel/trace/rv/monitors/wwnr/Kconfig"
+
source "kernel/trace/rv/monitors/sched/Kconfig"
-source "kernel/trace/rv/monitors/tss/Kconfig"
source "kernel/trace/rv/monitors/sco/Kconfig"
source "kernel/trace/rv/monitors/snroc/Kconfig"
source "kernel/trace/rv/monitors/scpd/Kconfig"
source "kernel/trace/rv/monitors/snep/Kconfig"
-source "kernel/trace/rv/monitors/sncid/Kconfig"
+source "kernel/trace/rv/monitors/sts/Kconfig"
+source "kernel/trace/rv/monitors/nrp/Kconfig"
+source "kernel/trace/rv/monitors/sssw/Kconfig"
+source "kernel/trace/rv/monitors/opid/Kconfig"
+# Add new sched monitors here
+
+source "kernel/trace/rv/monitors/rtapp/Kconfig"
+source "kernel/trace/rv/monitors/pagefault/Kconfig"
+source "kernel/trace/rv/monitors/sleep/Kconfig"
+# Add new rtapp monitors here
+
# Add new monitors here
config RV_REACTORS
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index f9b2cd0483c3..750e4ad6fa0f 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -6,12 +6,17 @@ obj-$(CONFIG_RV) += rv.o
obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
obj-$(CONFIG_RV_MON_SCHED) += monitors/sched/sched.o
-obj-$(CONFIG_RV_MON_TSS) += monitors/tss/tss.o
obj-$(CONFIG_RV_MON_SCO) += monitors/sco/sco.o
obj-$(CONFIG_RV_MON_SNROC) += monitors/snroc/snroc.o
obj-$(CONFIG_RV_MON_SCPD) += monitors/scpd/scpd.o
obj-$(CONFIG_RV_MON_SNEP) += monitors/snep/snep.o
-obj-$(CONFIG_RV_MON_SNCID) += monitors/sncid/sncid.o
+obj-$(CONFIG_RV_MON_RTAPP) += monitors/rtapp/rtapp.o
+obj-$(CONFIG_RV_MON_PAGEFAULT) += monitors/pagefault/pagefault.o
+obj-$(CONFIG_RV_MON_SLEEP) += monitors/sleep/sleep.o
+obj-$(CONFIG_RV_MON_STS) += monitors/sts/sts.o
+obj-$(CONFIG_RV_MON_NRP) += monitors/nrp/nrp.o
+obj-$(CONFIG_RV_MON_SSSW) += monitors/sssw/sssw.o
+obj-$(CONFIG_RV_MON_OPID) += monitors/opid/opid.o
# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
diff --git a/kernel/trace/rv/monitors/tss/Kconfig b/kernel/trace/rv/monitors/nrp/Kconfig
index 479f86f52e60..f5ec08f65535 100644
--- a/kernel/trace/rv/monitors/tss/Kconfig
+++ b/kernel/trace/rv/monitors/nrp/Kconfig
@@ -1,14 +1,16 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-config RV_MON_TSS
+config RV_MON_NRP
depends on RV
depends on RV_MON_SCHED
- default y
- select DA_MON_EVENTS_IMPLICIT
- bool "tss monitor"
+ default y if !ARM64
+ select DA_MON_EVENTS_ID
+ bool "nrp monitor"
help
- Monitor to ensure sched_switch happens only in scheduling context.
+ Monitor to ensure preemption requires need resched.
This monitor is part of the sched monitors collection.
+ This monitor is unstable on arm64; say N unless you are testing it.
+
For further information, see:
Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/nrp/nrp.c b/kernel/trace/rv/monitors/nrp/nrp.c
new file mode 100644
index 000000000000..5a83b7171432
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/nrp.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "nrp"
+
+#include <trace/events/irq.h>
+#include <trace/events/sched.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "nrp.h"
+
+static struct rv_monitor rv_nrp;
+DECLARE_DA_MON_PER_TASK(nrp, unsigned char);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+
+static void handle_vector_irq_entry(void *data, int vector)
+{
+ da_handle_event_nrp(current, irq_entry_nrp);
+}
+
+static void attach_vector_irq(void)
+{
+ rv_attach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_attach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_attach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+static void detach_vector_irq(void)
+{
+ rv_detach_trace_probe("nrp", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_detach_trace_probe("nrp", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_detach_trace_probe("nrp", reschedule_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("nrp", call_function_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("nrp", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+#else
+/* We assume irq_entry tracepoints are sufficient on other architectures */
+static void attach_vector_irq(void) { }
+static void detach_vector_irq(void) { }
+#endif
+
+static void handle_irq_entry(void *data, int irq, struct irqaction *action)
+{
+ da_handle_event_nrp(current, irq_entry_nrp);
+}
+
+static void handle_sched_need_resched(void *data, struct task_struct *tsk,
+ int cpu, int tif)
+{
+ /*
+ * Although need_resched leads to both the rescheduling and preempt_irq
+ * states, it is safer to start the monitor always in preempt_irq,
+ * which may not mirror the system state but makes the monitor simpler,
+ */
+ if (tif == TIF_NEED_RESCHED)
+ da_handle_start_event_nrp(tsk, sched_need_resched_nrp);
+}
+
+static void handle_schedule_entry(void *data, bool preempt)
+{
+ if (preempt)
+ da_handle_event_nrp(current, schedule_entry_preempt_nrp);
+ else
+ da_handle_event_nrp(current, schedule_entry_nrp);
+}
+
+static int enable_nrp(void)
+{
+ int retval;
+
+ retval = da_monitor_init_nrp();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("nrp", irq_handler_entry, handle_irq_entry);
+ rv_attach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_attach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry);
+ attach_vector_irq();
+
+ return 0;
+}
+
+static void disable_nrp(void)
+{
+ rv_nrp.enabled = 0;
+
+ rv_detach_trace_probe("nrp", irq_handler_entry, handle_irq_entry);
+ rv_detach_trace_probe("nrp", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_detach_trace_probe("nrp", sched_entry_tp, handle_schedule_entry);
+ detach_vector_irq();
+
+ da_monitor_destroy_nrp();
+}
+
+static struct rv_monitor rv_nrp = {
+ .name = "nrp",
+ .description = "need resched preempts.",
+ .enable = enable_nrp,
+ .disable = disable_nrp,
+ .reset = da_monitor_reset_all_nrp,
+ .enabled = 0,
+};
+
+static int __init register_nrp(void)
+{
+ return rv_register_monitor(&rv_nrp, &rv_sched);
+}
+
+static void __exit unregister_nrp(void)
+{
+ rv_unregister_monitor(&rv_nrp);
+}
+
+module_init(register_nrp);
+module_exit(unregister_nrp);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("nrp: need resched preempts.");
diff --git a/kernel/trace/rv/monitors/nrp/nrp.h b/kernel/trace/rv/monitors/nrp/nrp.h
new file mode 100644
index 000000000000..c9f12207cbf6
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/nrp.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of nrp automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_nrp {
+ preempt_irq_nrp = 0,
+ any_thread_running_nrp,
+ nested_preempt_nrp,
+ rescheduling_nrp,
+ state_max_nrp
+};
+
+#define INVALID_STATE state_max_nrp
+
+enum events_nrp {
+ irq_entry_nrp = 0,
+ sched_need_resched_nrp,
+ schedule_entry_nrp,
+ schedule_entry_preempt_nrp,
+ event_max_nrp
+};
+
+struct automaton_nrp {
+ char *state_names[state_max_nrp];
+ char *event_names[event_max_nrp];
+ unsigned char function[state_max_nrp][event_max_nrp];
+ unsigned char initial_state;
+ bool final_states[state_max_nrp];
+};
+
+static const struct automaton_nrp automaton_nrp = {
+ .state_names = {
+ "preempt_irq",
+ "any_thread_running",
+ "nested_preempt",
+ "rescheduling"
+ },
+ .event_names = {
+ "irq_entry",
+ "sched_need_resched",
+ "schedule_entry",
+ "schedule_entry_preempt"
+ },
+ .function = {
+ {
+ preempt_irq_nrp,
+ preempt_irq_nrp,
+ nested_preempt_nrp,
+ nested_preempt_nrp
+ },
+ {
+ any_thread_running_nrp,
+ rescheduling_nrp,
+ any_thread_running_nrp,
+ INVALID_STATE
+ },
+ {
+ nested_preempt_nrp,
+ preempt_irq_nrp,
+ any_thread_running_nrp,
+ any_thread_running_nrp
+ },
+ {
+ preempt_irq_nrp,
+ rescheduling_nrp,
+ any_thread_running_nrp,
+ any_thread_running_nrp
+ },
+ },
+ .initial_state = preempt_irq_nrp,
+ .final_states = { 0, 1, 0, 0 },
+};
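
How the generated transition table is meant to be consumed by the DA monitor core, as a rough sketch (variable names assumed):

	unsigned char next = automaton_nrp.function[curr_state][event];

	if (next == INVALID_STATE) {
		/* event not allowed in curr_state: report a monitor violation */
	} else {
		curr_state = next;
	}
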
diff --git a/kernel/trace/rv/monitors/nrp/nrp_trace.h b/kernel/trace/rv/monitors/nrp/nrp_trace.h
new file mode 100644
index 000000000000..2e13497de3b6
--- /dev/null
+++ b/kernel/trace/rv/monitors/nrp/nrp_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_NRP
+DEFINE_EVENT(event_da_monitor_id, event_nrp,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_nrp,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_NRP */
diff --git a/kernel/trace/rv/monitors/opid/Kconfig b/kernel/trace/rv/monitors/opid/Kconfig
new file mode 100644
index 000000000000..561d32da572b
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/Kconfig
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_OPID
+ depends on RV
+ depends on TRACE_IRQFLAGS
+ depends on TRACE_PREEMPT_TOGGLE
+ depends on RV_MON_SCHED
+ default y if PREEMPT_RT
+ select DA_MON_EVENTS_IMPLICIT
+ bool "opid monitor"
+ help
+ Monitor to ensure operations like wakeup and need resched occur with
+ interrupts and preemption disabled or during IRQs, where preemption
+ may not be disabled explicitly.
+
+ This monitor is unstable on !PREEMPT_RT; say N unless you are testing it.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/opid/opid.c b/kernel/trace/rv/monitors/opid/opid.c
new file mode 100644
index 000000000000..50d64e7fb8c4
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/opid.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "opid"
+
+#include <trace/events/sched.h>
+#include <trace/events/irq.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "opid.h"
+
+static struct rv_monitor rv_opid;
+DECLARE_DA_MON_PER_CPU(opid, unsigned char);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+
+static void handle_vector_irq_entry(void *data, int vector)
+{
+ da_handle_event_opid(irq_entry_opid);
+}
+
+static void attach_vector_irq(void)
+{
+ rv_attach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_attach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_attach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+static void detach_vector_irq(void)
+{
+ rv_detach_trace_probe("opid", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_detach_trace_probe("opid", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_detach_trace_probe("opid", reschedule_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("opid", call_function_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("opid", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+#else
+/* We assume irq_entry tracepoints are sufficient on other architectures */
+static void attach_vector_irq(void) { }
+static void detach_vector_irq(void) { }
+#endif
+
+static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_opid(irq_disable_opid);
+}
+
+static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_opid(irq_enable_opid);
+}
+
+static void handle_irq_entry(void *data, int irq, struct irqaction *action)
+{
+ da_handle_event_opid(irq_entry_opid);
+}
+
+static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_opid(preempt_disable_opid);
+}
+
+static void handle_preempt_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_opid(preempt_enable_opid);
+}
+
+static void handle_sched_need_resched(void *data, struct task_struct *tsk, int cpu, int tif)
+{
+ /* The monitor's initial state is not in_irq */
+ if (this_cpu_read(hardirq_context))
+ da_handle_event_opid(sched_need_resched_opid);
+ else
+ da_handle_start_event_opid(sched_need_resched_opid);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *p)
+{
+ /* The monitor's initial state is not in_irq */
+ if (this_cpu_read(hardirq_context))
+ da_handle_event_opid(sched_waking_opid);
+ else
+ da_handle_start_event_opid(sched_waking_opid);
+}
+
+static int enable_opid(void)
+{
+ int retval;
+
+ retval = da_monitor_init_opid();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("opid", irq_disable, handle_irq_disable);
+ rv_attach_trace_probe("opid", irq_enable, handle_irq_enable);
+ rv_attach_trace_probe("opid", irq_handler_entry, handle_irq_entry);
+ rv_attach_trace_probe("opid", preempt_disable, handle_preempt_disable);
+ rv_attach_trace_probe("opid", preempt_enable, handle_preempt_enable);
+ rv_attach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_attach_trace_probe("opid", sched_waking, handle_sched_waking);
+ attach_vector_irq();
+
+ return 0;
+}
+
+static void disable_opid(void)
+{
+ rv_opid.enabled = 0;
+
+ rv_detach_trace_probe("opid", irq_disable, handle_irq_disable);
+ rv_detach_trace_probe("opid", irq_enable, handle_irq_enable);
+ rv_detach_trace_probe("opid", irq_handler_entry, handle_irq_entry);
+ rv_detach_trace_probe("opid", preempt_disable, handle_preempt_disable);
+ rv_detach_trace_probe("opid", preempt_enable, handle_preempt_enable);
+ rv_detach_trace_probe("opid", sched_set_need_resched_tp, handle_sched_need_resched);
+ rv_detach_trace_probe("opid", sched_waking, handle_sched_waking);
+ detach_vector_irq();
+
+ da_monitor_destroy_opid();
+}
+
+/*
+ * This is the monitor register section.
+ */
+static struct rv_monitor rv_opid = {
+ .name = "opid",
+ .description = "operations with preemption and irq disabled.",
+ .enable = enable_opid,
+ .disable = disable_opid,
+ .reset = da_monitor_reset_all_opid,
+ .enabled = 0,
+};
+
+static int __init register_opid(void)
+{
+ return rv_register_monitor(&rv_opid, &rv_sched);
+}
+
+static void __exit unregister_opid(void)
+{
+ rv_unregister_monitor(&rv_opid);
+}
+
+module_init(register_opid);
+module_exit(unregister_opid);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("opid: operations with preemption and irq disabled.");
diff --git a/kernel/trace/rv/monitors/opid/opid.h b/kernel/trace/rv/monitors/opid/opid.h
new file mode 100644
index 000000000000..b4b8c2ff7f64
--- /dev/null
+++ b/kernel/trace/rv/monitors/opid/opid.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of opid automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_opid {
+ disabled_opid = 0,
+ enabled_opid,
+ in_irq_opid,
+ irq_disabled_opid,
+ preempt_disabled_opid,
+ state_max_opid
+};
+
+#define INVALID_STATE state_max_opid
+
+enum events_opid {
+ irq_disable_opid = 0,
+ irq_enable_opid,
+ irq_entry_opid,
+ preempt_disable_opid,
+ preempt_enable_opid,
+ sched_need_resched_opid,
+ sched_waking_opid,
+ event_max_opid
+};
+
+struct automaton_opid {
+ char *state_names[state_max_opid];
+ char *event_names[event_max_opid];
+ unsigned char function[state_max_opid][event_max_opid];
+ unsigned char initial_state;
+ bool final_states[state_max_opid];
+};
+
+static const struct automaton_opid automaton_opid = {
+ .state_names = {
+ "disabled",
+ "enabled",
+ "in_irq",
+ "irq_disabled",
+ "preempt_disabled"
+ },
+ .event_names = {
+ "irq_disable",
+ "irq_enable",
+ "irq_entry",
+ "preempt_disable",
+ "preempt_enable",
+ "sched_need_resched",
+ "sched_waking"
+ },
+ .function = {
+ {
+ INVALID_STATE,
+ preempt_disabled_opid,
+ disabled_opid,
+ INVALID_STATE,
+ irq_disabled_opid,
+ disabled_opid,
+ disabled_opid
+ },
+ {
+ irq_disabled_opid,
+ INVALID_STATE,
+ INVALID_STATE,
+ preempt_disabled_opid,
+ enabled_opid,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ enabled_opid,
+ in_irq_opid,
+ INVALID_STATE,
+ INVALID_STATE,
+ in_irq_opid,
+ in_irq_opid
+ },
+ {
+ INVALID_STATE,
+ enabled_opid,
+ in_irq_opid,
+ disabled_opid,
+ INVALID_STATE,
+ irq_disabled_opid,
+ INVALID_STATE
+ },
+ {
+ disabled_opid,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ enabled_opid,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ },
+ .initial_state = disabled_opid,
+ .final_states = { 0, 1, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/sncid/sncid_trace.h b/kernel/trace/rv/monitors/opid/opid_trace.h
index 3ce42a57671d..3df6ff955c30 100644
--- a/kernel/trace/rv/monitors/sncid/sncid_trace.h
+++ b/kernel/trace/rv/monitors/opid/opid_trace.h
@@ -4,12 +4,12 @@
* Snippet to be included in rv_trace.h
*/
-#ifdef CONFIG_RV_MON_SNCID
-DEFINE_EVENT(event_da_monitor, event_sncid,
+#ifdef CONFIG_RV_MON_OPID
+DEFINE_EVENT(event_da_monitor, event_opid,
TP_PROTO(char *state, char *event, char *next_state, bool final_state),
TP_ARGS(state, event, next_state, final_state));
-DEFINE_EVENT(error_da_monitor, error_sncid,
+DEFINE_EVENT(error_da_monitor, error_opid,
TP_PROTO(char *state, char *event),
TP_ARGS(state, event));
-#endif /* CONFIG_RV_MON_SNCID */
+#endif /* CONFIG_RV_MON_OPID */
diff --git a/kernel/trace/rv/monitors/pagefault/Kconfig b/kernel/trace/rv/monitors/pagefault/Kconfig
new file mode 100644
index 000000000000..5e16625f1653
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/Kconfig
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_PAGEFAULT
+ depends on RV
+ select RV_LTL_MONITOR
+ depends on RV_MON_RTAPP
+ depends on X86 || RISCV
+ default y
+ select LTL_MON_EVENTS_ID
+ bool "pagefault monitor"
+ help
+ Monitor that real-time tasks do not raise page faults, causing
+ undesirable latency.
+
+ If you are developing a real-time system and are not entirely sure whether
+ the applications are designed correctly for real-time, you want to say
+ Y here.
+
+ This monitor does not affect execution speed while it is not running,
+ therefore it is safe to enable it in a production kernel.
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.c b/kernel/trace/rv/monitors/pagefault/pagefault.c
new file mode 100644
index 000000000000..9fe6123b2200
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/pagefault.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rv.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/rt.h>
+#include <linux/tracepoint.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "pagefault"
+
+#include <rv_trace.h>
+#include <trace/events/exceptions.h>
+#include <monitors/rtapp/rtapp.h>
+
+#include "pagefault.h"
+#include <rv/ltl_monitor.h>
+
+static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon)
+{
+ /*
+ * This includes "actual" real-time tasks and also PI-boosted
+ * tasks. A task being PI-boosted means it is blocking an "actual"
+ * real-task, therefore it should also obey the monitor's rule,
+ * otherwise the "actual" real-task may be delayed.
+ */
+ ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task));
+}
+
+static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation)
+{
+ if (task_creation)
+ ltl_atom_set(mon, LTL_PAGEFAULT, false);
+}
+
+static void handle_page_fault(void *data, unsigned long address, struct pt_regs *regs,
+ unsigned long error_code)
+{
+ ltl_atom_pulse(current, LTL_PAGEFAULT, true);
+}
+
+static int enable_pagefault(void)
+{
+ int retval;
+
+ retval = ltl_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault);
+ rv_attach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault);
+
+ return 0;
+}
+
+static void disable_pagefault(void)
+{
+ rv_detach_trace_probe("rtapp_pagefault", page_fault_kernel, handle_page_fault);
+ rv_detach_trace_probe("rtapp_pagefault", page_fault_user, handle_page_fault);
+
+ ltl_monitor_destroy();
+}
+
+static struct rv_monitor rv_pagefault = {
+ .name = "pagefault",
+ .description = "Monitor that RT tasks do not raise page faults",
+ .enable = enable_pagefault,
+ .disable = disable_pagefault,
+};
+
+static int __init register_pagefault(void)
+{
+ return rv_register_monitor(&rv_pagefault, &rv_rtapp);
+}
+
+static void __exit unregister_pagefault(void)
+{
+ rv_unregister_monitor(&rv_pagefault);
+}
+
+module_init(register_pagefault);
+module_exit(unregister_pagefault);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("pagefault: Monitor that RT tasks do not raise page faults");
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.h b/kernel/trace/rv/monitors/pagefault/pagefault.h
new file mode 100644
index 000000000000..c580ec194009
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/pagefault.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * C implementation of Buchi automaton, automatically generated by
+ * tools/verification/rvgen from the linear temporal logic specification.
+ * For further information, see kernel documentation:
+ * Documentation/trace/rv/linear_temporal_logic.rst
+ */
+
+#include <linux/rv.h>
+
+#define MONITOR_NAME pagefault
+
+enum ltl_atom {
+ LTL_PAGEFAULT,
+ LTL_RT,
+ LTL_NUM_ATOM
+};
+static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM);
+
+static const char *ltl_atom_str(enum ltl_atom atom)
+{
+ static const char *const names[] = {
+ "pa",
+ "rt",
+ };
+
+ return names[atom];
+}
+
+enum ltl_buchi_state {
+ S0,
+ RV_NUM_BA_STATES
+};
+static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
+
+static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
+{
+ bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms);
+ bool val3 = !pagefault;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val4 = val1 || val3;
+
+ if (val4)
+ __set_bit(S0, mon->states);
+}
+
+static void
+ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
+{
+ bool pagefault = test_bit(LTL_PAGEFAULT, mon->atoms);
+ bool val3 = !pagefault;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val4 = val1 || val3;
+
+ switch (state) {
+ case S0:
+ if (val4)
+ __set_bit(S0, next);
+ break;
+ }
+}
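
The generated header above encodes the LTL property as a Buchi automaton: the monitor keeps a bitmask of possible states and, whenever an atom changes, computes the union of successors; an empty successor set signals a violation. The following is a minimal user-space sketch of that stepping, written only to illustrate the idea. The names (step(), MY_S0) are made up for this sketch and are not the kernel's ltl_monitor API; the condition mirrors val4 above (the task is not RT, or it did not fault).

/* Illustrative only: a hand-rolled version of the bitmask stepping. */
#include <stdbool.h>
#include <stdio.h>

enum { MY_S0, MY_NUM_STATES };

static unsigned long step(unsigned long states, bool rt, bool pagefault)
{
	unsigned long next = 0;
	bool ok = !rt || !pagefault;	/* same condition as val4 above */

	if ((states & (1UL << MY_S0)) && ok)
		next |= 1UL << MY_S0;
	return next;
}

int main(void)
{
	unsigned long states = 1UL << MY_S0;	/* like ltl_start() with ok == true */

	states = step(states, true, false);	/* RT task, no fault: still fine */
	states = step(states, true, true);	/* RT task faults: successor set empty */
	printf("violation: %s\n", states ? "no" : "yes");
	return 0;
}
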
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault_trace.h b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h
new file mode 100644
index 000000000000..fe1f82597b1a
--- /dev/null
+++ b/kernel/trace/rv/monitors/pagefault/pagefault_trace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_PAGEFAULT
+DEFINE_EVENT(event_ltl_monitor_id, event_pagefault,
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+ TP_ARGS(task, states, atoms, next));
+DEFINE_EVENT(error_ltl_monitor_id, error_pagefault,
+ TP_PROTO(struct task_struct *task),
+ TP_ARGS(task));
+#endif /* CONFIG_RV_MON_PAGEFAULT */
diff --git a/kernel/trace/rv/monitors/rtapp/Kconfig b/kernel/trace/rv/monitors/rtapp/Kconfig
new file mode 100644
index 000000000000..1ce9370a9ba8
--- /dev/null
+++ b/kernel/trace/rv/monitors/rtapp/Kconfig
@@ -0,0 +1,11 @@
+config RV_MON_RTAPP
+ depends on RV
+ depends on RV_PER_TASK_MONITORS >= 2
+ bool "rtapp monitor"
+ help
+ Collection of monitors to check for common problems with real-time
+ applications that may cause unexpected latency.
+
+ If you are developing a real-time system and are not entirely sure
+ whether the applications are designed correctly for real-time, say
+ Y here.
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.c b/kernel/trace/rv/monitors/rtapp/rtapp.c
new file mode 100644
index 000000000000..fd75fc927d65
--- /dev/null
+++ b/kernel/trace/rv/monitors/rtapp/rtapp.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+
+#define MODULE_NAME "rtapp"
+
+#include "rtapp.h"
+
+struct rv_monitor rv_rtapp;
+
+struct rv_monitor rv_rtapp = {
+ .name = "rtapp",
+ .description = "Collection of monitors for detecting problems with real-time applications",
+};
+
+static int __init register_rtapp(void)
+{
+ return rv_register_monitor(&rv_rtapp, NULL);
+}
+
+static void __exit unregister_rtapp(void)
+{
+ rv_unregister_monitor(&rv_rtapp);
+}
+
+module_init(register_rtapp);
+module_exit(unregister_rtapp);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("Collection of monitors for detecting problems with real-time applications");
diff --git a/kernel/trace/rv/monitors/rtapp/rtapp.h b/kernel/trace/rv/monitors/rtapp/rtapp.h
new file mode 100644
index 000000000000..4c200d67c7f6
--- /dev/null
+++ b/kernel/trace/rv/monitors/rtapp/rtapp.h
@@ -0,0 +1,3 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+extern struct rv_monitor rv_rtapp;
diff --git a/kernel/trace/rv/monitors/sched/Kconfig b/kernel/trace/rv/monitors/sched/Kconfig
index ae3eb410abd7..aa16456da864 100644
--- a/kernel/trace/rv/monitors/sched/Kconfig
+++ b/kernel/trace/rv/monitors/sched/Kconfig
@@ -2,6 +2,7 @@
#
config RV_MON_SCHED
depends on RV
+ depends on RV_PER_TASK_MONITORS >= 3
bool "sched monitor"
help
Collection of monitors to check the scheduler behaves according to specifications.
diff --git a/kernel/trace/rv/monitors/sched/sched.c b/kernel/trace/rv/monitors/sched/sched.c
index 905e03c3c934..d04db4b543f9 100644
--- a/kernel/trace/rv/monitors/sched/sched.c
+++ b/kernel/trace/rv/monitors/sched/sched.c
@@ -21,8 +21,7 @@ struct rv_monitor rv_sched = {
static int __init register_sched(void)
{
- rv_register_monitor(&rv_sched, NULL);
- return 0;
+ return rv_register_monitor(&rv_sched, NULL);
}
static void __exit unregister_sched(void)
diff --git a/kernel/trace/rv/monitors/sco/sco.c b/kernel/trace/rv/monitors/sco/sco.c
index 4cff59220bfc..04c36405e2e3 100644
--- a/kernel/trace/rv/monitors/sco/sco.c
+++ b/kernel/trace/rv/monitors/sco/sco.c
@@ -24,12 +24,12 @@ static void handle_sched_set_state(void *data, struct task_struct *tsk, int stat
da_handle_start_event_sco(sched_set_state_sco);
}
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
{
da_handle_event_sco(schedule_entry_sco);
}
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
{
da_handle_start_event_sco(schedule_exit_sco);
}
@@ -71,8 +71,7 @@ static struct rv_monitor rv_sco = {
static int __init register_sco(void)
{
- rv_register_monitor(&rv_sco, &rv_sched);
- return 0;
+ return rv_register_monitor(&rv_sco, &rv_sched);
}
static void __exit unregister_sco(void)
diff --git a/kernel/trace/rv/monitors/scpd/Kconfig b/kernel/trace/rv/monitors/scpd/Kconfig
index b9114fbf680f..682d0416188b 100644
--- a/kernel/trace/rv/monitors/scpd/Kconfig
+++ b/kernel/trace/rv/monitors/scpd/Kconfig
@@ -2,7 +2,7 @@
#
config RV_MON_SCPD
depends on RV
- depends on PREEMPT_TRACER
+ depends on TRACE_PREEMPT_TOGGLE
depends on RV_MON_SCHED
default y
select DA_MON_EVENTS_IMPLICIT
diff --git a/kernel/trace/rv/monitors/scpd/scpd.c b/kernel/trace/rv/monitors/scpd/scpd.c
index cbdd6a5f8d7f..1e351ba52fee 100644
--- a/kernel/trace/rv/monitors/scpd/scpd.c
+++ b/kernel/trace/rv/monitors/scpd/scpd.c
@@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa
da_handle_start_event_scpd(preempt_enable_scpd);
}
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
{
da_handle_event_scpd(schedule_entry_scpd);
}
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
{
da_handle_event_scpd(schedule_exit_scpd);
}
@@ -79,8 +79,7 @@ static struct rv_monitor rv_scpd = {
static int __init register_scpd(void)
{
- rv_register_monitor(&rv_scpd, &rv_sched);
- return 0;
+ return rv_register_monitor(&rv_scpd, &rv_sched);
}
static void __exit unregister_scpd(void)
diff --git a/kernel/trace/rv/monitors/sleep/Kconfig b/kernel/trace/rv/monitors/sleep/Kconfig
new file mode 100644
index 000000000000..6b7a122e7b47
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/Kconfig
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_SLEEP
+ depends on RV
+ select RV_LTL_MONITOR
+ depends on HAVE_SYSCALL_TRACEPOINTS
+ depends on RV_MON_RTAPP
+ select TRACE_IRQFLAGS
+ default y
+ select LTL_MON_EVENTS_ID
+ bool "sleep monitor"
+ help
+ Monitor that real-time tasks do not sleep in a manner that may
+ cause undesirable latency.
+
+ If you are developing a real-time system and are not entirely sure
+ whether the applications are designed correctly for real-time, say
+ Y here.
+
+ Enabling this monitor may have a performance impact (because it
+ selects TRACE_IRQFLAGS). Therefore, you should probably say N for a
+ production kernel.
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
new file mode 100644
index 000000000000..eea447b06907
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/rv.h>
+#include <linux/sched/deadline.h>
+#include <linux/sched/rt.h>
+#include <rv/instrumentation.h>
+
+#define MODULE_NAME "sleep"
+
+#include <trace/events/syscalls.h>
+#include <trace/events/sched.h>
+#include <trace/events/lock.h>
+#include <uapi/linux/futex.h>
+#include <rv_trace.h>
+#include <monitors/rtapp/rtapp.h>
+
+#include "sleep.h"
+#include <rv/ltl_monitor.h>
+
+static void ltl_atoms_fetch(struct task_struct *task, struct ltl_monitor *mon)
+{
+ /*
+ * This includes "actual" real-time tasks and also PI-boosted
+ * tasks. A PI-boosted task is blocking an "actual" real-time task,
+ * so it should also obey the monitor's rule; otherwise the "actual"
+ * real-time task may be delayed.
+ */
+ ltl_atom_set(mon, LTL_RT, rt_or_dl_task(task));
+}
+
+static void ltl_atoms_init(struct task_struct *task, struct ltl_monitor *mon, bool task_creation)
+{
+ ltl_atom_set(mon, LTL_SLEEP, false);
+ ltl_atom_set(mon, LTL_WAKE, false);
+ ltl_atom_set(mon, LTL_ABORT_SLEEP, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_HARDIRQ, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_NMI, false);
+ ltl_atom_set(mon, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, false);
+
+ if (task_creation) {
+ ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
+ ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+ ltl_atom_set(mon, LTL_BLOCK_ON_RT_MUTEX, false);
+ }
+
+ if (task->flags & PF_KTHREAD) {
+ ltl_atom_set(mon, LTL_KERNEL_THREAD, true);
+
+ /* kernel threads do not make syscalls */
+ ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_set(mon, LTL_CLOCK_NANOSLEEP, false);
+
+ if (strstarts(task->comm, "migration/"))
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, true);
+ else
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false);
+
+ if (strstarts(task->comm, "rcu"))
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, true);
+ else
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, false);
+ } else {
+ ltl_atom_set(mon, LTL_KTHREAD_SHOULD_STOP, false);
+ ltl_atom_set(mon, LTL_KERNEL_THREAD, false);
+ ltl_atom_set(mon, LTL_TASK_IS_RCU, false);
+ ltl_atom_set(mon, LTL_TASK_IS_MIGRATION, false);
+ }
+}
+
+static void handle_sched_set_state(void *data, struct task_struct *task, int state)
+{
+ if (state & TASK_INTERRUPTIBLE)
+ ltl_atom_pulse(task, LTL_SLEEP, true);
+ else if (state == TASK_RUNNING)
+ ltl_atom_pulse(task, LTL_ABORT_SLEEP, true);
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *task)
+{
+ ltl_atom_pulse(task, LTL_WAKE, true);
+}
+
+static void handle_sched_waking(void *data, struct task_struct *task)
+{
+ if (this_cpu_read(hardirq_context)) {
+ ltl_atom_pulse(task, LTL_WOKEN_BY_HARDIRQ, true);
+ } else if (in_task()) {
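+ /*
+  * Lower ->prio value means higher priority: the condition below is
+  * true when the waking task has equal or higher priority than @task.
+  */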
+ if (current->prio <= task->prio)
+ ltl_atom_pulse(task, LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO, true);
+ } else if (in_nmi()) {
+ ltl_atom_pulse(task, LTL_WOKEN_BY_NMI, true);
+ }
+}
+
+static void handle_contention_begin(void *data, void *lock, unsigned int flags)
+{
+ if (flags & LCB_F_RT)
+ ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, true);
+}
+
+static void handle_contention_end(void *data, void *lock, int ret)
+{
+ ltl_atom_update(current, LTL_BLOCK_ON_RT_MUTEX, false);
+}
+
+static void handle_sys_enter(void *data, struct pt_regs *regs, long id)
+{
+ struct ltl_monitor *mon;
+ unsigned long args[6];
+ int op, cmd;
+
+ mon = ltl_get_monitor(current);
+
+ switch (id) {
+ case __NR_clock_nanosleep:
+#ifdef __NR_clock_nanosleep_time64
+ case __NR_clock_nanosleep_time64:
+#endif
+ syscall_get_arguments(current, regs, args);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, args[0] == CLOCK_MONOTONIC);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, args[0] == CLOCK_TAI);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, args[1] == TIMER_ABSTIME);
+ ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, true);
+ break;
+
+ case __NR_futex:
+#ifdef __NR_futex_time64
+ case __NR_futex_time64:
+#endif
+ syscall_get_arguments(current, regs, args);
+ op = args[1];
+ cmd = op & FUTEX_CMD_MASK;
+
+ switch (cmd) {
+ case FUTEX_LOCK_PI:
+ case FUTEX_LOCK_PI2:
+ ltl_atom_update(current, LTL_FUTEX_LOCK_PI, true);
+ break;
+ case FUTEX_WAIT:
+ case FUTEX_WAIT_BITSET:
+ case FUTEX_WAIT_REQUEUE_PI:
+ ltl_atom_update(current, LTL_FUTEX_WAIT, true);
+ break;
+ }
+ break;
+ }
+}
+
+static void handle_sys_exit(void *data, struct pt_regs *regs, long ret)
+{
+ struct ltl_monitor *mon = ltl_get_monitor(current);
+
+ ltl_atom_set(mon, LTL_FUTEX_LOCK_PI, false);
+ ltl_atom_set(mon, LTL_FUTEX_WAIT, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_MONOTONIC, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_CLOCK_TAI, false);
+ ltl_atom_set(mon, LTL_NANOSLEEP_TIMER_ABSTIME, false);
+ ltl_atom_update(current, LTL_CLOCK_NANOSLEEP, false);
+}
+
+static void handle_kthread_stop(void *data, struct task_struct *task)
+{
+ /* FIXME: this could race with other tracepoint handlers */
+ ltl_atom_update(task, LTL_KTHREAD_SHOULD_STOP, true);
+}
+
+static int enable_sleep(void)
+{
+ int retval;
+
+ retval = ltl_monitor_init();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
+ rv_attach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup);
+ rv_attach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin);
+ rv_attach_trace_probe("rtapp_sleep", contention_end, handle_contention_end);
+ rv_attach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop);
+ rv_attach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter);
+ rv_attach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit);
+ return 0;
+}
+
+static void disable_sleep(void)
+{
+ rv_detach_trace_probe("rtapp_sleep", sched_waking, handle_sched_waking);
+ rv_detach_trace_probe("rtapp_sleep", sched_wakeup, handle_sched_wakeup);
+ rv_detach_trace_probe("rtapp_sleep", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("rtapp_sleep", contention_begin, handle_contention_begin);
+ rv_detach_trace_probe("rtapp_sleep", contention_end, handle_contention_end);
+ rv_detach_trace_probe("rtapp_sleep", sched_kthread_stop, handle_kthread_stop);
+ rv_detach_trace_probe("rtapp_sleep", sys_enter, handle_sys_enter);
+ rv_detach_trace_probe("rtapp_sleep", sys_exit, handle_sys_exit);
+
+ ltl_monitor_destroy();
+}
+
+static struct rv_monitor rv_sleep = {
+ .name = "sleep",
+ .description = "Monitor that RT tasks do not undesirably sleep",
+ .enable = enable_sleep,
+ .disable = disable_sleep,
+};
+
+static int __init register_sleep(void)
+{
+ return rv_register_monitor(&rv_sleep, &rv_rtapp);
+}
+
+static void __exit unregister_sleep(void)
+{
+ rv_unregister_monitor(&rv_sleep);
+}
+
+module_init(register_sleep);
+module_exit(unregister_sleep);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
+MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep");
diff --git a/kernel/trace/rv/monitors/sleep/sleep.h b/kernel/trace/rv/monitors/sleep/sleep.h
new file mode 100644
index 000000000000..2ab46fd218d2
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep.h
@@ -0,0 +1,257 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * C implementation of Buchi automaton, automatically generated by
+ * tools/verification/rvgen from the linear temporal logic specification.
+ * For further information, see kernel documentation:
+ * Documentation/trace/rv/linear_temporal_logic.rst
+ */
+
+#include <linux/rv.h>
+
+#define MONITOR_NAME sleep
+
+enum ltl_atom {
+ LTL_ABORT_SLEEP,
+ LTL_BLOCK_ON_RT_MUTEX,
+ LTL_CLOCK_NANOSLEEP,
+ LTL_FUTEX_LOCK_PI,
+ LTL_FUTEX_WAIT,
+ LTL_KERNEL_THREAD,
+ LTL_KTHREAD_SHOULD_STOP,
+ LTL_NANOSLEEP_CLOCK_MONOTONIC,
+ LTL_NANOSLEEP_CLOCK_TAI,
+ LTL_NANOSLEEP_TIMER_ABSTIME,
+ LTL_RT,
+ LTL_SLEEP,
+ LTL_TASK_IS_MIGRATION,
+ LTL_TASK_IS_RCU,
+ LTL_WAKE,
+ LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ LTL_WOKEN_BY_HARDIRQ,
+ LTL_WOKEN_BY_NMI,
+ LTL_NUM_ATOM
+};
+static_assert(LTL_NUM_ATOM <= RV_MAX_LTL_ATOM);
+
+static const char *ltl_atom_str(enum ltl_atom atom)
+{
+ static const char *const names[] = {
+ "ab_sl",
+ "bl_on_rt_mu",
+ "cl_na",
+ "fu_lo_pi",
+ "fu_wa",
+ "ker_th",
+ "kth_sh_st",
+ "na_cl_mo",
+ "na_cl_ta",
+ "na_ti_ab",
+ "rt",
+ "sl",
+ "ta_mi",
+ "ta_rc",
+ "wak",
+ "wo_eq_hi_pr",
+ "wo_ha",
+ "wo_nm",
+ };
+
+ return names[atom];
+}
+
+enum ltl_buchi_state {
+ S0,
+ S1,
+ S2,
+ S3,
+ S4,
+ S5,
+ S6,
+ S7,
+ RV_NUM_BA_STATES
+};
+static_assert(RV_NUM_BA_STATES <= RV_MAX_BA_STATES);
+
+static void ltl_start(struct task_struct *task, struct ltl_monitor *mon)
+{
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool val40 = task_is_rcu || task_is_migration;
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool val41 = futex_lock_pi || val40;
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool val5 = block_on_rt_mutex || val41;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val32 = abort_sleep || kthread_should_stop;
+ bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
+ bool val33 = woken_by_nmi || val32;
+ bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
+ bool val34 = woken_by_hardirq || val33;
+ bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ mon->atoms);
+ bool val14 = woken_by_equal_or_higher_prio || val34;
+ bool wake = test_bit(LTL_WAKE, mon->atoms);
+ bool val13 = !wake;
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
+ bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
+ bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
+ bool val25 = nanosleep_timer_abstime && val24;
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool val18 = clock_nanosleep && val25;
+ bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
+ bool val9 = futex_wait || val18;
+ bool val11 = val9 || kernel_thread;
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool val2 = !sleep;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val3 = val1 || val2;
+
+ if (val3)
+ __set_bit(S0, mon->states);
+ if (val11 && val13)
+ __set_bit(S1, mon->states);
+ if (val11 && val14)
+ __set_bit(S4, mon->states);
+ if (val5)
+ __set_bit(S5, mon->states);
+}
+
+static void
+ltl_possible_next_states(struct ltl_monitor *mon, unsigned int state, unsigned long *next)
+{
+ bool task_is_migration = test_bit(LTL_TASK_IS_MIGRATION, mon->atoms);
+ bool task_is_rcu = test_bit(LTL_TASK_IS_RCU, mon->atoms);
+ bool val40 = task_is_rcu || task_is_migration;
+ bool futex_lock_pi = test_bit(LTL_FUTEX_LOCK_PI, mon->atoms);
+ bool val41 = futex_lock_pi || val40;
+ bool block_on_rt_mutex = test_bit(LTL_BLOCK_ON_RT_MUTEX, mon->atoms);
+ bool val5 = block_on_rt_mutex || val41;
+ bool kthread_should_stop = test_bit(LTL_KTHREAD_SHOULD_STOP, mon->atoms);
+ bool abort_sleep = test_bit(LTL_ABORT_SLEEP, mon->atoms);
+ bool val32 = abort_sleep || kthread_should_stop;
+ bool woken_by_nmi = test_bit(LTL_WOKEN_BY_NMI, mon->atoms);
+ bool val33 = woken_by_nmi || val32;
+ bool woken_by_hardirq = test_bit(LTL_WOKEN_BY_HARDIRQ, mon->atoms);
+ bool val34 = woken_by_hardirq || val33;
+ bool woken_by_equal_or_higher_prio = test_bit(LTL_WOKEN_BY_EQUAL_OR_HIGHER_PRIO,
+ mon->atoms);
+ bool val14 = woken_by_equal_or_higher_prio || val34;
+ bool wake = test_bit(LTL_WAKE, mon->atoms);
+ bool val13 = !wake;
+ bool kernel_thread = test_bit(LTL_KERNEL_THREAD, mon->atoms);
+ bool nanosleep_clock_tai = test_bit(LTL_NANOSLEEP_CLOCK_TAI, mon->atoms);
+ bool nanosleep_clock_monotonic = test_bit(LTL_NANOSLEEP_CLOCK_MONOTONIC, mon->atoms);
+ bool val24 = nanosleep_clock_monotonic || nanosleep_clock_tai;
+ bool nanosleep_timer_abstime = test_bit(LTL_NANOSLEEP_TIMER_ABSTIME, mon->atoms);
+ bool val25 = nanosleep_timer_abstime && val24;
+ bool clock_nanosleep = test_bit(LTL_CLOCK_NANOSLEEP, mon->atoms);
+ bool val18 = clock_nanosleep && val25;
+ bool futex_wait = test_bit(LTL_FUTEX_WAIT, mon->atoms);
+ bool val9 = futex_wait || val18;
+ bool val11 = val9 || kernel_thread;
+ bool sleep = test_bit(LTL_SLEEP, mon->atoms);
+ bool val2 = !sleep;
+ bool rt = test_bit(LTL_RT, mon->atoms);
+ bool val1 = !rt;
+ bool val3 = val1 || val2;
+
+ switch (state) {
+ case S0:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S1:
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val13 && val3)
+ __set_bit(S2, next);
+ if (val14 && val3)
+ __set_bit(S3, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val13 && val5)
+ __set_bit(S6, next);
+ if (val14 && val5)
+ __set_bit(S7, next);
+ break;
+ case S2:
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val13 && val3)
+ __set_bit(S2, next);
+ if (val14 && val3)
+ __set_bit(S3, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val13 && val5)
+ __set_bit(S6, next);
+ if (val14 && val5)
+ __set_bit(S7, next);
+ break;
+ case S3:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S4:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S5:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ case S6:
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val13 && val3)
+ __set_bit(S2, next);
+ if (val14 && val3)
+ __set_bit(S3, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val13 && val5)
+ __set_bit(S6, next);
+ if (val14 && val5)
+ __set_bit(S7, next);
+ break;
+ case S7:
+ if (val3)
+ __set_bit(S0, next);
+ if (val11 && val13)
+ __set_bit(S1, next);
+ if (val11 && val14)
+ __set_bit(S4, next);
+ if (val5)
+ __set_bit(S5, next);
+ break;
+ }
+}
diff --git a/kernel/trace/rv/monitors/sleep/sleep_trace.h b/kernel/trace/rv/monitors/sleep/sleep_trace.h
new file mode 100644
index 000000000000..22eaf31da987
--- /dev/null
+++ b/kernel/trace/rv/monitors/sleep/sleep_trace.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SLEEP
+DEFINE_EVENT(event_ltl_monitor_id, event_sleep,
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+ TP_ARGS(task, states, atoms, next));
+DEFINE_EVENT(error_ltl_monitor_id, error_sleep,
+ TP_PROTO(struct task_struct *task),
+ TP_ARGS(task));
+#endif /* CONFIG_RV_MON_SLEEP */
diff --git a/kernel/trace/rv/monitors/sncid/sncid.c b/kernel/trace/rv/monitors/sncid/sncid.c
deleted file mode 100644
index f5037cd6214c..000000000000
--- a/kernel/trace/rv/monitors/sncid/sncid.c
+++ /dev/null
@@ -1,96 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/ftrace.h>
-#include <linux/tracepoint.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/rv.h>
-#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
-
-#define MODULE_NAME "sncid"
-
-#include <trace/events/sched.h>
-#include <trace/events/preemptirq.h>
-#include <rv_trace.h>
-#include <monitors/sched/sched.h>
-
-#include "sncid.h"
-
-static struct rv_monitor rv_sncid;
-DECLARE_DA_MON_PER_CPU(sncid, unsigned char);
-
-static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_event_sncid(irq_disable_sncid);
-}
-
-static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
-{
- da_handle_start_event_sncid(irq_enable_sncid);
-}
-
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
-{
- da_handle_start_event_sncid(schedule_entry_sncid);
-}
-
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
-{
- da_handle_start_event_sncid(schedule_exit_sncid);
-}
-
-static int enable_sncid(void)
-{
- int retval;
-
- retval = da_monitor_init_sncid();
- if (retval)
- return retval;
-
- rv_attach_trace_probe("sncid", irq_disable, handle_irq_disable);
- rv_attach_trace_probe("sncid", irq_enable, handle_irq_enable);
- rv_attach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry);
- rv_attach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit);
-
- return 0;
-}
-
-static void disable_sncid(void)
-{
- rv_sncid.enabled = 0;
-
- rv_detach_trace_probe("sncid", irq_disable, handle_irq_disable);
- rv_detach_trace_probe("sncid", irq_enable, handle_irq_enable);
- rv_detach_trace_probe("sncid", sched_entry_tp, handle_schedule_entry);
- rv_detach_trace_probe("sncid", sched_exit_tp, handle_schedule_exit);
-
- da_monitor_destroy_sncid();
-}
-
-static struct rv_monitor rv_sncid = {
- .name = "sncid",
- .description = "schedule not called with interrupt disabled.",
- .enable = enable_sncid,
- .disable = disable_sncid,
- .reset = da_monitor_reset_all_sncid,
- .enabled = 0,
-};
-
-static int __init register_sncid(void)
-{
- rv_register_monitor(&rv_sncid, &rv_sched);
- return 0;
-}
-
-static void __exit unregister_sncid(void)
-{
- rv_unregister_monitor(&rv_sncid);
-}
-
-module_init(register_sncid);
-module_exit(unregister_sncid);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
-MODULE_DESCRIPTION("sncid: schedule not called with interrupt disabled.");
diff --git a/kernel/trace/rv/monitors/sncid/sncid.h b/kernel/trace/rv/monitors/sncid/sncid.h
deleted file mode 100644
index 21304725142b..000000000000
--- a/kernel/trace/rv/monitors/sncid/sncid.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Automatically generated C representation of sncid automaton
- * For further information about this format, see kernel documentation:
- * Documentation/trace/rv/deterministic_automata.rst
- */
-
-enum states_sncid {
- can_sched_sncid = 0,
- cant_sched_sncid,
- state_max_sncid
-};
-
-#define INVALID_STATE state_max_sncid
-
-enum events_sncid {
- irq_disable_sncid = 0,
- irq_enable_sncid,
- schedule_entry_sncid,
- schedule_exit_sncid,
- event_max_sncid
-};
-
-struct automaton_sncid {
- char *state_names[state_max_sncid];
- char *event_names[event_max_sncid];
- unsigned char function[state_max_sncid][event_max_sncid];
- unsigned char initial_state;
- bool final_states[state_max_sncid];
-};
-
-static const struct automaton_sncid automaton_sncid = {
- .state_names = {
- "can_sched",
- "cant_sched"
- },
- .event_names = {
- "irq_disable",
- "irq_enable",
- "schedule_entry",
- "schedule_exit"
- },
- .function = {
- { cant_sched_sncid, INVALID_STATE, can_sched_sncid, can_sched_sncid },
- { INVALID_STATE, can_sched_sncid, INVALID_STATE, INVALID_STATE },
- },
- .initial_state = can_sched_sncid,
- .final_states = { 1, 0 },
-};
diff --git a/kernel/trace/rv/monitors/snep/Kconfig b/kernel/trace/rv/monitors/snep/Kconfig
index 77527f971232..7dd54f434ff7 100644
--- a/kernel/trace/rv/monitors/snep/Kconfig
+++ b/kernel/trace/rv/monitors/snep/Kconfig
@@ -2,7 +2,7 @@
#
config RV_MON_SNEP
depends on RV
- depends on PREEMPT_TRACER
+ depends on TRACE_PREEMPT_TOGGLE
depends on RV_MON_SCHED
default y
select DA_MON_EVENTS_IMPLICIT
diff --git a/kernel/trace/rv/monitors/snep/snep.c b/kernel/trace/rv/monitors/snep/snep.c
index 0076ba6d7ea4..558950f524a5 100644
--- a/kernel/trace/rv/monitors/snep/snep.c
+++ b/kernel/trace/rv/monitors/snep/snep.c
@@ -30,12 +30,12 @@ static void handle_preempt_enable(void *data, unsigned long ip, unsigned long pa
da_handle_start_event_snep(preempt_enable_snep);
}
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
+static void handle_schedule_entry(void *data, bool preempt)
{
da_handle_event_snep(schedule_entry_snep);
}
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
+static void handle_schedule_exit(void *data, bool is_switch)
{
da_handle_start_event_snep(schedule_exit_snep);
}
@@ -79,8 +79,7 @@ static struct rv_monitor rv_snep = {
static int __init register_snep(void)
{
- rv_register_monitor(&rv_snep, &rv_sched);
- return 0;
+ return rv_register_monitor(&rv_snep, &rv_sched);
}
static void __exit unregister_snep(void)
diff --git a/kernel/trace/rv/monitors/snep/snep.h b/kernel/trace/rv/monitors/snep/snep.h
index 6d16b9ad931e..4cd9abb77b7b 100644
--- a/kernel/trace/rv/monitors/snep/snep.h
+++ b/kernel/trace/rv/monitors/snep/snep.h
@@ -41,8 +41,18 @@ static const struct automaton_snep automaton_snep = {
"schedule_exit"
},
.function = {
- { non_scheduling_context_snep, non_scheduling_context_snep, scheduling_contex_snep, INVALID_STATE },
- { INVALID_STATE, INVALID_STATE, INVALID_STATE, non_scheduling_context_snep },
+ {
+ non_scheduling_context_snep,
+ non_scheduling_context_snep,
+ scheduling_contex_snep,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ non_scheduling_context_snep
+ },
},
.initial_state = non_scheduling_context_snep,
.final_states = { 1, 0 },
diff --git a/kernel/trace/rv/monitors/snroc/snroc.c b/kernel/trace/rv/monitors/snroc/snroc.c
index bb1f60d55296..540e686e699f 100644
--- a/kernel/trace/rv/monitors/snroc/snroc.c
+++ b/kernel/trace/rv/monitors/snroc/snroc.c
@@ -68,8 +68,7 @@ static struct rv_monitor rv_snroc = {
static int __init register_snroc(void)
{
- rv_register_monitor(&rv_snroc, &rv_sched);
- return 0;
+ return rv_register_monitor(&rv_snroc, &rv_sched);
}
static void __exit unregister_snroc(void)
diff --git a/kernel/trace/rv/monitors/sncid/Kconfig b/kernel/trace/rv/monitors/sssw/Kconfig
index 76bcfef4fd10..23b7eeb38bbf 100644
--- a/kernel/trace/rv/monitors/sncid/Kconfig
+++ b/kernel/trace/rv/monitors/sssw/Kconfig
@@ -1,14 +1,14 @@
# SPDX-License-Identifier: GPL-2.0-only
#
-config RV_MON_SNCID
+config RV_MON_SSSW
depends on RV
- depends on IRQSOFF_TRACER
depends on RV_MON_SCHED
default y
- select DA_MON_EVENTS_IMPLICIT
- bool "sncid monitor"
+ select DA_MON_EVENTS_ID
+ bool "sssw monitor"
help
- Monitor to ensure schedule is not called with interrupt disabled.
+ Monitor to ensure that a sched_set_state to a sleepable state leads to
+ sleeping, and that sleeping tasks require a wakeup.
This monitor is part of the sched monitors collection.
For further information, see:
diff --git a/kernel/trace/rv/monitors/sssw/sssw.c b/kernel/trace/rv/monitors/sssw/sssw.c
new file mode 100644
index 000000000000..84b8d890d9d4
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/sssw.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "sssw"
+
+#include <trace/events/sched.h>
+#include <trace/events/signal.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "sssw.h"
+
+static struct rv_monitor rv_sssw;
+DECLARE_DA_MON_PER_TASK(sssw, unsigned char);
+
+static void handle_sched_set_state(void *data, struct task_struct *tsk, int state)
+{
+ if (state == TASK_RUNNING)
+ da_handle_start_event_sssw(tsk, sched_set_state_runnable_sssw);
+ else
+ da_handle_event_sssw(tsk, sched_set_state_sleepable_sssw);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ if (preempt)
+ da_handle_event_sssw(prev, sched_switch_preempt_sssw);
+ else if (prev_state == TASK_RUNNING)
+ da_handle_event_sssw(prev, sched_switch_yield_sssw);
+ else if (prev_state == TASK_RTLOCK_WAIT)
+ /* special case of sleeping task with racy conditions */
+ da_handle_event_sssw(prev, sched_switch_blocking_sssw);
+ else
+ da_handle_event_sssw(prev, sched_switch_suspend_sssw);
+ da_handle_event_sssw(next, sched_switch_in_sssw);
+}
+
+static void handle_sched_wakeup(void *data, struct task_struct *p)
+{
+ /*
+ * A wakeup can also lead to signal_wakeup although the task is
+ * actually runnable. The monitor can safely start with this event.
+ */
+ da_handle_start_event_sssw(p, sched_wakeup_sssw);
+}
+
+static void handle_signal_deliver(void *data, int sig,
+ struct kernel_siginfo *info,
+ struct k_sigaction *ka)
+{
+ da_handle_event_sssw(current, signal_deliver_sssw);
+}
+
+static int enable_sssw(void)
+{
+ int retval;
+
+ retval = da_monitor_init_sssw();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state);
+ rv_attach_trace_probe("sssw", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup);
+ rv_attach_trace_probe("sssw", signal_deliver, handle_signal_deliver);
+
+ return 0;
+}
+
+static void disable_sssw(void)
+{
+ rv_sssw.enabled = 0;
+
+ rv_detach_trace_probe("sssw", sched_set_state_tp, handle_sched_set_state);
+ rv_detach_trace_probe("sssw", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("sssw", sched_wakeup, handle_sched_wakeup);
+ rv_detach_trace_probe("sssw", signal_deliver, handle_signal_deliver);
+
+ da_monitor_destroy_sssw();
+}
+
+static struct rv_monitor rv_sssw = {
+ .name = "sssw",
+ .description = "set state sleep and wakeup.",
+ .enable = enable_sssw,
+ .disable = disable_sssw,
+ .reset = da_monitor_reset_all_sssw,
+ .enabled = 0,
+};
+
+static int __init register_sssw(void)
+{
+ return rv_register_monitor(&rv_sssw, &rv_sched);
+}
+
+static void __exit unregister_sssw(void)
+{
+ rv_unregister_monitor(&rv_sssw);
+}
+
+module_init(register_sssw);
+module_exit(unregister_sssw);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sssw: set state sleep and wakeup.");
diff --git a/kernel/trace/rv/monitors/sssw/sssw.h b/kernel/trace/rv/monitors/sssw/sssw.h
new file mode 100644
index 000000000000..243d54050c94
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/sssw.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sssw automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_sssw {
+ runnable_sssw = 0,
+ signal_wakeup_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ state_max_sssw
+};
+
+#define INVALID_STATE state_max_sssw
+
+enum events_sssw {
+ sched_set_state_runnable_sssw = 0,
+ sched_set_state_sleepable_sssw,
+ sched_switch_blocking_sssw,
+ sched_switch_in_sssw,
+ sched_switch_preempt_sssw,
+ sched_switch_suspend_sssw,
+ sched_switch_yield_sssw,
+ sched_wakeup_sssw,
+ signal_deliver_sssw,
+ event_max_sssw
+};
+
+struct automaton_sssw {
+ char *state_names[state_max_sssw];
+ char *event_names[event_max_sssw];
+ unsigned char function[state_max_sssw][event_max_sssw];
+ unsigned char initial_state;
+ bool final_states[state_max_sssw];
+};
+
+static const struct automaton_sssw automaton_sssw = {
+ .state_names = {
+ "runnable",
+ "signal_wakeup",
+ "sleepable",
+ "sleeping"
+ },
+ .event_names = {
+ "sched_set_state_runnable",
+ "sched_set_state_sleepable",
+ "sched_switch_blocking",
+ "sched_switch_in",
+ "sched_switch_preempt",
+ "sched_switch_suspend",
+ "sched_switch_yield",
+ "sched_wakeup",
+ "signal_deliver"
+ },
+ .function = {
+ {
+ runnable_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ runnable_sssw,
+ runnable_sssw,
+ INVALID_STATE,
+ runnable_sssw,
+ runnable_sssw,
+ runnable_sssw
+ },
+ {
+ INVALID_STATE,
+ sleepable_sssw,
+ INVALID_STATE,
+ signal_wakeup_sssw,
+ signal_wakeup_sssw,
+ INVALID_STATE,
+ signal_wakeup_sssw,
+ signal_wakeup_sssw,
+ runnable_sssw
+ },
+ {
+ runnable_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ sleepable_sssw,
+ sleepable_sssw,
+ sleeping_sssw,
+ signal_wakeup_sssw,
+ runnable_sssw,
+ sleepable_sssw
+ },
+ {
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ runnable_sssw,
+ INVALID_STATE
+ },
+ },
+ .initial_state = runnable_sssw,
+ .final_states = { 1, 0, 0, 0 },
+};
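
The deterministic-automaton headers such as the one above are consumed by the DA monitor code as a plain transition table: the next state is function[current][event], and INVALID_STATE marks a transition the model does not allow. A tiny self-contained sketch of that lookup follows; the two-state automaton and all names in it are made up for illustration and are not the kernel's da_monitor API.

#include <stdio.h>

enum my_states { MY_OFF, MY_ON, MY_STATE_MAX };
enum my_events { MY_TURN_ON, MY_TURN_OFF, MY_EVENT_MAX };
#define MY_INVALID MY_STATE_MAX

static const unsigned char function[MY_STATE_MAX][MY_EVENT_MAX] = {
	/* turn_on	turn_off   */
	{ MY_ON,	MY_INVALID },	/* off */
	{ MY_INVALID,	MY_OFF     },	/* on  */
};

int main(void)
{
	unsigned char state = MY_OFF;
	enum my_events trace[] = { MY_TURN_ON, MY_TURN_ON };

	for (unsigned int i = 0; i < sizeof(trace) / sizeof(trace[0]); i++) {
		unsigned char next = function[state][trace[i]];

		if (next == MY_INVALID) {
			printf("violation in state %u on event %u\n", state, trace[i]);
			break;
		}
		state = next;
	}
	return 0;
}
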
diff --git a/kernel/trace/rv/monitors/sssw/sssw_trace.h b/kernel/trace/rv/monitors/sssw/sssw_trace.h
new file mode 100644
index 000000000000..6c03cfc6960b
--- /dev/null
+++ b/kernel/trace/rv/monitors/sssw/sssw_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_SSSW
+DEFINE_EVENT(event_da_monitor_id, event_sssw,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_sssw,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_SSSW */
diff --git a/kernel/trace/rv/monitors/sts/Kconfig b/kernel/trace/rv/monitors/sts/Kconfig
new file mode 100644
index 000000000000..7d1ff0f6fc91
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/Kconfig
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+config RV_MON_STS
+ depends on RV
+ depends on TRACE_IRQFLAGS
+ depends on RV_MON_SCHED
+ default y
+ select DA_MON_EVENTS_IMPLICIT
+ bool "sts monitor"
+ help
+ Monitor to ensure the relationships between the scheduler and task switches:
+ * the scheduler is called and returns with interrupts disabled
+ * each call to the scheduler has up to one switch
+ * switches only happen inside the scheduler
+ * each call to the scheduler disables interrupts to switch
+ This monitor is part of the sched monitors collection.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_sched.rst
diff --git a/kernel/trace/rv/monitors/sts/sts.c b/kernel/trace/rv/monitors/sts/sts.c
new file mode 100644
index 000000000000..c4a9cd67c1d2
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/sts.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ftrace.h>
+#include <linux/tracepoint.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/rv.h>
+#include <rv/instrumentation.h>
+#include <rv/da_monitor.h>
+
+#define MODULE_NAME "sts"
+
+#include <trace/events/sched.h>
+#include <trace/events/irq.h>
+#include <trace/events/preemptirq.h>
+#include <rv_trace.h>
+#include <monitors/sched/sched.h>
+
+#include "sts.h"
+
+static struct rv_monitor rv_sts;
+DECLARE_DA_MON_PER_CPU(sts, unsigned char);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/trace/irq_vectors.h>
+
+static void handle_vector_irq_entry(void *data, int vector)
+{
+ da_handle_event_sts(irq_entry_sts);
+}
+
+static void attach_vector_irq(void)
+{
+ rv_attach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_attach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_attach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("sts", call_function_entry, handle_vector_irq_entry);
+ rv_attach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+static void detach_vector_irq(void)
+{
+ rv_detach_trace_probe("sts", local_timer_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_IRQ_WORK))
+ rv_detach_trace_probe("sts", irq_work_entry, handle_vector_irq_entry);
+ if (IS_ENABLED(CONFIG_SMP)) {
+ rv_detach_trace_probe("sts", reschedule_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("sts", call_function_entry, handle_vector_irq_entry);
+ rv_detach_trace_probe("sts", call_function_single_entry, handle_vector_irq_entry);
+ }
+}
+
+#else
+/* We assume irq_entry tracepoints are sufficient on other architectures */
+static void attach_vector_irq(void) { }
+static void detach_vector_irq(void) { }
+#endif
+
+static void handle_irq_disable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_sts(irq_disable_sts);
+}
+
+static void handle_irq_enable(void *data, unsigned long ip, unsigned long parent_ip)
+{
+ da_handle_event_sts(irq_enable_sts);
+}
+
+static void handle_irq_entry(void *data, int irq, struct irqaction *action)
+{
+ da_handle_event_sts(irq_entry_sts);
+}
+
+static void handle_sched_switch(void *data, bool preempt,
+ struct task_struct *prev,
+ struct task_struct *next,
+ unsigned int prev_state)
+{
+ da_handle_event_sts(sched_switch_sts);
+}
+
+static void handle_schedule_entry(void *data, bool preempt)
+{
+ da_handle_event_sts(schedule_entry_sts);
+}
+
+static void handle_schedule_exit(void *data, bool is_switch)
+{
+ da_handle_start_event_sts(schedule_exit_sts);
+}
+
+static int enable_sts(void)
+{
+ int retval;
+
+ retval = da_monitor_init_sts();
+ if (retval)
+ return retval;
+
+ rv_attach_trace_probe("sts", irq_disable, handle_irq_disable);
+ rv_attach_trace_probe("sts", irq_enable, handle_irq_enable);
+ rv_attach_trace_probe("sts", irq_handler_entry, handle_irq_entry);
+ rv_attach_trace_probe("sts", sched_switch, handle_sched_switch);
+ rv_attach_trace_probe("sts", sched_entry_tp, handle_schedule_entry);
+ rv_attach_trace_probe("sts", sched_exit_tp, handle_schedule_exit);
+ attach_vector_irq();
+
+ return 0;
+}
+
+static void disable_sts(void)
+{
+ rv_sts.enabled = 0;
+
+ rv_detach_trace_probe("sts", irq_disable, handle_irq_disable);
+ rv_detach_trace_probe("sts", irq_enable, handle_irq_enable);
+ rv_detach_trace_probe("sts", irq_handler_entry, handle_irq_entry);
+ rv_detach_trace_probe("sts", sched_switch, handle_sched_switch);
+ rv_detach_trace_probe("sts", sched_entry_tp, handle_schedule_entry);
+ rv_detach_trace_probe("sts", sched_exit_tp, handle_schedule_exit);
+ detach_vector_irq();
+
+ da_monitor_destroy_sts();
+}
+
+/*
+ * This is the monitor register section.
+ */
+static struct rv_monitor rv_sts = {
+ .name = "sts",
+ .description = "schedule implies task switch.",
+ .enable = enable_sts,
+ .disable = disable_sts,
+ .reset = da_monitor_reset_all_sts,
+ .enabled = 0,
+};
+
+static int __init register_sts(void)
+{
+ return rv_register_monitor(&rv_sts, &rv_sched);
+}
+
+static void __exit unregister_sts(void)
+{
+ rv_unregister_monitor(&rv_sts);
+}
+
+module_init(register_sts);
+module_exit(unregister_sts);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
+MODULE_DESCRIPTION("sts: schedule implies task switch.");
diff --git a/kernel/trace/rv/monitors/sts/sts.h b/kernel/trace/rv/monitors/sts/sts.h
new file mode 100644
index 000000000000..3368b6599a00
--- /dev/null
+++ b/kernel/trace/rv/monitors/sts/sts.h
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Automatically generated C representation of sts automaton
+ * For further information about this format, see kernel documentation:
+ * Documentation/trace/rv/deterministic_automata.rst
+ */
+
+enum states_sts {
+ can_sched_sts = 0,
+ cant_sched_sts,
+ disable_to_switch_sts,
+ enable_to_exit_sts,
+ in_irq_sts,
+ scheduling_sts,
+ switching_sts,
+ state_max_sts
+};
+
+#define INVALID_STATE state_max_sts
+
+enum events_sts {
+ irq_disable_sts = 0,
+ irq_enable_sts,
+ irq_entry_sts,
+ sched_switch_sts,
+ schedule_entry_sts,
+ schedule_exit_sts,
+ event_max_sts
+};
+
+struct automaton_sts {
+ char *state_names[state_max_sts];
+ char *event_names[event_max_sts];
+ unsigned char function[state_max_sts][event_max_sts];
+ unsigned char initial_state;
+ bool final_states[state_max_sts];
+};
+
+static const struct automaton_sts automaton_sts = {
+ .state_names = {
+ "can_sched",
+ "cant_sched",
+ "disable_to_switch",
+ "enable_to_exit",
+ "in_irq",
+ "scheduling",
+ "switching"
+ },
+ .event_names = {
+ "irq_disable",
+ "irq_enable",
+ "irq_entry",
+ "sched_switch",
+ "schedule_entry",
+ "schedule_exit"
+ },
+ .function = {
+ {
+ cant_sched_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ scheduling_sts,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ can_sched_sts,
+ cant_sched_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ enable_to_exit_sts,
+ in_irq_sts,
+ switching_sts,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ enable_to_exit_sts,
+ enable_to_exit_sts,
+ enable_to_exit_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ can_sched_sts
+ },
+ {
+ INVALID_STATE,
+ scheduling_sts,
+ in_irq_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ disable_to_switch_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ {
+ INVALID_STATE,
+ enable_to_exit_sts,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE,
+ INVALID_STATE
+ },
+ },
+ .initial_state = can_sched_sts,
+ .final_states = { 1, 0, 0, 0, 0, 0, 0 },
+};
diff --git a/kernel/trace/rv/monitors/tss/tss_trace.h b/kernel/trace/rv/monitors/sts/sts_trace.h
index 4619dbb50cc0..d78beb58d5b3 100644
--- a/kernel/trace/rv/monitors/tss/tss_trace.h
+++ b/kernel/trace/rv/monitors/sts/sts_trace.h
@@ -4,12 +4,12 @@
* Snippet to be included in rv_trace.h
*/
-#ifdef CONFIG_RV_MON_TSS
-DEFINE_EVENT(event_da_monitor, event_tss,
+#ifdef CONFIG_RV_MON_STS
+DEFINE_EVENT(event_da_monitor, event_sts,
TP_PROTO(char *state, char *event, char *next_state, bool final_state),
TP_ARGS(state, event, next_state, final_state));
-DEFINE_EVENT(error_da_monitor, error_tss,
+DEFINE_EVENT(error_da_monitor, error_sts,
TP_PROTO(char *state, char *event),
TP_ARGS(state, event));
-#endif /* CONFIG_RV_MON_TSS */
+#endif /* CONFIG_RV_MON_STS */
diff --git a/kernel/trace/rv/monitors/tss/tss.c b/kernel/trace/rv/monitors/tss/tss.c
deleted file mode 100644
index 542787e6524f..000000000000
--- a/kernel/trace/rv/monitors/tss/tss.c
+++ /dev/null
@@ -1,91 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/ftrace.h>
-#include <linux/tracepoint.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/rv.h>
-#include <rv/instrumentation.h>
-#include <rv/da_monitor.h>
-
-#define MODULE_NAME "tss"
-
-#include <trace/events/sched.h>
-#include <rv_trace.h>
-#include <monitors/sched/sched.h>
-
-#include "tss.h"
-
-static struct rv_monitor rv_tss;
-DECLARE_DA_MON_PER_CPU(tss, unsigned char);
-
-static void handle_sched_switch(void *data, bool preempt,
- struct task_struct *prev,
- struct task_struct *next,
- unsigned int prev_state)
-{
- da_handle_event_tss(sched_switch_tss);
-}
-
-static void handle_schedule_entry(void *data, bool preempt, unsigned long ip)
-{
- da_handle_event_tss(schedule_entry_tss);
-}
-
-static void handle_schedule_exit(void *data, bool is_switch, unsigned long ip)
-{
- da_handle_start_event_tss(schedule_exit_tss);
-}
-
-static int enable_tss(void)
-{
- int retval;
-
- retval = da_monitor_init_tss();
- if (retval)
- return retval;
-
- rv_attach_trace_probe("tss", sched_switch, handle_sched_switch);
- rv_attach_trace_probe("tss", sched_entry_tp, handle_schedule_entry);
- rv_attach_trace_probe("tss", sched_exit_tp, handle_schedule_exit);
-
- return 0;
-}
-
-static void disable_tss(void)
-{
- rv_tss.enabled = 0;
-
- rv_detach_trace_probe("tss", sched_switch, handle_sched_switch);
- rv_detach_trace_probe("tss", sched_entry_tp, handle_schedule_entry);
- rv_detach_trace_probe("tss", sched_exit_tp, handle_schedule_exit);
-
- da_monitor_destroy_tss();
-}
-
-static struct rv_monitor rv_tss = {
- .name = "tss",
- .description = "task switch while scheduling.",
- .enable = enable_tss,
- .disable = disable_tss,
- .reset = da_monitor_reset_all_tss,
- .enabled = 0,
-};
-
-static int __init register_tss(void)
-{
- rv_register_monitor(&rv_tss, &rv_sched);
- return 0;
-}
-
-static void __exit unregister_tss(void)
-{
- rv_unregister_monitor(&rv_tss);
-}
-
-module_init(register_tss);
-module_exit(unregister_tss);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Gabriele Monaco <gmonaco@redhat.com>");
-MODULE_DESCRIPTION("tss: task switch while scheduling.");
diff --git a/kernel/trace/rv/monitors/tss/tss.h b/kernel/trace/rv/monitors/tss/tss.h
deleted file mode 100644
index f0a36fda1b87..000000000000
--- a/kernel/trace/rv/monitors/tss/tss.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Automatically generated C representation of tss automaton
- * For further information about this format, see kernel documentation:
- * Documentation/trace/rv/deterministic_automata.rst
- */
-
-enum states_tss {
- thread_tss = 0,
- sched_tss,
- state_max_tss
-};
-
-#define INVALID_STATE state_max_tss
-
-enum events_tss {
- sched_switch_tss = 0,
- schedule_entry_tss,
- schedule_exit_tss,
- event_max_tss
-};
-
-struct automaton_tss {
- char *state_names[state_max_tss];
- char *event_names[event_max_tss];
- unsigned char function[state_max_tss][event_max_tss];
- unsigned char initial_state;
- bool final_states[state_max_tss];
-};
-
-static const struct automaton_tss automaton_tss = {
- .state_names = {
- "thread",
- "sched"
- },
- .event_names = {
- "sched_switch",
- "schedule_entry",
- "schedule_exit"
- },
- .function = {
- { INVALID_STATE, sched_tss, INVALID_STATE },
- { sched_tss, INVALID_STATE, thread_tss },
- },
- .initial_state = thread_tss,
- .final_states = { 1, 0 },
-};
diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig
index e464b9294865..87a26195792b 100644
--- a/kernel/trace/rv/monitors/wip/Kconfig
+++ b/kernel/trace/rv/monitors/wip/Kconfig
@@ -2,7 +2,7 @@
#
config RV_MON_WIP
depends on RV
- depends on PREEMPT_TRACER
+ depends on TRACE_PREEMPT_TOGGLE
select DA_MON_EVENTS_IMPLICIT
bool "wip monitor"
help
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index ed758fec8608..4b4e99615a11 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -71,8 +71,7 @@ static struct rv_monitor rv_wip = {
static int __init register_wip(void)
{
- rv_register_monitor(&rv_wip, NULL);
- return 0;
+ return rv_register_monitor(&rv_wip, NULL);
}
static void __exit unregister_wip(void)
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 172f31c4b0f3..4145bea2729e 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -70,8 +70,7 @@ static struct rv_monitor rv_wwnr = {
static int __init register_wwnr(void)
{
- rv_register_monitor(&rv_wwnr, NULL);
- return 0;
+ return rv_register_monitor(&rv_wwnr, NULL);
}
static void __exit unregister_wwnr(void)
diff --git a/kernel/trace/rv/reactor_panic.c b/kernel/trace/rv/reactor_panic.c
index 0186ff4cbd0b..74c6bcc2c749 100644
--- a/kernel/trace/rv/reactor_panic.c
+++ b/kernel/trace/rv/reactor_panic.c
@@ -13,9 +13,13 @@
#include <linux/init.h>
#include <linux/rv.h>
-static void rv_panic_reaction(char *msg)
+__printf(1, 2) static void rv_panic_reaction(const char *msg, ...)
{
- panic(msg);
+ va_list args;
+
+ va_start(args, msg);
+ vpanic(msg, args);
+ va_end(args);
}
static struct rv_reactor rv_panic = {
diff --git a/kernel/trace/rv/reactor_printk.c b/kernel/trace/rv/reactor_printk.c
index 178759dbf89f..2dae2916c05f 100644
--- a/kernel/trace/rv/reactor_printk.c
+++ b/kernel/trace/rv/reactor_printk.c
@@ -12,9 +12,13 @@
#include <linux/init.h>
#include <linux/rv.h>
-static void rv_printk_reaction(char *msg)
+__printf(1, 2) static void rv_printk_reaction(const char *msg, ...)
{
- printk_deferred(msg);
+ va_list args;
+
+ va_start(args, msg);
+ vprintk_deferred(msg, args);
+ va_end(args);
}
static struct rv_reactor rv_printk = {
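
Both reactors are converted from taking a pre-formatted string to a printf-style variadic callback that forwards its va_list to the corresponding v-variant (vpanic()/vprintk_deferred()). A minimal user-space sketch of the same forwarding pattern is shown below, with vprintf() standing in for the kernel v-functions; react() and its message are illustrative only.

#include <stdarg.h>
#include <stdio.h>

/* Forward printf-style arguments to a v-variant, as the reactors now do. */
__attribute__((format(printf, 1, 2)))
static void react(const char *msg, ...)
{
	va_list args;

	va_start(args, msg);
	vprintf(msg, args);
	va_end(args);
}

int main(void)
{
	react("rv: reacting to event %s in state %s\n", "irq_entry", "scheduling");
	return 0;
}
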
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index e4077500a91d..1482e91c39f4 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -143,7 +143,7 @@
#include <linux/init.h>
#include <linux/slab.h>
-#ifdef CONFIG_DA_MON_EVENTS
+#ifdef CONFIG_RV_MON_EVENTS
#define CREATE_TRACE_POINTS
#include <rv_trace.h>
#endif
@@ -165,7 +165,7 @@ struct dentry *get_monitors_root(void)
LIST_HEAD(rv_monitors_list);
static int task_monitor_count;
-static bool task_monitor_slots[RV_PER_TASK_MONITORS];
+static bool task_monitor_slots[CONFIG_RV_PER_TASK_MONITORS];
int rv_get_task_monitor_slot(void)
{
@@ -173,12 +173,12 @@ int rv_get_task_monitor_slot(void)
lockdep_assert_held(&rv_interface_lock);
- if (task_monitor_count == RV_PER_TASK_MONITORS)
+ if (task_monitor_count == CONFIG_RV_PER_TASK_MONITORS)
return -EBUSY;
task_monitor_count++;
- for (i = 0; i < RV_PER_TASK_MONITORS; i++) {
+ for (i = 0; i < CONFIG_RV_PER_TASK_MONITORS; i++) {
if (task_monitor_slots[i] == false) {
task_monitor_slots[i] = true;
return i;
@@ -194,7 +194,7 @@ void rv_put_task_monitor_slot(int slot)
{
lockdep_assert_held(&rv_interface_lock);
- if (slot < 0 || slot >= RV_PER_TASK_MONITORS) {
+ if (slot < 0 || slot >= CONFIG_RV_PER_TASK_MONITORS) {
WARN_ONCE(1, "RV releasing an invalid slot!: %d\n", slot);
return;
}
@@ -210,9 +210,9 @@ void rv_put_task_monitor_slot(int slot)
* Monitors with a parent are nested,
* Monitors without a parent could be standalone or containers.
*/
-bool rv_is_nested_monitor(struct rv_monitor_def *mdef)
+bool rv_is_nested_monitor(struct rv_monitor *mon)
{
- return mdef->parent != NULL;
+ return mon->parent != NULL;
}
/*
@@ -223,16 +223,16 @@ bool rv_is_nested_monitor(struct rv_monitor_def *mdef)
* for enable()/disable(). Use this condition to find empty containers.
* Keep both conditions in case we have some non-compliant containers.
*/
-bool rv_is_container_monitor(struct rv_monitor_def *mdef)
+bool rv_is_container_monitor(struct rv_monitor *mon)
{
- struct rv_monitor_def *next;
+ struct rv_monitor *next;
- if (list_is_last(&mdef->list, &rv_monitors_list))
+ if (list_is_last(&mon->list, &rv_monitors_list))
return false;
- next = list_next_entry(mdef, list);
+ next = list_next_entry(mon, list);
- return next->parent == mdef->monitor || !mdef->monitor->enable;
+ return next->parent == mon || !mon->enable;
}
/*
@@ -241,10 +241,10 @@ bool rv_is_container_monitor(struct rv_monitor_def *mdef)
static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf, size_t count,
loff_t *ppos)
{
- struct rv_monitor_def *mdef = filp->private_data;
+ struct rv_monitor *mon = filp->private_data;
const char *buff;
- buff = mdef->monitor->enabled ? "1\n" : "0\n";
+ buff = mon->enabled ? "1\n" : "0\n";
return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff)+1);
}
@@ -252,14 +252,14 @@ static ssize_t monitor_enable_read_data(struct file *filp, char __user *user_buf
/*
* __rv_disable_monitor - disable an enabled monitor
*/
-static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
+static int __rv_disable_monitor(struct rv_monitor *mon, bool sync)
{
lockdep_assert_held(&rv_interface_lock);
- if (mdef->monitor->enabled) {
- mdef->monitor->enabled = 0;
- if (mdef->monitor->disable)
- mdef->monitor->disable();
+ if (mon->enabled) {
+ mon->enabled = 0;
+ if (mon->disable)
+ mon->disable();
/*
* Wait for the execution of all events to finish.
@@ -273,90 +273,90 @@ static int __rv_disable_monitor(struct rv_monitor_def *mdef, bool sync)
return 0;
}
-static void rv_disable_single(struct rv_monitor_def *mdef)
+static void rv_disable_single(struct rv_monitor *mon)
{
- __rv_disable_monitor(mdef, true);
+ __rv_disable_monitor(mon, true);
}
-static int rv_enable_single(struct rv_monitor_def *mdef)
+static int rv_enable_single(struct rv_monitor *mon)
{
int retval;
lockdep_assert_held(&rv_interface_lock);
- if (mdef->monitor->enabled)
+ if (mon->enabled)
return 0;
- retval = mdef->monitor->enable();
+ retval = mon->enable();
if (!retval)
- mdef->monitor->enabled = 1;
+ mon->enabled = 1;
return retval;
}
-static void rv_disable_container(struct rv_monitor_def *mdef)
+static void rv_disable_container(struct rv_monitor *mon)
{
- struct rv_monitor_def *p = mdef;
+ struct rv_monitor *p = mon;
int enabled = 0;
list_for_each_entry_continue(p, &rv_monitors_list, list) {
- if (p->parent != mdef->monitor)
+ if (p->parent != mon)
break;
enabled += __rv_disable_monitor(p, false);
}
if (enabled)
tracepoint_synchronize_unregister();
- mdef->monitor->enabled = 0;
+ mon->enabled = 0;
}
-static int rv_enable_container(struct rv_monitor_def *mdef)
+static int rv_enable_container(struct rv_monitor *mon)
{
- struct rv_monitor_def *p = mdef;
+ struct rv_monitor *p = mon;
int retval = 0;
list_for_each_entry_continue(p, &rv_monitors_list, list) {
- if (retval || p->parent != mdef->monitor)
+ if (retval || p->parent != mon)
break;
retval = rv_enable_single(p);
}
if (retval)
- rv_disable_container(mdef);
+ rv_disable_container(mon);
else
- mdef->monitor->enabled = 1;
+ mon->enabled = 1;
return retval;
}
/**
* rv_disable_monitor - disable a given runtime monitor
- * @mdef: Pointer to the monitor definition structure.
+ * @mon: Pointer to the monitor definition structure.
*
* Returns 0 on success.
*/
-int rv_disable_monitor(struct rv_monitor_def *mdef)
+int rv_disable_monitor(struct rv_monitor *mon)
{
- if (rv_is_container_monitor(mdef))
- rv_disable_container(mdef);
+ if (rv_is_container_monitor(mon))
+ rv_disable_container(mon);
else
- rv_disable_single(mdef);
+ rv_disable_single(mon);
return 0;
}
/**
* rv_enable_monitor - enable a given runtime monitor
- * @mdef: Pointer to the monitor definition structure.
+ * @mon: Pointer to the monitor definition structure.
*
* Returns 0 on success, error otherwise.
*/
-int rv_enable_monitor(struct rv_monitor_def *mdef)
+int rv_enable_monitor(struct rv_monitor *mon)
{
int retval;
- if (rv_is_container_monitor(mdef))
- retval = rv_enable_container(mdef);
+ if (rv_is_container_monitor(mon))
+ retval = rv_enable_container(mon);
else
- retval = rv_enable_single(mdef);
+ retval = rv_enable_single(mon);
return retval;
}
@@ -367,7 +367,7 @@ int rv_enable_monitor(struct rv_monitor_def *mdef)
static ssize_t monitor_enable_write_data(struct file *filp, const char __user *user_buf,
size_t count, loff_t *ppos)
{
- struct rv_monitor_def *mdef = filp->private_data;
+ struct rv_monitor *mon = filp->private_data;
int retval;
bool val;
@@ -378,9 +378,9 @@ static ssize_t monitor_enable_write_data(struct file *filp, const char __user *u
mutex_lock(&rv_interface_lock);
if (val)
- retval = rv_enable_monitor(mdef);
+ retval = rv_enable_monitor(mon);
else
- retval = rv_disable_monitor(mdef);
+ retval = rv_disable_monitor(mon);
mutex_unlock(&rv_interface_lock);
@@ -399,12 +399,12 @@ static const struct file_operations interface_enable_fops = {
static ssize_t monitor_desc_read_data(struct file *filp, char __user *user_buf, size_t count,
loff_t *ppos)
{
- struct rv_monitor_def *mdef = filp->private_data;
+ struct rv_monitor *mon = filp->private_data;
char buff[256];
memset(buff, 0, sizeof(buff));
- snprintf(buff, sizeof(buff), "%s\n", mdef->monitor->description);
+ snprintf(buff, sizeof(buff), "%s\n", mon->description);
return simple_read_from_buffer(user_buf, count, ppos, buff, strlen(buff) + 1);
}
@@ -419,37 +419,37 @@ static const struct file_operations interface_desc_fops = {
* the monitor dir, where the specific options of the monitor
* are exposed.
*/
-static int create_monitor_dir(struct rv_monitor_def *mdef, struct rv_monitor_def *parent)
+static int create_monitor_dir(struct rv_monitor *mon, struct rv_monitor *parent)
{
struct dentry *root = parent ? parent->root_d : get_monitors_root();
- const char *name = mdef->monitor->name;
+ const char *name = mon->name;
struct dentry *tmp;
int retval;
- mdef->root_d = rv_create_dir(name, root);
- if (!mdef->root_d)
+ mon->root_d = rv_create_dir(name, root);
+ if (!mon->root_d)
return -ENOMEM;
- tmp = rv_create_file("enable", RV_MODE_WRITE, mdef->root_d, mdef, &interface_enable_fops);
+ tmp = rv_create_file("enable", RV_MODE_WRITE, mon->root_d, mon, &interface_enable_fops);
if (!tmp) {
retval = -ENOMEM;
goto out_remove_root;
}
- tmp = rv_create_file("desc", RV_MODE_READ, mdef->root_d, mdef, &interface_desc_fops);
+ tmp = rv_create_file("desc", RV_MODE_READ, mon->root_d, mon, &interface_desc_fops);
if (!tmp) {
retval = -ENOMEM;
goto out_remove_root;
}
- retval = reactor_populate_monitor(mdef);
+ retval = reactor_populate_monitor(mon);
if (retval)
goto out_remove_root;
return 0;
out_remove_root:
- rv_remove(mdef->root_d);
+ rv_remove(mon->root_d);
return retval;
}
@@ -458,13 +458,12 @@ out_remove_root:
*/
static int monitors_show(struct seq_file *m, void *p)
{
- struct rv_monitor_def *mon_def = p;
+ struct rv_monitor *mon = container_of(p, struct rv_monitor, list);
- if (mon_def->parent)
- seq_printf(m, "%s:%s\n", mon_def->parent->name,
- mon_def->monitor->name);
+ if (mon->parent)
+ seq_printf(m, "%s:%s\n", mon->parent->name, mon->name);
else
- seq_printf(m, "%s\n", mon_def->monitor->name);
+ seq_printf(m, "%s\n", mon->name);
return 0;
}
@@ -496,13 +495,13 @@ static void *available_monitors_next(struct seq_file *m, void *p, loff_t *pos)
*/
static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct rv_monitor_def *m_def = p;
+ struct rv_monitor *mon = p;
(*pos)++;
- list_for_each_entry_continue(m_def, &rv_monitors_list, list) {
- if (m_def->monitor->enabled)
- return m_def;
+ list_for_each_entry_continue(mon, &rv_monitors_list, list) {
+ if (mon->enabled)
+ return mon;
}
return NULL;
@@ -510,7 +509,7 @@ static void *enabled_monitors_next(struct seq_file *m, void *p, loff_t *pos)
static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
{
- struct rv_monitor_def *m_def;
+ struct rv_monitor *mon;
loff_t l;
mutex_lock(&rv_interface_lock);
@@ -518,15 +517,15 @@ static void *enabled_monitors_start(struct seq_file *m, loff_t *pos)
if (list_empty(&rv_monitors_list))
return NULL;
- m_def = list_entry(&rv_monitors_list, struct rv_monitor_def, list);
+ mon = list_entry(&rv_monitors_list, struct rv_monitor, list);
for (l = 0; l <= *pos; ) {
- m_def = enabled_monitors_next(m, m_def, &l);
- if (!m_def)
+ mon = enabled_monitors_next(m, mon, &l);
+ if (!mon)
break;
}
- return m_def;
+ return mon;
}
/*
@@ -566,13 +565,13 @@ static const struct file_operations available_monitors_ops = {
*/
static void disable_all_monitors(void)
{
- struct rv_monitor_def *mdef;
+ struct rv_monitor *mon;
int enabled = 0;
mutex_lock(&rv_interface_lock);
- list_for_each_entry(mdef, &rv_monitors_list, list)
- enabled += __rv_disable_monitor(mdef, false);
+ list_for_each_entry(mon, &rv_monitors_list, list)
+ enabled += __rv_disable_monitor(mon, false);
if (enabled) {
/*
@@ -598,7 +597,7 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
size_t count, loff_t *ppos)
{
char buff[MAX_RV_MONITOR_NAME_SIZE + 2];
- struct rv_monitor_def *mdef;
+ struct rv_monitor *mon;
int retval = -EINVAL;
bool enable = true;
char *ptr, *tmp;
@@ -633,17 +632,17 @@ static ssize_t enabled_monitors_write(struct file *filp, const char __user *user
if (tmp)
ptr = tmp+1;
- list_for_each_entry(mdef, &rv_monitors_list, list) {
- if (strcmp(ptr, mdef->monitor->name) != 0)
+ list_for_each_entry(mon, &rv_monitors_list, list) {
+ if (strcmp(ptr, mon->name) != 0)
continue;
/*
* Monitor found!
*/
if (enable)
- retval = rv_enable_monitor(mdef);
+ retval = rv_enable_monitor(mon);
else
- retval = rv_disable_monitor(mdef);
+ retval = rv_disable_monitor(mon);
if (!retval)
retval = count;
@@ -675,8 +674,6 @@ static bool __read_mostly monitoring_on;
*/
bool rv_monitoring_on(void)
{
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_rmb();
return READ_ONCE(monitoring_on);
}
@@ -696,25 +693,21 @@ static ssize_t monitoring_on_read_data(struct file *filp, char __user *user_buf,
static void turn_monitoring_off(void)
{
WRITE_ONCE(monitoring_on, false);
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_wmb();
}
static void reset_all_monitors(void)
{
- struct rv_monitor_def *mdef;
+ struct rv_monitor *mon;
- list_for_each_entry(mdef, &rv_monitors_list, list) {
- if (mdef->monitor->enabled && mdef->monitor->reset)
- mdef->monitor->reset();
+ list_for_each_entry(mon, &rv_monitors_list, list) {
+ if (mon->enabled && mon->reset)
+ mon->reset();
}
}
static void turn_monitoring_on(void)
{
WRITE_ONCE(monitoring_on, true);
- /* Ensures that concurrent monitors read consistent monitoring_on */
- smp_wmb();
}
static void turn_monitoring_on_with_reset(void)
@@ -768,10 +761,9 @@ static const struct file_operations monitoring_on_fops = {
.read = monitoring_on_read_data,
};
-static void destroy_monitor_dir(struct rv_monitor_def *mdef)
+static void destroy_monitor_dir(struct rv_monitor *mon)
{
- reactor_cleanup_monitor(mdef);
- rv_remove(mdef->root_d);
+ rv_remove(mon->root_d);
}
/**
@@ -783,7 +775,7 @@ static void destroy_monitor_dir(struct rv_monitor_def *mdef)
*/
int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
{
- struct rv_monitor_def *r, *p = NULL;
+ struct rv_monitor *r;
int retval = 0;
if (strlen(monitor->name) >= MAX_RV_MONITOR_NAME_SIZE) {
@@ -795,49 +787,31 @@ int rv_register_monitor(struct rv_monitor *monitor, struct rv_monitor *parent)
mutex_lock(&rv_interface_lock);
list_for_each_entry(r, &rv_monitors_list, list) {
- if (strcmp(monitor->name, r->monitor->name) == 0) {
+ if (strcmp(monitor->name, r->name) == 0) {
pr_info("Monitor %s is already registered\n", monitor->name);
retval = -EEXIST;
goto out_unlock;
}
}
- if (parent) {
- list_for_each_entry(r, &rv_monitors_list, list) {
- if (strcmp(parent->name, r->monitor->name) == 0) {
- p = r;
- break;
- }
- }
- }
-
- if (p && rv_is_nested_monitor(p)) {
+ if (parent && rv_is_nested_monitor(parent)) {
pr_info("Parent monitor %s is already nested, cannot nest further\n",
parent->name);
retval = -EINVAL;
goto out_unlock;
}
- r = kzalloc(sizeof(struct rv_monitor_def), GFP_KERNEL);
- if (!r) {
- retval = -ENOMEM;
- goto out_unlock;
- }
-
- r->monitor = monitor;
- r->parent = parent;
+ monitor->parent = parent;
- retval = create_monitor_dir(r, p);
- if (retval) {
- kfree(r);
- goto out_unlock;
- }
+ retval = create_monitor_dir(monitor, parent);
+ if (retval)
+ return retval;
/* keep children close to the parent for easier visualisation */
- if (p)
- list_add(&r->list, &p->list);
+ if (parent)
+ list_add(&monitor->list, &parent->list);
else
- list_add_tail(&r->list, &rv_monitors_list);
+ list_add_tail(&monitor->list, &rv_monitors_list);
out_unlock:
mutex_unlock(&rv_interface_lock);
@@ -852,17 +826,11 @@ out_unlock:
*/
int rv_unregister_monitor(struct rv_monitor *monitor)
{
- struct rv_monitor_def *ptr, *next;
-
mutex_lock(&rv_interface_lock);
- list_for_each_entry_safe(ptr, next, &rv_monitors_list, list) {
- if (strcmp(monitor->name, ptr->monitor->name) == 0) {
- rv_disable_monitor(ptr);
- list_del(&ptr->list);
- destroy_monitor_dir(ptr);
- }
- }
+ rv_disable_monitor(monitor);
+ list_del(&monitor->list);
+ destroy_monitor_dir(monitor);
mutex_unlock(&rv_interface_lock);
return 0;
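
With rv_monitor_def gone, a monitor now carries its own list node, parent pointer, root dentry and reactor, and is handed to rv_register_monitor()/rv_unregister_monitor() directly. The sketch below shows a minimal, hypothetical monitor built against this reworked API; it relies only on the struct rv_monitor members exercised by rv.c in this patch (name, description, enable, disable, reset, enabled) plus the two registration calls, the full structure being defined in include/linux/rv.h, and every demo_* identifier is invented for illustration.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/rv.h>

static int demo_enable(void)
{
	pr_info("demo monitor enabled\n");
	return 0;
}

static void demo_disable(void)
{
	pr_info("demo monitor disabled\n");
}

static void demo_reset(void)
{
	/* drop any per-task or per-cpu monitor state here */
}

static struct rv_monitor rv_demo = {
	.name		= "demo",
	.description	= "toy monitor using the rv_monitor-only API",
	.enable		= demo_enable,
	.disable	= demo_disable,
	.reset		= demo_reset,
	.enabled	= 0,
};

static int __init demo_init(void)
{
	/* NULL parent: standalone monitor; a non-NULL parent would nest it */
	return rv_register_monitor(&rv_demo, NULL);
}

static void __exit demo_exit(void)
{
	rv_unregister_monitor(&rv_demo);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
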
diff --git a/kernel/trace/rv/rv.h b/kernel/trace/rv/rv.h
index 98fca0a1adbc..1485a70c1bf4 100644
--- a/kernel/trace/rv/rv.h
+++ b/kernel/trace/rv/rv.h
@@ -23,48 +23,21 @@ struct rv_interface {
extern struct mutex rv_interface_lock;
extern struct list_head rv_monitors_list;
-#ifdef CONFIG_RV_REACTORS
-struct rv_reactor_def {
- struct list_head list;
- struct rv_reactor *reactor;
- /* protected by the monitor interface lock */
- int counter;
-};
-#endif
-
-struct rv_monitor_def {
- struct list_head list;
- struct rv_monitor *monitor;
- struct rv_monitor *parent;
- struct dentry *root_d;
-#ifdef CONFIG_RV_REACTORS
- struct rv_reactor_def *rdef;
- bool reacting;
-#endif
- bool task_monitor;
-};
-
struct dentry *get_monitors_root(void);
-int rv_disable_monitor(struct rv_monitor_def *mdef);
-int rv_enable_monitor(struct rv_monitor_def *mdef);
-bool rv_is_container_monitor(struct rv_monitor_def *mdef);
-bool rv_is_nested_monitor(struct rv_monitor_def *mdef);
+int rv_disable_monitor(struct rv_monitor *mon);
+int rv_enable_monitor(struct rv_monitor *mon);
+bool rv_is_container_monitor(struct rv_monitor *mon);
+bool rv_is_nested_monitor(struct rv_monitor *mon);
#ifdef CONFIG_RV_REACTORS
-int reactor_populate_monitor(struct rv_monitor_def *mdef);
-void reactor_cleanup_monitor(struct rv_monitor_def *mdef);
+int reactor_populate_monitor(struct rv_monitor *mon);
int init_rv_reactors(struct dentry *root_dir);
#else
-static inline int reactor_populate_monitor(struct rv_monitor_def *mdef)
+static inline int reactor_populate_monitor(struct rv_monitor *mon)
{
return 0;
}
-static inline void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
-{
- return;
-}
-
static inline int init_rv_reactors(struct dentry *root_dir)
{
return 0;
diff --git a/kernel/trace/rv/rv_reactors.c b/kernel/trace/rv/rv_reactors.c
index 9501ca886d83..d32859fec238 100644
--- a/kernel/trace/rv/rv_reactors.c
+++ b/kernel/trace/rv/rv_reactors.c
@@ -70,12 +70,12 @@
*/
static LIST_HEAD(rv_reactors_list);
-static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
+static struct rv_reactor *get_reactor_rdef_by_name(char *name)
{
- struct rv_reactor_def *r;
+ struct rv_reactor *r;
list_for_each_entry(r, &rv_reactors_list, list) {
- if (strcmp(name, r->reactor->name) == 0)
+ if (strcmp(name, r->name) == 0)
return r;
}
return NULL;
@@ -86,9 +86,9 @@ static struct rv_reactor_def *get_reactor_rdef_by_name(char *name)
*/
static int reactors_show(struct seq_file *m, void *p)
{
- struct rv_reactor_def *rea_def = p;
+ struct rv_reactor *reactor = container_of(p, struct rv_reactor, list);
- seq_printf(m, "%s\n", rea_def->reactor->name);
+ seq_printf(m, "%s\n", reactor->name);
return 0;
}
@@ -138,13 +138,13 @@ static const struct file_operations available_reactors_ops = {
*/
static int monitor_reactor_show(struct seq_file *m, void *p)
{
- struct rv_monitor_def *mdef = m->private;
- struct rv_reactor_def *rdef = p;
+ struct rv_monitor *mon = m->private;
+ struct rv_reactor *reactor = container_of(p, struct rv_reactor, list);
- if (mdef->rdef == rdef)
- seq_printf(m, "[%s]\n", rdef->reactor->name);
+ if (mon->reactor == reactor)
+ seq_printf(m, "[%s]\n", reactor->name);
else
- seq_printf(m, "%s\n", rdef->reactor->name);
+ seq_printf(m, "%s\n", reactor->name);
return 0;
}
@@ -158,43 +158,37 @@ static const struct seq_operations monitor_reactors_seq_ops = {
.show = monitor_reactor_show
};
-static void monitor_swap_reactors_single(struct rv_monitor_def *mdef,
- struct rv_reactor_def *rdef,
- bool reacting, bool nested)
+static void monitor_swap_reactors_single(struct rv_monitor *mon,
+ struct rv_reactor *reactor,
+ bool nested)
{
bool monitor_enabled;
/* nothing to do */
- if (mdef->rdef == rdef)
+ if (mon->reactor == reactor)
return;
- monitor_enabled = mdef->monitor->enabled;
+ monitor_enabled = mon->enabled;
if (monitor_enabled)
- rv_disable_monitor(mdef);
+ rv_disable_monitor(mon);
- /* swap reactor's usage */
- mdef->rdef->counter--;
- rdef->counter++;
-
- mdef->rdef = rdef;
- mdef->reacting = reacting;
- mdef->monitor->react = rdef->reactor->react;
+ mon->reactor = reactor;
+ mon->react = reactor->react;
/* enable only once if iterating through a container */
if (monitor_enabled && !nested)
- rv_enable_monitor(mdef);
+ rv_enable_monitor(mon);
}
-static void monitor_swap_reactors(struct rv_monitor_def *mdef,
- struct rv_reactor_def *rdef, bool reacting)
+static void monitor_swap_reactors(struct rv_monitor *mon, struct rv_reactor *reactor)
{
- struct rv_monitor_def *p = mdef;
+ struct rv_monitor *p = mon;
- if (rv_is_container_monitor(mdef))
+ if (rv_is_container_monitor(mon))
list_for_each_entry_continue(p, &rv_monitors_list, list) {
- if (p->parent != mdef->monitor)
+ if (p->parent != mon)
break;
- monitor_swap_reactors_single(p, rdef, reacting, true);
+ monitor_swap_reactors_single(p, reactor, true);
}
/*
* This call enables and disables the monitor if they were active.
@@ -202,7 +196,7 @@ static void monitor_swap_reactors(struct rv_monitor_def *mdef,
* All nested monitors are enabled also if they were off, we may refine
* this logic in the future.
*/
- monitor_swap_reactors_single(mdef, rdef, reacting, false);
+ monitor_swap_reactors_single(mon, reactor, false);
}
static ssize_t
@@ -210,11 +204,10 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
char buff[MAX_RV_REACTOR_NAME_SIZE + 2];
- struct rv_monitor_def *mdef;
- struct rv_reactor_def *rdef;
+ struct rv_monitor *mon;
+ struct rv_reactor *reactor;
struct seq_file *seq_f;
int retval = -EINVAL;
- bool enable;
char *ptr;
int len;
@@ -237,22 +230,17 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
* See monitor_reactors_open()
*/
seq_f = file->private_data;
- mdef = seq_f->private;
+ mon = seq_f->private;
mutex_lock(&rv_interface_lock);
retval = -EINVAL;
- list_for_each_entry(rdef, &rv_reactors_list, list) {
- if (strcmp(ptr, rdef->reactor->name) != 0)
+ list_for_each_entry(reactor, &rv_reactors_list, list) {
+ if (strcmp(ptr, reactor->name) != 0)
continue;
- if (rdef == get_reactor_rdef_by_name("nop"))
- enable = false;
- else
- enable = true;
-
- monitor_swap_reactors(mdef, rdef, enable);
+ monitor_swap_reactors(mon, reactor);
retval = count;
break;
@@ -268,7 +256,7 @@ monitor_reactors_write(struct file *file, const char __user *user_buf,
*/
static int monitor_reactors_open(struct inode *inode, struct file *file)
{
- struct rv_monitor_def *mdef = inode->i_private;
+ struct rv_monitor *mon = inode->i_private;
struct seq_file *seq_f;
int ret;
@@ -284,7 +272,7 @@ static int monitor_reactors_open(struct inode *inode, struct file *file)
/*
* Copy the create file "private" data to the seq_file private data.
*/
- seq_f->private = mdef;
+ seq_f->private = mon;
return 0;
};
@@ -299,23 +287,16 @@ static const struct file_operations monitor_reactors_ops = {
static int __rv_register_reactor(struct rv_reactor *reactor)
{
- struct rv_reactor_def *r;
+ struct rv_reactor *r;
list_for_each_entry(r, &rv_reactors_list, list) {
- if (strcmp(reactor->name, r->reactor->name) == 0) {
+ if (strcmp(reactor->name, r->name) == 0) {
pr_info("Reactor %s is already registered\n", reactor->name);
return -EINVAL;
}
}
- r = kzalloc(sizeof(struct rv_reactor_def), GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- r->reactor = reactor;
- r->counter = 0;
-
- list_add_tail(&r->list, &rv_reactors_list);
+ list_add_tail(&reactor->list, &rv_reactors_list);
return 0;
}
@@ -350,30 +331,10 @@ int rv_register_reactor(struct rv_reactor *reactor)
*/
int rv_unregister_reactor(struct rv_reactor *reactor)
{
- struct rv_reactor_def *ptr, *next;
- int ret = 0;
-
mutex_lock(&rv_interface_lock);
-
- list_for_each_entry_safe(ptr, next, &rv_reactors_list, list) {
- if (strcmp(reactor->name, ptr->reactor->name) == 0) {
-
- if (!ptr->counter) {
- list_del(&ptr->list);
- } else {
- printk(KERN_WARNING
- "rv: the rv_reactor %s is in use by %d monitor(s)\n",
- ptr->reactor->name, ptr->counter);
- printk(KERN_WARNING "rv: the rv_reactor %s cannot be removed\n",
- ptr->reactor->name);
- ret = -EBUSY;
- break;
- }
- }
- }
-
+ list_del(&reactor->list);
mutex_unlock(&rv_interface_lock);
- return ret;
+ return 0;
}
/*
@@ -454,43 +415,30 @@ static const struct file_operations reacting_on_fops = {
/**
* reactor_populate_monitor - creates per monitor reactors file
- * @mdef: monitor's definition.
+ * @mon: The monitor.
*
* Returns 0 if successful, error otherwise.
*/
-int reactor_populate_monitor(struct rv_monitor_def *mdef)
+int reactor_populate_monitor(struct rv_monitor *mon)
{
struct dentry *tmp;
- tmp = rv_create_file("reactors", RV_MODE_WRITE, mdef->root_d, mdef, &monitor_reactors_ops);
+ tmp = rv_create_file("reactors", RV_MODE_WRITE, mon->root_d, mon, &monitor_reactors_ops);
if (!tmp)
return -ENOMEM;
/*
* Configure as the rv_nop reactor.
*/
- mdef->rdef = get_reactor_rdef_by_name("nop");
- mdef->rdef->counter++;
- mdef->reacting = false;
+ mon->reactor = get_reactor_rdef_by_name("nop");
return 0;
}
-/**
- * reactor_cleanup_monitor - cleanup a monitor reference
- * @mdef: monitor's definition.
- */
-void reactor_cleanup_monitor(struct rv_monitor_def *mdef)
-{
- lockdep_assert_held(&rv_interface_lock);
- mdef->rdef->counter--;
- WARN_ON_ONCE(mdef->rdef->counter < 0);
-}
-
/*
* Nop reactor register
*/
-static void rv_nop_reaction(char *msg)
+__printf(1, 2) static void rv_nop_reaction(const char *msg, ...)
{
}
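
rv_reactor_def and its usage counter are gone as well: a reactor now embeds its own list node and is passed straight to rv_register_reactor()/rv_unregister_reactor(), and the rv_nop_reaction() change above suggests react() is now printf-style. A hedged sketch follows; only the fields touched by rv_reactors.c in this patch (.name, .react, .list) are assumed, and the demo_* names are invented.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/rv.h>
#include <linux/stdarg.h>

static __printf(1, 2) void demo_react(const char *msg, ...)
{
	va_list args;

	va_start(args, msg);
	vprintk(msg, args);
	va_end(args);
}

static struct rv_reactor rv_demo_reactor = {
	.name	= "demo_printk",
	.react	= demo_react,
};

static int __init demo_reactor_init(void)
{
	return rv_register_reactor(&rv_demo_reactor);
}

static void __exit demo_reactor_exit(void)
{
	rv_unregister_reactor(&rv_demo_reactor);
}

module_init(demo_reactor_init);
module_exit(demo_reactor_exit);
MODULE_LICENSE("GPL");
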
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
index 422b75f58891..4a6faddac614 100644
--- a/kernel/trace/rv/rv_trace.h
+++ b/kernel/trace/rv/rv_trace.h
@@ -16,24 +16,24 @@ DECLARE_EVENT_CLASS(event_da_monitor,
TP_ARGS(state, event, next_state, final_state),
TP_STRUCT__entry(
- __array( char, state, MAX_DA_NAME_LEN )
- __array( char, event, MAX_DA_NAME_LEN )
- __array( char, next_state, MAX_DA_NAME_LEN )
- __field( bool, final_state )
+ __string( state, state )
+ __string( event, event )
+ __string( next_state, next_state )
+ __field( bool, final_state )
),
TP_fast_assign(
- memcpy(__entry->state, state, MAX_DA_NAME_LEN);
- memcpy(__entry->event, event, MAX_DA_NAME_LEN);
- memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
- __entry->final_state = final_state;
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(next_state);
+ __entry->final_state = final_state;
),
- TP_printk("%s x %s -> %s %s",
- __entry->state,
- __entry->event,
- __entry->next_state,
- __entry->final_state ? "(final)" : "")
+ TP_printk("%s x %s -> %s%s",
+ __get_str(state),
+ __get_str(event),
+ __get_str(next_state),
+ __entry->final_state ? " (final)" : "")
);
DECLARE_EVENT_CLASS(error_da_monitor,
@@ -43,26 +43,26 @@ DECLARE_EVENT_CLASS(error_da_monitor,
TP_ARGS(state, event),
TP_STRUCT__entry(
- __array( char, state, MAX_DA_NAME_LEN )
- __array( char, event, MAX_DA_NAME_LEN )
+ __string( state, state )
+ __string( event, event )
),
TP_fast_assign(
- memcpy(__entry->state, state, MAX_DA_NAME_LEN);
- memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ __assign_str(state);
+ __assign_str(event);
),
TP_printk("event %s not expected in the state %s",
- __entry->event,
- __entry->state)
+ __get_str(event),
+ __get_str(state))
);
#include <monitors/wip/wip_trace.h>
-#include <monitors/tss/tss_trace.h>
#include <monitors/sco/sco_trace.h>
#include <monitors/scpd/scpd_trace.h>
#include <monitors/snep/snep_trace.h>
-#include <monitors/sncid/sncid_trace.h>
+#include <monitors/sts/sts_trace.h>
+#include <monitors/opid/opid_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here
#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
@@ -75,27 +75,27 @@ DECLARE_EVENT_CLASS(event_da_monitor_id,
TP_ARGS(id, state, event, next_state, final_state),
TP_STRUCT__entry(
- __field( int, id )
- __array( char, state, MAX_DA_NAME_LEN )
- __array( char, event, MAX_DA_NAME_LEN )
- __array( char, next_state, MAX_DA_NAME_LEN )
- __field( bool, final_state )
+ __field( int, id )
+ __string( state, state )
+ __string( event, event )
+ __string( next_state, next_state )
+ __field( bool, final_state )
),
TP_fast_assign(
- memcpy(__entry->state, state, MAX_DA_NAME_LEN);
- memcpy(__entry->event, event, MAX_DA_NAME_LEN);
- memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
- __entry->id = id;
- __entry->final_state = final_state;
+ __assign_str(state);
+ __assign_str(event);
+ __assign_str(next_state);
+ __entry->id = id;
+ __entry->final_state = final_state;
),
- TP_printk("%d: %s x %s -> %s %s",
+ TP_printk("%d: %s x %s -> %s%s",
__entry->id,
- __entry->state,
- __entry->event,
- __entry->next_state,
- __entry->final_state ? "(final)" : "")
+ __get_str(state),
+ __get_str(event),
+ __get_str(next_state),
+ __entry->final_state ? " (final)" : "")
);
DECLARE_EVENT_CLASS(error_da_monitor_id,
@@ -105,32 +105,108 @@ DECLARE_EVENT_CLASS(error_da_monitor_id,
TP_ARGS(id, state, event),
TP_STRUCT__entry(
- __field( int, id )
- __array( char, state, MAX_DA_NAME_LEN )
- __array( char, event, MAX_DA_NAME_LEN )
+ __field( int, id )
+ __string( state, state )
+ __string( event, event )
),
TP_fast_assign(
- memcpy(__entry->state, state, MAX_DA_NAME_LEN);
- memcpy(__entry->event, event, MAX_DA_NAME_LEN);
- __entry->id = id;
+ __assign_str(state);
+ __assign_str(event);
+ __entry->id = id;
),
TP_printk("%d: event %s not expected in the state %s",
__entry->id,
- __entry->event,
- __entry->state)
+ __get_str(event),
+ __get_str(state))
);
#include <monitors/wwnr/wwnr_trace.h>
#include <monitors/snroc/snroc_trace.h>
+#include <monitors/nrp/nrp_trace.h>
+#include <monitors/sssw/sssw_trace.h>
// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
#endif /* CONFIG_DA_MON_EVENTS_ID */
+#ifdef CONFIG_LTL_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_ltl_monitor_id,
+
+ TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
+
+ TP_ARGS(task, states, atoms, next),
+
+ TP_STRUCT__entry(
+ __string(comm, task->comm)
+ __field(pid_t, pid)
+ __string(states, states)
+ __string(atoms, atoms)
+ __string(next, next)
+ ),
+
+ TP_fast_assign(
+ __assign_str(comm);
+ __entry->pid = task->pid;
+ __assign_str(states);
+ __assign_str(atoms);
+ __assign_str(next);
+ ),
+
+ TP_printk("%s[%d]: (%s) x (%s) -> (%s)", __get_str(comm), __entry->pid,
+ __get_str(states), __get_str(atoms), __get_str(next))
+);
+
+DECLARE_EVENT_CLASS(error_ltl_monitor_id,
+
+ TP_PROTO(struct task_struct *task),
+
+ TP_ARGS(task),
+
+ TP_STRUCT__entry(
+ __string(comm, task->comm)
+ __field(pid_t, pid)
+ ),
+
+ TP_fast_assign(
+ __assign_str(comm);
+ __entry->pid = task->pid;
+ ),
+
+ TP_printk("%s[%d]: violation detected", __get_str(comm), __entry->pid)
+);
+#include <monitors/pagefault/pagefault_trace.h>
+#include <monitors/sleep/sleep_trace.h>
+// Add new monitors based on CONFIG_LTL_MON_EVENTS_ID here
+#endif /* CONFIG_LTL_MON_EVENTS_ID */
+
+#ifdef CONFIG_RV_MON_MAINTENANCE_EVENTS
+/* Tracepoint useful for monitor development, currently only used in DA */
+TRACE_EVENT(rv_retries_error,
+
+ TP_PROTO(char *name, char *event),
+
+ TP_ARGS(name, event),
+
+ TP_STRUCT__entry(
+ __string( name, name )
+ __string( event, event )
+ ),
+
+ TP_fast_assign(
+ __assign_str(name);
+ __assign_str(event);
+ ),
+
+ TP_printk(__stringify(MAX_DA_RETRY_RACING_EVENTS)
+ " retries reached for event %s, resetting monitor %s",
+ __get_str(event), __get_str(name))
+);
+#endif /* CONFIG_RV_MON_MAINTENANCE_EVENTS */
#endif /* _TRACE_RV_H */
-/* This part ust be outside protection */
+/* This part must be outside protection */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE rv_trace
#include <trace/define_trace.h>
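
The per-monitor trace headers pulled in above (pagefault_trace.h, sleep_trace.h) are not part of this hunk; they are expected to simply instantiate the new LTL event classes from within the rv_trace.h include machinery. A plausible sketch, with the event names assumed rather than quoted from the real headers:

#ifdef CONFIG_RV_MON_SLEEP
DEFINE_EVENT(event_ltl_monitor_id, event_sleep,
	TP_PROTO(struct task_struct *task, char *states, char *atoms, char *next),
	TP_ARGS(task, states, atoms, next));

DEFINE_EVENT(error_ltl_monitor_id, error_sleep,
	TP_PROTO(struct task_struct *task),
	TP_ARGS(task));
#endif /* CONFIG_RV_MON_SLEEP */
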
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 95ae7c4e5835..b9716178f728 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -936,7 +936,6 @@ int tracing_is_enabled(void)
* return the mirror variable of the state of the ring buffer.
* It's a little racy, but we don't really care.
*/
- smp_rmb();
return !global_trace.buffer_disabled;
}
@@ -1107,8 +1106,6 @@ void tracer_tracing_on(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 0;
- /* Make the flag seen by readers */
- smp_wmb();
}
/**
@@ -1640,8 +1637,6 @@ void tracer_tracing_off(struct trace_array *tr)
* important to be fast than accurate.
*/
tr->buffer_disabled = 1;
- /* Make the flag seen by readers */
- smp_wmb();
}
/**
@@ -2710,8 +2705,6 @@ void trace_buffered_event_enable(void)
static void enable_trace_buffered_event(void *data)
{
- /* Probably not needed, but do it anyway */
- smp_rmb();
this_cpu_dec(trace_buffered_event_cnt);
}
@@ -4735,21 +4728,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->array_buffer->buffer,
- cpu, GFP_KERNEL);
- }
- ring_buffer_read_prepare_sync();
- for_each_tracing_cpu(cpu) {
- ring_buffer_read_start(iter->buffer_iter[cpu]);
+ ring_buffer_read_start(iter->array_buffer->buffer,
+ cpu, GFP_KERNEL);
tracing_iter_reset(iter, cpu);
}
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->array_buffer->buffer,
- cpu, GFP_KERNEL);
- ring_buffer_read_prepare_sync();
- ring_buffer_read_start(iter->buffer_iter[cpu]);
+ ring_buffer_read_start(iter->array_buffer->buffer,
+ cpu, GFP_KERNEL);
tracing_iter_reset(iter, cpu);
}
@@ -5937,17 +5924,27 @@ static inline void trace_insert_eval_map_file(struct module *mod,
struct trace_eval_map **start, int len) { }
#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
-static void trace_insert_eval_map(struct module *mod,
- struct trace_eval_map **start, int len)
+static void
+trace_event_update_with_eval_map(struct module *mod,
+ struct trace_eval_map **start,
+ int len)
{
struct trace_eval_map **map;
- if (len <= 0)
- return;
+ /* Always run sanitizer only if btf_type_tag attr exists. */
+ if (len <= 0) {
+ if (!(IS_ENABLED(CONFIG_DEBUG_INFO_BTF) &&
+ IS_ENABLED(CONFIG_PAHOLE_HAS_BTF_TAG) &&
+ __has_attribute(btf_type_tag)))
+ return;
+ }
map = start;
- trace_event_eval_update(map, len);
+ trace_event_update_all(map, len);
+
+ if (len <= 0)
+ return;
trace_insert_eval_map_file(mod, start, len);
}
@@ -6303,7 +6300,7 @@ static bool tracer_options_updated;
static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
/* Only enable if the directory has been created already. */
- if (!tr->dir)
+ if (!tr->dir && !(tr->flags & TRACE_ARRAY_FL_GLOBAL))
return;
/* Only create trace option files after update_tracer_options finish */
@@ -8984,13 +8981,13 @@ static inline __init int register_snapshot_cmd(void) { return 0; }
static struct dentry *tracing_get_dentry(struct trace_array *tr)
{
- if (WARN_ON(!tr->dir))
- return ERR_PTR(-ENODEV);
-
/* Top directory uses NULL as the parent */
if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
return NULL;
+ if (WARN_ON(!tr->dir))
+ return ERR_PTR(-ENODEV);
+
/* All sub buffers have a descriptor */
return tr->dir;
}
@@ -10256,6 +10253,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
ftrace_init_tracefs(tr, d_tracer);
}
+#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
@@ -10277,6 +10275,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
if (IS_ERR(fc))
return ERR_CAST(fc);
+ pr_warn("NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030\n");
+
ret = vfs_parse_fs_string(fc, "source",
"tracefs", strlen("tracefs"));
if (!ret)
@@ -10287,6 +10287,7 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
put_fs_context(fc);
return mnt;
}
+#endif
/**
* tracing_init_dentry - initialize top level trace array
@@ -10311,6 +10312,7 @@ int tracing_init_dentry(void)
if (WARN_ON(!tracefs_initialized()))
return -ENODEV;
+#ifdef CONFIG_TRACEFS_AUTOMOUNT_DEPRECATED
/*
* As there may still be users that expect the tracing
* files to exist in debugfs/tracing, we must automount
@@ -10319,6 +10321,7 @@ int tracing_init_dentry(void)
*/
tr->dir = debugfs_create_automount("tracing", NULL,
trace_automount, NULL);
+#endif
return 0;
}
@@ -10335,7 +10338,7 @@ static void __init eval_map_work_func(struct work_struct *work)
int len;
len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
- trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
+ trace_event_update_with_eval_map(NULL, __start_ftrace_eval_maps, len);
}
static int __init trace_eval_init(void)
@@ -10388,9 +10391,6 @@ bool module_exists(const char *module)
static void trace_module_add_evals(struct module *mod)
{
- if (!mod->num_trace_evals)
- return;
-
/*
* Modules with bad taint do not have events created, do
* not bother with enums either.
@@ -10398,7 +10398,8 @@ static void trace_module_add_evals(struct module *mod)
if (trace_module_has_bad_taint(mod))
return;
- trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
+ /* Even if there are no trace_evals, this still needs to sanitize field types. */
+ trace_event_update_with_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
}
#ifdef CONFIG_TRACE_EVAL_MAP_FILE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bd084953a98b..1dbf1d3cf2f1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -2125,13 +2125,13 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
-void trace_event_eval_update(struct trace_eval_map **map, int len);
+void trace_event_update_all(struct trace_eval_map **map, int len);
/* Used from boot time tracer */
extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set);
extern int trigger_process_regex(struct trace_event_file *file, char *buff);
#else
static inline void __init trace_event_init(void) { }
-static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
+static inline void trace_event_update_all(struct trace_eval_map **map, int len) { }
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 916555f0de81..a1d402124836 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -9,14 +9,15 @@
* Copyright (C) 2021, VMware Inc, Tzvetomir Stoyanov tz.stoyanov@gmail.com>
*
*/
+#include <linux/cleanup.h>
+#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/mutex.h>
-#include <linux/ftrace.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
-#include "trace_probe_tmpl.h"
#include "trace_probe_kernel.h"
+#include "trace_probe_tmpl.h"
#define EPROBE_EVENT_SYSTEM "eprobes"
@@ -343,10 +344,15 @@ get_event_field(struct fetch_insn *code, void *rec)
val = *(unsigned int *)addr;
break;
default:
- if (field->is_signed)
- val = *(long *)addr;
- else
- val = *(unsigned long *)addr;
+ if (field->size == sizeof(long)) {
+ if (field->is_signed)
+ val = *(long *)addr;
+ else
+ val = *(unsigned long *)addr;
+ break;
+ }
+ /* This is an array, point to the addr itself */
+ val = (unsigned long)addr;
break;
}
return val;
@@ -797,18 +803,20 @@ find_and_get_event(const char *system, const char *event_name)
static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[], int i)
{
- struct traceprobe_parse_context ctx = {
- .event = ep->event,
- .flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT,
- };
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
int ret;
- ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], &ctx);
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->event = ep->event;
+ ctx->flags = TPARG_FL_KERNEL | TPARG_FL_TEVENT;
+
+ ret = traceprobe_parse_probe_arg(&ep->tp, i, argv[i], ctx);
/* Handle symbols "@" */
if (!ret)
ret = traceprobe_update_arg(&ep->tp.args[i]);
- traceprobe_finish_parse(&ctx);
return ret;
}
@@ -869,10 +877,10 @@ static int __trace_eprobe_create(int argc, const char *argv[])
const char *event = NULL, *group = EPROBE_EVENT_SYSTEM;
const char *sys_event = NULL, *sys_name = NULL;
struct trace_event_call *event_call;
+ char *buf1 __free(kfree) = NULL;
+ char *buf2 __free(kfree) = NULL;
+ char *gbuf __free(kfree) = NULL;
struct trace_eprobe *ep = NULL;
- char buf1[MAX_EVENT_NAME_LEN];
- char buf2[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
int ret = 0, filter_idx = 0;
int i, filter_cnt;
@@ -883,6 +891,9 @@ static int __trace_eprobe_create(int argc, const char *argv[])
event = strchr(&argv[0][1], ':');
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ goto mem_error;
event++;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
@@ -892,6 +903,11 @@ static int __trace_eprobe_create(int argc, const char *argv[])
trace_probe_log_set_index(1);
sys_event = argv[1];
+
+ buf2 = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!buf2)
+ goto mem_error;
+
ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0);
if (ret || !sys_event || !sys_name) {
trace_probe_log_err(0, NO_EVENT_INFO);
@@ -899,7 +915,9 @@ static int __trace_eprobe_create(int argc, const char *argv[])
}
if (!event) {
- strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN);
+ buf1 = kstrdup(sys_event, GFP_KERNEL);
+ if (!buf1)
+ goto mem_error;
event = buf1;
}
@@ -972,6 +990,9 @@ static int __trace_eprobe_create(int argc, const char *argv[])
trace_probe_log_clear();
return ret;
+mem_error:
+ ret = -ENOMEM;
+ goto error;
parse_error:
ret = -EINVAL;
error:
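
The switch from on-stack buffers to kmalloc'd ones above leans on the scope-based cleanup helpers from <linux/cleanup.h>: a __free(kfree) annotation frees the pointer on every return path, so the error paths only have to set the error code and jump. A minimal sketch of the pattern outside the eprobe code, where demo_parse() and MAX_DEMO_LEN are invented:

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/string.h>

#define MAX_DEMO_LEN 64

static int demo_parse(const char *src)
{
	char *buf __free(kfree) = kmalloc(MAX_DEMO_LEN, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;

	if (strscpy(buf, src, MAX_DEMO_LEN) < 0)
		return -E2BIG;		/* buf is freed automatically here */

	pr_info("parsed: %s\n", buf);
	return 0;			/* ... and here as well */
}
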
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d01e5c910ce1..9f3e9537417d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -768,6 +768,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
{
struct trace_event_call *call = file->event_call;
struct trace_array *tr = file->tr;
+ bool soft_mode = atomic_read(&file->sm_ref) != 0;
int ret = 0;
int disable;
@@ -782,7 +783,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* is set we do not want the event to be enabled before we
* clear the bit.
*
- * When soft_disable is not set but the SOFT_MODE flag is,
+ * When soft_disable is not set but the soft_mode is,
* we do nothing. Do not disable the tracepoint, otherwise
* "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
*/
@@ -790,11 +791,11 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (atomic_dec_return(&file->sm_ref) > 0)
break;
disable = file->flags & EVENT_FILE_FL_SOFT_DISABLED;
- clear_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ soft_mode = false;
/* Disable use of trace_buffered_event */
trace_buffered_event_disable();
} else
- disable = !(file->flags & EVENT_FILE_FL_SOFT_MODE);
+ disable = !soft_mode;
if (disable && (file->flags & EVENT_FILE_FL_ENABLED)) {
clear_bit(EVENT_FILE_FL_ENABLED_BIT, &file->flags);
@@ -812,8 +813,8 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
WARN_ON_ONCE(ret);
}
- /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
- if (file->flags & EVENT_FILE_FL_SOFT_MODE)
+ /* If in soft mode, just set the SOFT_DISABLE_BIT, else clear it */
+ if (soft_mode)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
else
clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@@ -823,7 +824,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
* When soft_disable is set and enable is set, we want to
* register the tracepoint for the event, but leave the event
* as is. That means, if the event was already enabled, we do
- * nothing (but set SOFT_MODE). If the event is disabled, we
+ * nothing (but set soft_mode). If the event is disabled, we
* set SOFT_DISABLED before enabling the event tracepoint, so
* it still seems to be disabled.
*/
@@ -832,7 +833,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
else {
if (atomic_inc_return(&file->sm_ref) > 1)
break;
- set_bit(EVENT_FILE_FL_SOFT_MODE_BIT, &file->flags);
+ soft_mode = true;
/* Enable use of trace_buffered_event */
trace_buffered_event_enable();
}
@@ -840,7 +841,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
bool cmd = false, tgid = false;
- /* Keep the event disabled, when going to SOFT_MODE. */
+ /* Keep the event disabled, when going to soft mode. */
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
@@ -1792,8 +1793,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
!(flags & EVENT_FILE_FL_SOFT_DISABLED))
strcpy(buf, "1");
- if (flags & EVENT_FILE_FL_SOFT_DISABLED ||
- flags & EVENT_FILE_FL_SOFT_MODE)
+ if (atomic_read(&file->sm_ref) != 0)
strcat(buf, "*");
strcat(buf, "\n");
@@ -3267,43 +3267,120 @@ static void add_str_to_module(struct module *module, char *str)
list_add(&modstr->next, &module_strings);
}
+#define ATTRIBUTE_STR "__attribute__("
+#define ATTRIBUTE_STR_LEN (sizeof(ATTRIBUTE_STR) - 1)
+
+/* Remove all __attribute__() from @type. Return allocated string or @type. */
+static char *sanitize_field_type(const char *type)
+{
+ char *attr, *tmp, *next, *ret = (char *)type;
+ int depth;
+
+ next = (char *)type;
+ while ((attr = strstr(next, ATTRIBUTE_STR))) {
+ /* Retry if "__attribute__(" is a part of another word. */
+ if (attr != next && !isspace(attr[-1])) {
+ next = attr + ATTRIBUTE_STR_LEN;
+ continue;
+ }
+
+ if (ret == type) {
+ ret = kstrdup(type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!ret))
+ return NULL;
+ attr = ret + (attr - type);
+ }
+
+ /* the ATTRIBUTE_STR already has the first '(' */
+ depth = 1;
+ next = attr + ATTRIBUTE_STR_LEN;
+ do {
+ tmp = strpbrk(next, "()");
+ /* There are unbalanced parentheses */
+ if (WARN_ON_ONCE(!tmp)) {
+ kfree(ret);
+ return (char *)type;
+ }
+
+ if (*tmp == '(')
+ depth++;
+ else
+ depth--;
+ next = tmp + 1;
+ } while (depth > 0);
+ next = skip_spaces(next);
+ strcpy(attr, next);
+ next = attr;
+ }
+ return ret;
+}
+
+static char *find_replacable_eval(const char *type, const char *eval_string,
+ int len)
+{
+ char *ptr;
+
+ if (!eval_string)
+ return NULL;
+
+ ptr = strchr(type, '[');
+ if (!ptr)
+ return NULL;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ return NULL;
+
+ if (strncmp(eval_string, ptr, len) != 0)
+ return NULL;
+
+ return ptr;
+}
+
static void update_event_fields(struct trace_event_call *call,
struct trace_eval_map *map)
{
struct ftrace_event_field *field;
+ const char *eval_string = NULL;
struct list_head *head;
+ int len = 0;
char *ptr;
char *str;
- int len = strlen(map->eval_string);
/* Dynamic events should never have field maps */
- if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ if (call->flags & TRACE_EVENT_FL_DYNAMIC)
return;
+ if (map) {
+ eval_string = map->eval_string;
+ len = strlen(map->eval_string);
+ }
+
head = trace_get_fields(call);
list_for_each_entry(field, head, link) {
- ptr = strchr(field->type, '[');
- if (!ptr)
- continue;
- ptr++;
-
- if (!isalpha(*ptr) && *ptr != '_')
- continue;
+ str = sanitize_field_type(field->type);
+ if (!str)
+ return;
- if (strncmp(map->eval_string, ptr, len) != 0)
- continue;
+ ptr = find_replacable_eval(str, eval_string, len);
+ if (ptr) {
+ if (str == field->type) {
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ }
- str = kstrdup(field->type, GFP_KERNEL);
- if (WARN_ON_ONCE(!str))
- return;
- ptr = str + (ptr - field->type);
- ptr = eval_replace(ptr, map, len);
- /* enum/sizeof string smaller than value */
- if (WARN_ON_ONCE(!ptr)) {
- kfree(str);
- continue;
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
}
+ if (str == field->type)
+ continue;
/*
* If the event is part of a module, then we need to free the string
* when the module is removed. Otherwise, it will stay allocated
@@ -3313,14 +3390,18 @@ static void update_event_fields(struct trace_event_call *call,
add_str_to_module(call->module, str);
field->type = str;
+ if (field->filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(field->type);
}
}
-void trace_event_eval_update(struct trace_eval_map **map, int len)
+/* Update all events for replacing eval and sanitizing */
+void trace_event_update_all(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
bool first = false;
+ bool updated;
int last_i;
int i;
@@ -3333,6 +3414,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
last_system = call->class->system;
}
+ updated = false;
/*
* Since calls are grouped by systems, the likelihood that the
* next call in the iteration belongs to the same system as the
@@ -3352,8 +3434,12 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
}
update_event_printk(call, map[i]);
update_event_fields(call, map[i]);
+ updated = true;
}
}
+ /* If not updated yet, update field for sanitizing. */
+ if (!updated)
+ update_event_fields(call, NULL);
cond_resched();
}
up_write(&trace_event_sem);
@@ -3587,7 +3673,7 @@ static int probe_remove_event_call(struct trace_event_call *call)
continue;
/*
* We can't rely on ftrace_event_enable_disable(enable => 0)
- * we are going to do, EVENT_FILE_FL_SOFT_MODE can suppress
+ * we are going to do, soft mode can suppress
* TRACE_REG_UNREGISTER.
*/
if (file->flags & EVENT_FILE_FL_ENABLED)
@@ -3698,7 +3784,7 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
- /* Check for any strings allocade for this module */
+ /* Check for any strings allocated for this module */
list_for_each_entry_safe(modstr, m, &module_strings, next) {
if (modstr->module != mod)
continue;
@@ -4002,7 +4088,7 @@ static int free_probe_data(void *data)
edata->ref--;
if (!edata->ref) {
- /* Remove the SOFT_MODE flag */
+ /* Remove soft mode */
__ftrace_event_enable_disable(edata->file, 0, 1);
trace_event_put_ref(edata->file->event_call);
kfree(edata);
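
The heart of the new sanitize_field_type() is the balanced-parentheses walk that drops each __attribute__((...)) clause from a field's type string, so e.g. "__attribute__((aligned(4))) int vec[4]" becomes "int vec[4]". Below is a toy user-space rework of that walk for experimenting with inputs; it deliberately omits the in-kernel guard against "__attribute__(" appearing inside a longer identifier and is not the kernel code itself.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static void strip_attributes(char *type)
{
	static const char key[] = "__attribute__(";
	char *attr;

	while ((attr = strstr(type, key))) {
		char *p = attr + strlen(key);
		int depth = 1;	/* counts the '(' inside the keyword */

		/* skip to the matching closing parenthesis */
		while (depth > 0 && *p) {
			if (*p == '(')
				depth++;
			else if (*p == ')')
				depth--;
			p++;
		}
		while (isspace((unsigned char)*p))
			p++;
		/* shift the remainder over the attribute clause */
		memmove(attr, p, strlen(p) + 1);
	}
}

int main(void)
{
	char t[] = "__attribute__((aligned(4))) int vec[4]";

	strip_attributes(t);
	printf("%s\n", t);	/* prints: int vec[4] */
	return 0;
}
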
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 3885aadc434d..54226b48b2d1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1344,13 +1344,14 @@ struct filter_list {
struct filter_head {
struct list_head list;
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ struct rcu_work rwork;
+ };
};
-
-static void free_filter_list(struct rcu_head *rhp)
+static void free_filter_list(struct filter_head *filter_list)
{
- struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
struct filter_list *filter_item, *tmp;
list_for_each_entry_safe(filter_item, tmp, &filter_list->list, list) {
@@ -1361,9 +1362,20 @@ static void free_filter_list(struct rcu_head *rhp)
kfree(filter_list);
}
+static void free_filter_list_work(struct work_struct *work)
+{
+ struct filter_head *filter_list;
+
+ filter_list = container_of(to_rcu_work(work), struct filter_head, rwork);
+ free_filter_list(filter_list);
+}
+
static void free_filter_list_tasks(struct rcu_head *rhp)
{
- call_rcu(rhp, free_filter_list);
+ struct filter_head *filter_list = container_of(rhp, struct filter_head, rcu);
+
+ INIT_RCU_WORK(&filter_list->rwork, free_filter_list_work);
+ queue_rcu_work(system_wq, &filter_list->rwork);
}
/*
@@ -1460,7 +1472,7 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
tracepoint_synchronize_unregister();
if (head)
- free_filter_list(&head->rcu);
+ free_filter_list(head);
list_for_each_entry(file, &tr->events, list) {
if (file->system != dir || !file->filter)
@@ -2305,7 +2317,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
return 0;
fail:
/* No call succeeded */
- free_filter_list(&filter_list->rcu);
+ free_filter_list(filter_list);
parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
return -EINVAL;
fail_mem:
@@ -2315,7 +2327,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir,
if (!fail)
delay_free_filter(filter_list);
else
- free_filter_list(&filter_list->rcu);
+ free_filter_list(filter_list);
return -ENOMEM;
}
@@ -2900,6 +2912,10 @@ static __init int ftrace_test_event_filter(void)
if (i == DATA_CNT)
printk(KERN_CONT "OK\n");
+ /* Need to call ftrace_test_filter to prevent a warning */
+ if (!trace_ftrace_test_filter_enabled())
+ trace_ftrace_test_filter(1, 2, 3, 4, 5, 6, 7, 8);
+
return 0;
}
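
Moving the filter-list free from call_rcu() to queue_rcu_work() above means the free now runs from a workqueue after the grace period, i.e. in process context where it may sleep. The generic shape of that deferral, with struct demo_obj standing in for filter_head:

#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_obj {
	int payload;
	struct rcu_work rwork;
};

static void demo_obj_free_work(struct work_struct *work)
{
	struct demo_obj *obj = container_of(to_rcu_work(work), struct demo_obj, rwork);

	kfree(obj);	/* process context: sleeping is allowed here */
}

/* Free @obj after an RCU grace period, from workqueue context */
static void demo_obj_free_rcu(struct demo_obj *obj)
{
	INIT_RCU_WORK(&obj->rwork, demo_obj_free_work);
	queue_rcu_work(system_wq, &obj->rwork);
}
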
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index b40fa59159ac..b36ade43d4b3 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -4,15 +4,18 @@
* Copyright (C) 2022 Google LLC.
*/
#define pr_fmt(fmt) "trace_fprobe: " fmt
-#include <asm/ptrace.h>
#include <linux/fprobe.h>
+#include <linux/list.h>
#include <linux/module.h>
+#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/security.h>
#include <linux/tracepoint.h>
#include <linux/uaccess.h>
+#include <asm/ptrace.h>
+
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_kernel.h"
@@ -21,7 +24,6 @@
#define FPROBE_EVENT_SYSTEM "fprobes"
#define TRACEPOINT_EVENT_SYSTEM "tracepoints"
#define RETHOOK_MAXACTIVE_MAX 4096
-#define TRACEPOINT_STUB ERR_PTR(-ENOENT)
static int trace_fprobe_create(const char *raw_command);
static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev);
@@ -38,15 +40,156 @@ static struct dyn_event_operations trace_fprobe_ops = {
.match = trace_fprobe_match,
};
+/* List of tracepoint_user */
+static LIST_HEAD(tracepoint_user_list);
+static DEFINE_MUTEX(tracepoint_user_mutex);
+
+/* While a tracepoint_user is alive, @tpoint can be NULL while @refcount != 0. */
+struct tracepoint_user {
+ struct list_head list;
+ const char *name;
+ struct tracepoint *tpoint;
+ unsigned int refcount;
+};
+
+/* NOTE: you must lock tracepoint_user_mutex. */
+#define for_each_tracepoint_user(tuser) \
+ list_for_each_entry(tuser, &tracepoint_user_list, list)
+
+static int tracepoint_user_register(struct tracepoint_user *tuser)
+{
+ struct tracepoint *tpoint = tuser->tpoint;
+
+ if (!tpoint)
+ return 0;
+
+ return tracepoint_probe_register_prio_may_exist(tpoint,
+ tpoint->probestub, NULL, 0);
+}
+
+static void tracepoint_user_unregister(struct tracepoint_user *tuser)
+{
+ if (!tuser->tpoint)
+ return;
+
+ WARN_ON_ONCE(tracepoint_probe_unregister(tuser->tpoint, tuser->tpoint->probestub, NULL));
+ tuser->tpoint = NULL;
+}
+
+static unsigned long tracepoint_user_ip(struct tracepoint_user *tuser)
+{
+ if (!tuser->tpoint)
+ return 0UL;
+
+ return (unsigned long)tuser->tpoint->probestub;
+}
+
+static void __tracepoint_user_free(struct tracepoint_user *tuser)
+{
+ if (!tuser)
+ return;
+ kfree(tuser->name);
+ kfree(tuser);
+}
+
+DEFINE_FREE(tuser_free, struct tracepoint_user *, __tracepoint_user_free(_T))
+
+static struct tracepoint_user *__tracepoint_user_init(const char *name, struct tracepoint *tpoint)
+{
+ struct tracepoint_user *tuser __free(tuser_free) = NULL;
+ int ret;
+
+ tuser = kzalloc(sizeof(*tuser), GFP_KERNEL);
+ if (!tuser)
+ return NULL;
+ tuser->name = kstrdup(name, GFP_KERNEL);
+ if (!tuser->name)
+ return NULL;
+
+ if (tpoint) {
+ ret = tracepoint_user_register(tuser);
+ if (ret)
+ return ERR_PTR(ret);
+ }
+
+ tuser->tpoint = tpoint;
+ tuser->refcount = 1;
+ INIT_LIST_HEAD(&tuser->list);
+ list_add(&tuser->list, &tracepoint_user_list);
+
+ return_ptr(tuser);
+}
+
+static struct tracepoint *find_tracepoint(const char *tp_name,
+ struct module **tp_mod);
+
+/*
+ * Get the tracepoint_user if it exists, or allocate a new one and register it.
+ * If the tracepoint is in a module, get the module's refcount too.
+ * This returns an ERR_PTR, NULL (not loaded yet), or a tracepoint_user.
+ */
+static struct tracepoint_user *tracepoint_user_find_get(const char *name, struct module **pmod)
+{
+ struct module *mod __free(module_put) = NULL;
+ struct tracepoint_user *tuser;
+ struct tracepoint *tpoint;
+
+ if (!name || !pmod)
+ return ERR_PTR(-EINVAL);
+
+ /* Get and lock the module which has tracepoint. */
+ tpoint = find_tracepoint(name, &mod);
+
+ guard(mutex)(&tracepoint_user_mutex);
+ /* Search existing tracepoint_user */
+ for_each_tracepoint_user(tuser) {
+ if (!strcmp(tuser->name, name)) {
+ tuser->refcount++;
+ *pmod = no_free_ptr(mod);
+ return tuser;
+ }
+ }
+
+ /* The corresponding tracepoint_user is not found. */
+ tuser = __tracepoint_user_init(name, tpoint);
+ if (!IS_ERR_OR_NULL(tuser))
+ *pmod = no_free_ptr(mod);
+
+ return tuser;
+}
+
+static void tracepoint_user_put(struct tracepoint_user *tuser)
+{
+ scoped_guard(mutex, &tracepoint_user_mutex) {
+ if (--tuser->refcount > 0)
+ return;
+
+ list_del(&tuser->list);
+ tracepoint_user_unregister(tuser);
+ }
+
+ __tracepoint_user_free(tuser);
+}
+
+DEFINE_FREE(tuser_put, struct tracepoint_user *,
+ if (!IS_ERR_OR_NULL(_T))
+ tracepoint_user_put(_T))
+
/*
* Fprobe event core functions
*/
+
+/*
+ * @tprobe is true for tracepoint probe.
+ * @tuser can be NULL if the trace_fprobe is disabled or the tracepoint is not
+ * loaded with a module. If @tuser != NULL, this trace_fprobe is enabled.
+ */
struct trace_fprobe {
struct dyn_event devent;
struct fprobe fp;
const char *symbol;
- struct tracepoint *tpoint;
- struct module *mod;
+ bool tprobe;
+ struct tracepoint_user *tuser;
struct trace_probe tp;
};
@@ -76,7 +219,7 @@ static bool trace_fprobe_is_return(struct trace_fprobe *tf)
static bool trace_fprobe_is_tracepoint(struct trace_fprobe *tf)
{
- return tf->tpoint != NULL;
+ return tf->tprobe;
}
static const char *trace_fprobe_symbol(struct trace_fprobe *tf)
@@ -411,6 +554,8 @@ static void free_trace_fprobe(struct trace_fprobe *tf)
{
if (tf) {
trace_probe_cleanup(&tf->tp);
+ if (tf->tuser)
+ tracepoint_user_put(tf->tuser);
kfree(tf->symbol);
kfree(tf);
}
@@ -425,9 +570,8 @@ DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) f
static struct trace_fprobe *alloc_trace_fprobe(const char *group,
const char *event,
const char *symbol,
- struct tracepoint *tpoint,
- struct module *mod,
- int nargs, bool is_return)
+ int nargs, bool is_return,
+ bool is_tracepoint)
{
struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
int ret = -ENOMEM;
@@ -445,8 +589,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
else
tf->fp.entry_handler = fentry_dispatcher;
- tf->tpoint = tpoint;
- tf->mod = mod;
+ tf->tprobe = is_tracepoint;
ret = trace_probe_init(&tf->tp, event, group, false, nargs);
if (ret < 0)
@@ -469,98 +612,6 @@ static struct trace_fprobe *find_trace_fprobe(const char *event,
return NULL;
}
-static inline int __enable_trace_fprobe(struct trace_fprobe *tf)
-{
- if (trace_fprobe_is_registered(tf))
- enable_fprobe(&tf->fp);
-
- return 0;
-}
-
-static void __disable_trace_fprobe(struct trace_probe *tp)
-{
- struct trace_fprobe *tf;
-
- list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
- if (!trace_fprobe_is_registered(tf))
- continue;
- disable_fprobe(&tf->fp);
- }
-}
-
-/*
- * Enable trace_probe
- * if the file is NULL, enable "perf" handler, or enable "trace" handler.
- */
-static int enable_trace_fprobe(struct trace_event_call *call,
- struct trace_event_file *file)
-{
- struct trace_probe *tp;
- struct trace_fprobe *tf;
- bool enabled;
- int ret = 0;
-
- tp = trace_probe_primary_from_call(call);
- if (WARN_ON_ONCE(!tp))
- return -ENODEV;
- enabled = trace_probe_is_enabled(tp);
-
- /* This also changes "enabled" state */
- if (file) {
- ret = trace_probe_add_file(tp, file);
- if (ret)
- return ret;
- } else
- trace_probe_set_flag(tp, TP_FLAG_PROFILE);
-
- if (!enabled) {
- list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
- /* TODO: check the fprobe is gone */
- __enable_trace_fprobe(tf);
- }
- }
-
- return 0;
-}
-
-/*
- * Disable trace_probe
- * if the file is NULL, disable "perf" handler, or disable "trace" handler.
- */
-static int disable_trace_fprobe(struct trace_event_call *call,
- struct trace_event_file *file)
-{
- struct trace_probe *tp;
-
- tp = trace_probe_primary_from_call(call);
- if (WARN_ON_ONCE(!tp))
- return -ENODEV;
-
- if (file) {
- if (!trace_probe_get_file_link(tp, file))
- return -ENOENT;
- if (!trace_probe_has_single_file(tp))
- goto out;
- trace_probe_clear_flag(tp, TP_FLAG_TRACE);
- } else
- trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
-
- if (!trace_probe_is_enabled(tp))
- __disable_trace_fprobe(tp);
-
- out:
- if (file)
- /*
- * Synchronization is done in below function. For perf event,
- * file == NULL and perf_trace_event_unreg() calls
- * tracepoint_synchronize_unregister() to ensure synchronize
- * event. We don't need to care about it.
- */
- trace_probe_remove_file(tp, file);
-
- return 0;
-}
-
/* Event entry printers */
static enum print_line_t
print_fentry_event(struct trace_iterator *iter, int flags,
@@ -712,20 +763,52 @@ static int unregister_fprobe_event(struct trace_fprobe *tf)
static int __regsiter_tracepoint_fprobe(struct trace_fprobe *tf)
{
- struct tracepoint *tpoint = tf->tpoint;
- unsigned long ip = (unsigned long)tpoint->probestub;
+ struct tracepoint_user *tuser __free(tuser_put) = NULL;
+ struct module *mod __free(module_put) = NULL;
+ unsigned long ip;
int ret;
+ if (WARN_ON_ONCE(tf->tuser))
+ return -EINVAL;
+
+ /* If the tracepoint is in a module, it must be locked in this function. */
+ tuser = tracepoint_user_find_get(tf->symbol, &mod);
+ /* This tracepoint is not loaded yet */
+ if (IS_ERR(tuser))
+ return PTR_ERR(tuser);
+ if (!tuser)
+ return -ENOMEM;
+
+ /* Register fprobe only if the tracepoint is loaded. */
+ if (tuser->tpoint) {
+ ip = tracepoint_user_ip(tuser);
+ if (WARN_ON_ONCE(!ip))
+ return -ENOENT;
+
+ ret = register_fprobe_ips(&tf->fp, &ip, 1);
+ if (ret < 0)
+ return ret;
+ }
+
+ tf->tuser = no_free_ptr(tuser);
+ return 0;
+}
+
+/* Returns an error if the target function is not available, or 0 */
+static int trace_fprobe_verify_target(struct trace_fprobe *tf)
+{
+ int ret;
+
+ /* Tracepoint should have a stub function. */
+ if (trace_fprobe_is_tracepoint(tf))
+ return 0;
+
/*
- * Here, we do 2 steps to enable fprobe on a tracepoint.
- * At first, put __probestub_##TP function on the tracepoint
- * and put a fprobe on the stub function.
+ * Note: since we don't lock the module, even if this succeeds,
+ * register_fprobe() can still fail later.
*/
- ret = tracepoint_probe_register_prio_may_exist(tpoint,
- tpoint->probestub, NULL, 0);
- if (ret < 0)
- return ret;
- return register_fprobe_ips(&tf->fp, &ip, 1);
+ ret = fprobe_count_ips_from_filter(tf->symbol, NULL);
+ return (ret < 0) ? ret : 0;
}
/* Internal register function - just handle fprobe and flags */
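The reworked registration path above leans on the scope-based cleanup helpers from <linux/cleanup.h>: tuser and mod are released automatically on every early return, and no_free_ptr() hands ownership to tf->tuser only on success. Below is a minimal sketch of that idiom; struct foo and the foo_free cleanup class are made up for illustration, standing in for the tuser_put/module_put classes the code above assumes.

#include <linux/cleanup.h>
#include <linux/slab.h>

struct foo { int x; };

/* Hypothetical cleanup class: kfree() the pointer when it goes out of scope. */
DEFINE_FREE(foo_free, struct foo *, if (_T) kfree(_T))

static struct foo *foo_create(int x)
{
	struct foo *f __free(foo_free) = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return NULL;		/* nothing allocated, __free() skips NULL */

	f->x = x;
	if (f->x < 0)
		return NULL;		/* error path: foo_free() runs here */

	return no_free_ptr(f);		/* success: disarm cleanup, caller owns f */
}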
@@ -747,20 +830,10 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
return ret;
}
- /* Set/clear disabled flag according to tp->flag */
- if (trace_probe_is_enabled(&tf->tp))
- tf->fp.flags &= ~FPROBE_FL_DISABLED;
- else
- tf->fp.flags |= FPROBE_FL_DISABLED;
-
- if (trace_fprobe_is_tracepoint(tf)) {
-
- /* This tracepoint is not loaded yet */
- if (tf->tpoint == TRACEPOINT_STUB)
- return 0;
+ tf->fp.flags &= ~FPROBE_FL_DISABLED;
+ if (trace_fprobe_is_tracepoint(tf))
return __regsiter_tracepoint_fprobe(tf);
- }
/* TODO: handle filter, nofilter or symbol list */
return register_fprobe(&tf->fp, tf->symbol, NULL);
@@ -769,15 +842,11 @@ static int __register_trace_fprobe(struct trace_fprobe *tf)
/* Internal unregister function - just handle fprobe and flags */
static void __unregister_trace_fprobe(struct trace_fprobe *tf)
{
- if (trace_fprobe_is_registered(tf)) {
+ if (trace_fprobe_is_registered(tf))
unregister_fprobe(&tf->fp);
- memset(&tf->fp, 0, sizeof(tf->fp));
- if (trace_fprobe_is_tracepoint(tf)) {
- tracepoint_probe_unregister(tf->tpoint,
- tf->tpoint->probestub, NULL);
- tf->tpoint = NULL;
- tf->mod = NULL;
- }
+ if (tf->tuser) {
+ tracepoint_user_put(tf->tuser);
+ tf->tuser = NULL;
}
}
@@ -837,7 +906,7 @@ static bool trace_fprobe_has_same_fprobe(struct trace_fprobe *orig,
return false;
}
-static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
+static int append_trace_fprobe_event(struct trace_fprobe *tf, struct trace_fprobe *to)
{
int ret;
@@ -865,7 +934,7 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
if (ret)
return ret;
- ret = __register_trace_fprobe(tf);
+ ret = trace_fprobe_verify_target(tf);
if (ret)
trace_probe_unlink(&tf->tp);
else
@@ -874,8 +943,8 @@ static int append_trace_fprobe(struct trace_fprobe *tf, struct trace_fprobe *to)
return ret;
}
-/* Register a trace_probe and probe_event */
-static int register_trace_fprobe(struct trace_fprobe *tf)
+/* Register a trace_probe and probe_event, and check the fprobe is available. */
+static int register_trace_fprobe_event(struct trace_fprobe *tf)
{
struct trace_fprobe *old_tf;
int ret;
@@ -885,7 +954,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
old_tf = find_trace_fprobe(trace_probe_name(&tf->tp),
trace_probe_group_name(&tf->tp));
if (old_tf)
- return append_trace_fprobe(tf, old_tf);
+ return append_trace_fprobe_event(tf, old_tf);
/* Register new event */
ret = register_fprobe_event(tf);
@@ -898,8 +967,8 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
return ret;
}
- /* Register fprobe */
- ret = __register_trace_fprobe(tf);
+ /* Verify fprobe is sane. */
+ ret = trace_fprobe_verify_target(tf);
if (ret < 0)
unregister_fprobe_event(tf);
else
@@ -963,15 +1032,6 @@ static struct tracepoint *find_tracepoint(const char *tp_name,
}
#ifdef CONFIG_MODULES
-static void reenable_trace_fprobe(struct trace_fprobe *tf)
-{
- struct trace_probe *tp = &tf->tp;
-
- list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
- __enable_trace_fprobe(tf);
- }
-}
-
/*
* Find a tracepoint from specified module. In this case, this does not get the
* module's refcount. The caller must ensure the module is not freed.
@@ -988,36 +1048,94 @@ static struct tracepoint *find_tracepoint_in_module(struct module *mod,
return data.tpoint;
}
+/* These are CONFIG_MODULES=y specific functions. */
+static bool tracepoint_user_within_module(struct tracepoint_user *tuser,
+ struct module *mod)
+{
+ return within_module(tracepoint_user_ip(tuser), mod);
+}
+
+static int tracepoint_user_register_again(struct tracepoint_user *tuser,
+ struct tracepoint *tpoint)
+{
+ tuser->tpoint = tpoint;
+ return tracepoint_user_register(tuser);
+}
+
+static void tracepoint_user_unregister_clear(struct tracepoint_user *tuser)
+{
+ tracepoint_user_unregister(tuser);
+ tuser->tpoint = NULL;
+}
+
+/* module callback for tracepoint_user */
static int __tracepoint_probe_module_cb(struct notifier_block *self,
unsigned long val, void *data)
{
struct tp_module *tp_mod = data;
+ struct tracepoint_user *tuser;
struct tracepoint *tpoint;
+
+ if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING)
+ return NOTIFY_DONE;
+
+ mutex_lock(&tracepoint_user_mutex);
+ for_each_tracepoint_user(tuser) {
+ if (val == MODULE_STATE_COMING) {
+ /* This is not a tracepoint in this module. Skip it. */
+ tpoint = find_tracepoint_in_module(tp_mod->mod, tuser->name);
+ if (!tpoint)
+ continue;
+ WARN_ON_ONCE(tracepoint_user_register_again(tuser, tpoint));
+ } else if (val == MODULE_STATE_GOING &&
+ tracepoint_user_within_module(tuser, tp_mod->mod)) {
+ /* Unregister all tracepoint_user in this module. */
+ tracepoint_user_unregister_clear(tuser);
+ }
+ }
+ mutex_unlock(&tracepoint_user_mutex);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block tracepoint_module_nb = {
+ .notifier_call = __tracepoint_probe_module_cb,
+};
+
+/* module callback for tprobe events */
+static int __tprobe_event_module_cb(struct notifier_block *self,
+ unsigned long val, void *data)
+{
struct trace_fprobe *tf;
struct dyn_event *pos;
+ struct module *mod = data;
if (val != MODULE_STATE_GOING && val != MODULE_STATE_COMING)
return NOTIFY_DONE;
mutex_lock(&event_mutex);
for_each_trace_fprobe(tf, pos) {
- if (val == MODULE_STATE_COMING && tf->tpoint == TRACEPOINT_STUB) {
- tpoint = find_tracepoint_in_module(tp_mod->mod, tf->symbol);
- if (tpoint) {
- tf->tpoint = tpoint;
- tf->mod = tp_mod->mod;
- if (!WARN_ON_ONCE(__regsiter_tracepoint_fprobe(tf)) &&
- trace_probe_is_enabled(&tf->tp))
- reenable_trace_fprobe(tf);
- }
- } else if (val == MODULE_STATE_GOING && tp_mod->mod == tf->mod) {
+ /* Skip fprobe and disabled tprobe events. */
+ if (!trace_fprobe_is_tracepoint(tf) || !tf->tuser)
+ continue;
+
+ /* By the time of this notification, the tracepoint notifier has already run. */
+ if (val == MODULE_STATE_COMING &&
+ tracepoint_user_within_module(tf->tuser, mod)) {
+ unsigned long ip = tracepoint_user_ip(tf->tuser);
+
+ WARN_ON_ONCE(register_fprobe_ips(&tf->fp, &ip, 1));
+ } else if (val == MODULE_STATE_GOING &&
+ /*
+ * tracepoint_user_within_module() does not work here because
+ * the tracepoint_user has already been unregistered and its
+ * tpoint cleared. Instead, check whether the fprobe is still
+ * registered while the tpoint is cleared (unregistered). Such
+ * unbalanced probes must be adjusted anyway.
+ */
+ trace_fprobe_is_registered(tf) &&
+ !tf->tuser->tpoint) {
unregister_fprobe(&tf->fp);
- if (trace_fprobe_is_tracepoint(tf)) {
- tracepoint_probe_unregister(tf->tpoint,
- tf->tpoint->probestub, NULL);
- tf->tpoint = TRACEPOINT_STUB;
- tf->mod = NULL;
- }
}
}
mutex_unlock(&event_mutex);
@@ -1025,8 +1143,11 @@ static int __tracepoint_probe_module_cb(struct notifier_block *self,
return NOTIFY_DONE;
}
-static struct notifier_block tracepoint_module_nb = {
- .notifier_call = __tracepoint_probe_module_cb,
+/* NOTE: this must be called after the tracepoint module notifier callback */
+static struct notifier_block tprobe_event_module_nb = {
+ .notifier_call = __tprobe_event_module_cb,
+ /* Make sure this is later than tracepoint module notifier. */
+ .priority = -10,
};
#endif /* CONFIG_MODULES */
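The .priority = -10 works because module notifier chains invoke entries in descending .priority order, so this callback only sees a module after the tracepoint core's own module notifier (assumed here to be registered at a higher priority) has handled it. A minimal sketch of registering two module notifiers with that ordering; both callbacks and names are hypothetical.

#include <linux/module.h>
#include <linux/notifier.h>

static int runs_first_cb(struct notifier_block *nb, unsigned long val, void *data)
{
	/* Sees MODULE_STATE_COMING/GOING before runs_second_cb() does. */
	return NOTIFY_DONE;
}

static int runs_second_cb(struct notifier_block *nb, unsigned long val, void *data)
{
	/* By now runs_first_cb() has already processed the same module. */
	return NOTIFY_DONE;
}

static struct notifier_block runs_first_nb = {
	.notifier_call	= runs_first_cb,	/* default priority 0 */
};

static struct notifier_block runs_second_nb = {
	.notifier_call	= runs_second_cb,
	.priority	= -10,			/* lower priority => called later */
};

static int __init notifier_order_example_init(void)
{
	int ret = register_module_notifier(&runs_first_nb);

	if (ret)
		return ret;
	return register_module_notifier(&runs_second_nb);
}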
@@ -1086,8 +1207,6 @@ static int parse_symbol_and_return(int argc, const char *argv[],
return 0;
}
-DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T))
-
static int trace_fprobe_create_internal(int argc, const char *argv[],
struct traceprobe_parse_context *ctx)
{
@@ -1116,19 +1235,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
- int i, new_argc = 0, ret = 0;
- bool is_return = false;
- char *symbol __free(kfree) = NULL;
const char *event = NULL, *group = FPROBE_EVENT_SYSTEM;
+ struct module *mod __free(module_put) = NULL;
const char **new_argv __free(kfree) = NULL;
- char buf[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
- char sbuf[KSYM_NAME_LEN];
- char abuf[MAX_BTF_ARGS_LEN];
+ char *symbol __free(kfree) = NULL;
+ char *ebuf __free(kfree) = NULL;
+ char *gbuf __free(kfree) = NULL;
+ char *sbuf __free(kfree) = NULL;
+ char *abuf __free(kfree) = NULL;
char *dbuf __free(kfree) = NULL;
+ int i, new_argc = 0, ret = 0;
bool is_tracepoint = false;
- struct module *tp_mod __free(module_put) = NULL;
- struct tracepoint *tpoint = NULL;
+ bool is_return = false;
if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
return -ECANCELED;
@@ -1156,6 +1274,9 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
trace_probe_log_set_index(0);
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ return -ENOMEM;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
@@ -1163,15 +1284,18 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
}
if (!event) {
+ ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!ebuf)
+ return -ENOMEM;
/* Make a new event name */
if (is_tracepoint)
- snprintf(buf, MAX_EVENT_NAME_LEN, "%s%s",
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s%s",
isdigit(*symbol) ? "_" : "", symbol);
else
- snprintf(buf, MAX_EVENT_NAME_LEN, "%s__%s", symbol,
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%s__%s", symbol,
is_return ? "exit" : "entry");
- sanitize_event_name(buf);
- event = buf;
+ sanitize_event_name(ebuf);
+ event = ebuf;
}
if (is_return)
@@ -1179,25 +1303,28 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
else
ctx->flags |= TPARG_FL_FENTRY;
+ ctx->funcname = NULL;
if (is_tracepoint) {
+ /* Get tracepoint and lock its module until the end of the registration. */
+ struct tracepoint *tpoint;
+
ctx->flags |= TPARG_FL_TPOINT;
- tpoint = find_tracepoint(symbol, &tp_mod);
+ mod = NULL;
+ tpoint = find_tracepoint(symbol, &mod);
if (tpoint) {
- ctx->funcname = kallsyms_lookup(
- (unsigned long)tpoint->probestub,
- NULL, NULL, NULL, sbuf);
- } else if (IS_ENABLED(CONFIG_MODULES)) {
- /* This *may* be loaded afterwards */
- tpoint = TRACEPOINT_STUB;
- ctx->funcname = symbol;
- } else {
- trace_probe_log_set_index(1);
- trace_probe_log_err(0, NO_TRACEPOINT);
- return -EINVAL;
+ sbuf = kmalloc(KSYM_NAME_LEN, GFP_KERNEL);
+ if (!sbuf)
+ return -ENOMEM;
+ ctx->funcname = kallsyms_lookup((unsigned long)tpoint->probestub,
+ NULL, NULL, NULL, sbuf);
}
- } else
+ }
+ if (!ctx->funcname)
ctx->funcname = symbol;
+ abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL);
+ if (!abuf)
+ return -ENOMEM;
argc -= 2; argv += 2;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
abuf, MAX_BTF_ARGS_LEN, ctx);
@@ -1218,8 +1345,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
return ret;
/* setup a probe */
- tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod,
- argc, is_return);
+ tf = alloc_trace_fprobe(group, event, symbol, argc, is_return, is_tracepoint);
if (IS_ERR(tf)) {
ret = PTR_ERR(tf);
/* This must return -ENOMEM, else there is a bug */
@@ -1251,7 +1377,7 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
if (ret < 0)
return ret;
- ret = register_trace_fprobe(tf);
+ ret = register_trace_fprobe_event(tf);
if (ret) {
trace_probe_log_set_index(1);
if (ret == -EILSEQ)
@@ -1271,14 +1397,17 @@ static int trace_fprobe_create_internal(int argc, const char *argv[],
static int trace_fprobe_create_cb(int argc, const char *argv[])
{
- struct traceprobe_parse_context ctx = {
- .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
- };
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
int ret;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE;
+
trace_probe_log_init("trace_fprobe", argc, argv);
- ret = trace_fprobe_create_internal(argc, argv, &ctx);
- traceprobe_finish_parse(&ctx);
+ ret = trace_fprobe_create_internal(argc, argv, ctx);
trace_probe_log_clear();
return ret;
}
@@ -1321,6 +1450,84 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
}
/*
+ * Enable trace_probe
+ * if the file is NULL, enable "perf" handler, or enable "trace" handler.
+ */
+static int enable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_probe *tp;
+ struct trace_fprobe *tf;
+ bool enabled;
+ int ret = 0;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+ enabled = trace_probe_is_enabled(tp);
+
+ /* This also changes "enabled" state */
+ if (file) {
+ ret = trace_probe_add_file(tp, file);
+ if (ret)
+ return ret;
+ } else
+ trace_probe_set_flag(tp, TP_FLAG_PROFILE);
+
+ if (!enabled) {
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ ret = __register_trace_fprobe(tf);
+ if (ret < 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Disable trace_probe
+ * if the file is NULL, disable "perf" handler, or disable "trace" handler.
+ */
+static int disable_trace_fprobe(struct trace_event_call *call,
+ struct trace_event_file *file)
+{
+ struct trace_fprobe *tf;
+ struct trace_probe *tp;
+
+ tp = trace_probe_primary_from_call(call);
+ if (WARN_ON_ONCE(!tp))
+ return -ENODEV;
+
+ if (file) {
+ if (!trace_probe_get_file_link(tp, file))
+ return -ENOENT;
+ if (!trace_probe_has_single_file(tp))
+ goto out;
+ trace_probe_clear_flag(tp, TP_FLAG_TRACE);
+ } else
+ trace_probe_clear_flag(tp, TP_FLAG_PROFILE);
+
+ if (!trace_probe_is_enabled(tp)) {
+ list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) {
+ unregister_fprobe(&tf->fp);
+ }
+ }
+
+ out:
+ if (file)
+ /*
+ * Synchronization is done in below function. For perf event,
+ * file == NULL and perf_trace_event_unreg() calls
+ * tracepoint_synchronize_unregister() to ensure synchronize
+ * event. We don't need to care about it.
+ */
+ trace_probe_remove_file(tp, file);
+
+ return 0;
+}
+
+/*
* called by perf_trace_init() or __ftrace_set_clr_event() under event_mutex.
*/
static int fprobe_register(struct trace_event_call *event,
@@ -1365,6 +1572,9 @@ static __init int init_fprobe_trace_early(void)
ret = register_tracepoint_module_notifier(&tracepoint_module_nb);
if (ret)
return ret;
+ ret = register_module_notifier(&tprobe_event_module_nb);
+ if (ret)
+ return ret;
#endif
return 0;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 14d74a7491b8..66e1a527cf1a 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -513,7 +513,7 @@ static void print_graph_proc(struct trace_seq *s, pid_t pid)
{
char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
- char pid_str[11];
+ char pid_str[12];
int spaces = 0;
int len;
int i;
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index b65353ec2837..2f7b94e98317 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -325,12 +325,9 @@ static void move_to_next_cpu(void)
cpus_read_lock();
cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
- next_cpu = cpumask_next(raw_smp_processor_id(), current_mask);
+ next_cpu = cpumask_next_wrap(raw_smp_processor_id(), current_mask);
cpus_read_unlock();
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(current_mask);
-
if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
goto change_mode;
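cpumask_next_wrap() folds the removed next/first fallback into one helper. A sketch of the behaviour this hunk (and the later watchdog/workqueue hunks) relies on, in the two-argument form used here: return the next set CPU in the mask after @n, wrapping back to the start, possibly landing on @n itself, and only returning >= nr_cpu_ids when the mask is empty. This is an illustration of the expected semantics, not the kernel's implementation.

#include <linux/cpumask.h>

static inline unsigned int cpumask_next_wrap_sketch(int n, const struct cpumask *mask)
{
	unsigned int cpu = cpumask_next(n, mask);

	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(mask);	/* wrap; may land back on @n */

	return cpu;	/* >= nr_cpu_ids only when @mask is empty */
}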
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index d7b135de958a..896ff78b8349 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -43,17 +43,15 @@ static void ftrace_dump_buf(int skip_entries, long cpu_file)
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.array_buffer->buffer,
- cpu, GFP_ATOMIC);
- ring_buffer_read_start(iter.buffer_iter[cpu]);
+ ring_buffer_read_start(iter.array_buffer->buffer,
+ cpu, GFP_ATOMIC);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.array_buffer->buffer,
+ ring_buffer_read_start(iter.array_buffer->buffer,
cpu_file, GFP_ATOMIC);
- ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 3e5c47b6d7b2..ccae62d4fb91 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -9,19 +9,19 @@
#include <linux/bpf-cgroup.h>
#include <linux/cleanup.h>
-#include <linux/security.h>
+#include <linux/error-injection.h>
#include <linux/module.h>
-#include <linux/uaccess.h>
#include <linux/rculist.h>
-#include <linux/error-injection.h>
+#include <linux/security.h>
+#include <linux/uaccess.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */
#include "trace_dynevent.h"
#include "trace_kprobe_selftest.h"
#include "trace_probe.h"
-#include "trace_probe_tmpl.h"
#include "trace_probe_kernel.h"
+#include "trace_probe_tmpl.h"
#define KPROBE_EVENT_SYSTEM "kprobes"
#define KRETPROBE_MAXACTIVE_MAX 4096
@@ -861,20 +861,20 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
+ const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
+ const char **new_argv __free(kfree) = NULL;
int i, len, new_argc = 0, ret = 0;
- bool is_return = false;
char *symbol __free(kfree) = NULL;
- char *tmp = NULL;
- const char **new_argv __free(kfree) = NULL;
- const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
+ char *ebuf __free(kfree) = NULL;
+ char *gbuf __free(kfree) = NULL;
+ char *abuf __free(kfree) = NULL;
+ char *dbuf __free(kfree) = NULL;
enum probe_print_type ptype;
+ bool is_return = false;
int maxactive = 0;
- long offset = 0;
void *addr = NULL;
- char buf[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
- char abuf[MAX_BTF_ARGS_LEN];
- char *dbuf __free(kfree) = NULL;
+ char *tmp = NULL;
+ long offset = 0;
switch (argv[0][0]) {
case 'r':
@@ -893,6 +893,8 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
event++;
if (isdigit(argv[0][1])) {
+ char *buf __free(kfree) = NULL;
+
if (!is_return) {
trace_probe_log_err(1, BAD_MAXACT_TYPE);
return -EINVAL;
@@ -905,7 +907,7 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
trace_probe_log_err(1, BAD_MAXACT);
return -EINVAL;
}
- memcpy(buf, &argv[0][1], len);
+ buf = kmemdup(&argv[0][1], len + 1, GFP_KERNEL);
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {
@@ -973,6 +975,9 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
trace_probe_log_set_index(0);
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ return -ENOMEM;
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
@@ -981,16 +986,22 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
if (!event) {
/* Make a new event name */
+ ebuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!ebuf)
+ return -ENOMEM;
if (symbol)
- snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
is_return ? 'r' : 'p', symbol, offset);
else
- snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
+ snprintf(ebuf, MAX_EVENT_NAME_LEN, "%c_0x%p",
is_return ? 'r' : 'p', addr);
- sanitize_event_name(buf);
- event = buf;
+ sanitize_event_name(ebuf);
+ event = ebuf;
}
+ abuf = kmalloc(MAX_BTF_ARGS_LEN, GFP_KERNEL);
+ if (!abuf)
+ return -ENOMEM;
argc -= 2; argv += 2;
ctx->funcname = symbol;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
@@ -1065,14 +1076,18 @@ static int trace_kprobe_create_internal(int argc, const char *argv[],
static int trace_kprobe_create_cb(int argc, const char *argv[])
{
- struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context) = NULL;
int ret;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->flags = TPARG_FL_KERNEL;
+
trace_probe_log_init("trace_kprobe", argc, argv);
- ret = trace_kprobe_create_internal(argc, argv, &ctx);
+ ret = trace_kprobe_create_internal(argc, argv, ctx);
- traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
return ret;
}
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 40830a3ecd96..5cbdc423afeb 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -13,8 +13,8 @@
#include <linux/bpf.h>
#include <linux/fs.h>
-#include "trace_btf.h"
+#include "trace_btf.h"
#include "trace_probe.h"
#undef C
@@ -247,7 +247,25 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset)
return 0;
}
-/* @buf must has MAX_EVENT_NAME_LEN size */
+/**
+ * traceprobe_parse_event_name() - Parse a string into group and event names
+ * @pevent: A pointer to the string to be parsed.
+ * @pgroup: A pointer to the group name.
+ * @buf: A buffer to store the parsed group name.
+ * @offset: The offset of the string in the original user command, for logging.
+ *
+ * This parses a string with the format `[GROUP/][EVENT]` or `[GROUP.][EVENT]`
+ * (either GROUP or EVENT or both must be specified).
+ * Since the parsed group name is stored in @buf, the caller must ensure @buf
+ * is at least MAX_EVENT_NAME_LEN bytes.
+ *
+ * Return: 0 on success, or -EINVAL on failure.
+ *
+ * If success, *@pevent is updated to point to the event name part of the
+ * original string, or NULL if there is no event name.
+ * Also, *@pgroup is updated to point to the parsed group which is stored
+ * in @buf, or NULL if there is no group name.
+ */
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
char *buf, int offset)
{
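The new kernel-doc above spells out the contract; the following hypothetical caller shows the intended use, assuming it runs from a create callback where trace_probe_log_init() has already been called (as the callers in this series do) so that parse errors can be logged.

static int parse_spec_example(void)
{
	const char *event = "mygroup/myevent";	/* user-supplied spec */
	const char *group = "dummy_system";	/* fallback if no group is given */
	char gbuf[MAX_EVENT_NAME_LEN];
	int ret;

	ret = traceprobe_parse_event_name(&event, &group, gbuf, 0);
	if (ret)
		return ret;			/* -EINVAL on malformed input */

	/* Now group points into gbuf ("mygroup") and event at "myevent". */
	return 0;
}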
@@ -779,6 +797,36 @@ static int check_prepare_btf_string_fetch(char *typename,
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
+static void store_entry_arg_at(struct fetch_insn *code, int argnum, int offset)
+{
+ code[0].op = FETCH_OP_ARG;
+ code[0].param = argnum;
+ code[1].op = FETCH_OP_ST_EDATA;
+ code[1].offset = offset;
+}
+
+static int get_entry_arg_max_offset(struct probe_entry_arg *earg)
+{
+ int i, max_offset = 0;
+
+ /*
+ * earg->code[] array has an operation sequence which is run in
+ * the entry handler.
+ * The sequence stopped by FETCH_OP_END and each data stored in
+ * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA
+ * stores the data at the data buffer + its offset, and all data are
+ * "unsigned long" size. The offset must be increased when a data is
+ * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the
+ * code array.
+ */
+ for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i++) {
+ if (earg->code[i].op == FETCH_OP_ST_EDATA)
+ if (earg->code[i].offset > max_offset)
+ max_offset = earg->code[i].offset;
+ }
+ return max_offset;
+}
+
/*
* Add the entry code to store the 'argnum'th parameter and return the offset
* in the entry data buffer where the data will be stored.
@@ -786,8 +834,7 @@ static int check_prepare_btf_string_fetch(char *typename,
static int __store_entry_arg(struct trace_probe *tp, int argnum)
{
struct probe_entry_arg *earg = tp->entry_arg;
- bool match = false;
- int i, offset;
+ int i, offset, last_offset = 0;
if (!earg) {
earg = kzalloc(sizeof(*tp->entry_arg), GFP_KERNEL);
@@ -804,78 +851,59 @@ static int __store_entry_arg(struct trace_probe *tp, int argnum)
for (i = 0; i < earg->size; i++)
earg->code[i].op = FETCH_OP_END;
tp->entry_arg = earg;
+ store_entry_arg_at(earg->code, argnum, 0);
+ return 0;
}
/*
- * The entry code array is repeating the pair of
- * [FETCH_OP_ARG(argnum)][FETCH_OP_ST_EDATA(offset of entry data buffer)]
- * and the rest of entries are filled with [FETCH_OP_END].
+ * NOTE: if anyone changes the following rule, please rewrite this.
+ * The entry code array is filled with the pair of
+ *
+ * [FETCH_OP_ARG(argnum)]
+ * [FETCH_OP_ST_EDATA(offset of entry data buffer)]
*
- * To reduce the redundant function parameter fetching, we scan the entry
- * code array to find the FETCH_OP_ARG which already fetches the 'argnum'
- * parameter. If it doesn't match, update 'offset' to find the last
- * offset.
- * If we find the FETCH_OP_END without matching FETCH_OP_ARG entry, we
- * will save the entry with FETCH_OP_ARG and FETCH_OP_ST_EDATA, and
- * return data offset so that caller can find the data offset in the entry
- * data buffer.
+ * and the rest of entries are filled with [FETCH_OP_END].
+ * The offset should be incremented, thus the last pair should
+ * have the largest offset.
*/
- offset = 0;
- for (i = 0; i < earg->size - 1; i++) {
- switch (earg->code[i].op) {
- case FETCH_OP_END:
- earg->code[i].op = FETCH_OP_ARG;
- earg->code[i].param = argnum;
- earg->code[i + 1].op = FETCH_OP_ST_EDATA;
- earg->code[i + 1].offset = offset;
- return offset;
- case FETCH_OP_ARG:
- match = (earg->code[i].param == argnum);
- break;
- case FETCH_OP_ST_EDATA:
- offset = earg->code[i].offset;
- if (match)
- return offset;
- offset += sizeof(unsigned long);
- break;
- default:
- break;
- }
+
+ /* Search for the offset of the specified argnum. */
+ for (i = 0; i < earg->size - 1 && earg->code[i].op != FETCH_OP_END; i += 2) {
+ if (WARN_ON_ONCE(earg->code[i].op != FETCH_OP_ARG))
+ return -EINVAL;
+
+ if (earg->code[i].param != argnum)
+ continue;
+
+ if (WARN_ON_ONCE(earg->code[i + 1].op != FETCH_OP_ST_EDATA))
+ return -EINVAL;
+
+ return earg->code[i + 1].offset;
+ }
+ /* Not found, append new entry if possible. */
+ if (i >= earg->size - 1)
+ return -ENOSPC;
+
+ /* The last entry must have the largest offset. */
+ if (i != 0) {
+ if (WARN_ON_ONCE(earg->code[i - 1].op != FETCH_OP_ST_EDATA))
+ return -EINVAL;
+ last_offset = earg->code[i - 1].offset;
}
- return -ENOSPC;
+
+ offset = last_offset + sizeof(unsigned long);
+ store_entry_arg_at(&earg->code[i], argnum, offset);
+ return offset;
}
int traceprobe_get_entry_data_size(struct trace_probe *tp)
{
struct probe_entry_arg *earg = tp->entry_arg;
- int i, size = 0;
if (!earg)
return 0;
- /*
- * earg->code[] array has an operation sequence which is run in
- * the entry handler.
- * The sequence stopped by FETCH_OP_END and each data stored in
- * the entry data buffer by FETCH_OP_ST_EDATA. The FETCH_OP_ST_EDATA
- * stores the data at the data buffer + its offset, and all data are
- * "unsigned long" size. The offset must be increased when a data is
- * stored. Thus we need to find the last FETCH_OP_ST_EDATA in the
- * code array.
- */
- for (i = 0; i < earg->size; i++) {
- switch (earg->code[i].op) {
- case FETCH_OP_END:
- goto out;
- case FETCH_OP_ST_EDATA:
- size = earg->code[i].offset + sizeof(unsigned long);
- break;
- default:
- break;
- }
- }
-out:
- return size;
+ return get_entry_arg_max_offset(earg) + sizeof(unsigned long);
}
void store_trace_entry_data(void *edata, struct trace_probe *tp, struct pt_regs *regs)
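A worked example of the invariant the rewritten __store_entry_arg() maintains, assuming a 64-bit kernel (sizeof(unsigned long) == 8) and calls for argnum 2 followed by argnum 0:

/*
 *   __store_entry_arg(tp, 2) returns 0 and leaves:
 *     code[0] = { .op = FETCH_OP_ARG,      .param  = 2 }
 *     code[1] = { .op = FETCH_OP_ST_EDATA, .offset = 0 }
 *     code[2..] = { .op = FETCH_OP_END }
 *
 *   __store_entry_arg(tp, 0) finds no matching ARG pair, appends at the
 *   first FETCH_OP_END and returns 8:
 *     code[2] = { .op = FETCH_OP_ARG,      .param  = 0 }
 *     code[3] = { .op = FETCH_OP_ST_EDATA, .offset = 8 }
 *
 *   traceprobe_get_entry_data_size() then reports 8 + 8 = 16 bytes.
 */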
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 854e5668f5ee..842383fbc03b 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -10,20 +10,22 @@
* Author: Srikar Dronamraju
*/
+#include <linux/bitops.h>
+#include <linux/btf.h>
+#include <linux/cleanup.h>
+#include <linux/kprobes.h>
+#include <linux/limits.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/smp.h>
-#include <linux/tracefs.h>
-#include <linux/types.h>
#include <linux/string.h>
-#include <linux/ptrace.h>
-#include <linux/perf_event.h>
-#include <linux/kprobes.h>
#include <linux/stringify.h>
-#include <linux/limits.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
#include <linux/uaccess.h>
-#include <linux/bitops.h>
-#include <linux/btf.h>
+
#include <asm/bitsperlong.h>
#include "trace.h"
@@ -438,6 +440,14 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
* this MUST be called for clean up the context and return a resource.
*/
void traceprobe_finish_parse(struct traceprobe_parse_context *ctx);
+static inline void traceprobe_free_parse_ctx(struct traceprobe_parse_context *ctx)
+{
+ traceprobe_finish_parse(ctx);
+ kfree(ctx);
+}
+
+DEFINE_FREE(traceprobe_parse_context, struct traceprobe_parse_context *,
+ if (_T) traceprobe_free_parse_ctx(_T))
extern int traceprobe_split_symbol_offset(char *symbol, long *offset);
int traceprobe_parse_event_name(const char **pevent, const char **pgroup,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index f95a2c3d5b1b..8b0bcc0d8f41 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -8,17 +8,19 @@
#define pr_fmt(fmt) "trace_uprobe: " fmt
#include <linux/bpf-cgroup.h>
-#include <linux/security.h>
+#include <linux/cleanup.h>
#include <linux/ctype.h>
+#include <linux/filter.h>
#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/uprobes.h>
#include <linux/namei.h>
-#include <linux/string.h>
-#include <linux/rculist.h>
-#include <linux/filter.h>
#include <linux/percpu.h>
+#include <linux/rculist.h>
+#include <linux/security.h>
+#include <linux/string.h>
+#include <linux/uaccess.h>
+#include <linux/uprobes.h>
+#include "trace.h"
#include "trace_dynevent.h"
#include "trace_probe.h"
#include "trace_probe_tmpl.h"
@@ -537,15 +539,15 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
*/
static int __trace_uprobe_create(int argc, const char **argv)
{
- struct trace_uprobe *tu;
const char *event = NULL, *group = UPROBE_EVENT_SYSTEM;
char *arg, *filename, *rctr, *rctr_end, *tmp;
- char buf[MAX_EVENT_NAME_LEN];
- char gbuf[MAX_EVENT_NAME_LEN];
- enum probe_print_type ptype;
- struct path path;
unsigned long offset, ref_ctr_offset;
+ char *gbuf __free(kfree) = NULL;
+ char *buf __free(kfree) = NULL;
+ enum probe_print_type ptype;
+ struct trace_uprobe *tu;
bool is_return = false;
+ struct path path;
int i, ret;
ref_ctr_offset = 0;
@@ -653,6 +655,10 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* setup a probe */
trace_probe_log_set_index(0);
if (event) {
+ gbuf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!gbuf)
+ goto fail_mem;
+
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
@@ -664,15 +670,16 @@ static int __trace_uprobe_create(int argc, const char **argv)
char *ptr;
tail = kstrdup(kbasename(filename), GFP_KERNEL);
- if (!tail) {
- ret = -ENOMEM;
- goto fail_address_parse;
- }
+ if (!tail)
+ goto fail_mem;
ptr = strpbrk(tail, ".-_");
if (ptr)
*ptr = '\0';
+ buf = kmalloc(MAX_EVENT_NAME_LEN, GFP_KERNEL);
+ if (!buf)
+ goto fail_mem;
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_0x%lx", 'p', tail, offset);
event = buf;
kfree(tail);
@@ -695,13 +702,16 @@ static int __trace_uprobe_create(int argc, const char **argv)
/* parse arguments */
for (i = 0; i < argc; i++) {
- struct traceprobe_parse_context ctx = {
- .flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER,
- };
+ struct traceprobe_parse_context *ctx __free(traceprobe_parse_context)
+ = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto error;
+ }
+ ctx->flags = (is_return ? TPARG_FL_RETURN : 0) | TPARG_FL_USER;
trace_probe_log_set_index(i + 2);
- ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], &ctx);
- traceprobe_finish_parse(&ctx);
+ ret = traceprobe_parse_probe_arg(&tu->tp, i, argv[i], ctx);
if (ret)
goto error;
}
@@ -721,6 +731,9 @@ out:
trace_probe_log_clear();
return ret;
+fail_mem:
+ ret = -ENOMEM;
+
fail_address_parse:
trace_probe_log_clear();
path_put(&path);
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
new file mode 100644
index 000000000000..eae37bea54fd
--- /dev/null
+++ b/kernel/unwind/Makefile
@@ -0,0 +1 @@
+ obj-$(CONFIG_UNWIND_USER) += user.o deferred.o
diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c
new file mode 100644
index 000000000000..dc6040aae3ee
--- /dev/null
+++ b/kernel/unwind/deferred.c
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Deferred user space unwinding
+ */
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_deferred.h>
+#include <linux/sched/clock.h>
+#include <linux/task_work.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+/*
+ * For requesting a deferred user space stack trace from NMI context
+ * the architecture must support a safe cmpxchg in NMI context.
+ * For those architectures that do not have that, then it cannot ask
+ * for a deferred user space stack trace from an NMI context. If it
+ * does, then it will get -EINVAL.
+ */
+#if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG)
+# define CAN_USE_IN_NMI 1
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+ u32 old = 0;
+
+ return try_cmpxchg(&info->id.cnt, &old, cnt);
+}
+#else
+# define CAN_USE_IN_NMI 0
+/* When NMIs are not allowed, this always succeeds */
+static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt)
+{
+ info->id.cnt = cnt;
+ return true;
+}
+#endif
+
+/* Make the cache fit in a 4K page */
+#define UNWIND_MAX_ENTRIES \
+ ((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
+
+/* Guards adding to or removing from the list of callbacks */
+static DEFINE_MUTEX(callback_mutex);
+static LIST_HEAD(callbacks);
+
+#define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED)
+
+/* Zero'd bits are available for assigning callback users */
+static unsigned long unwind_mask = RESERVED_BITS;
+DEFINE_STATIC_SRCU(unwind_srcu);
+
+static inline bool unwind_pending(struct unwind_task_info *info)
+{
+ return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask);
+}
+
+/*
+ * This is a unique percpu identifier for a given task entry context.
+ * Conceptually, it's incremented every time the CPU enters the kernel from
+ * user space, so that each "entry context" on the CPU gets a unique ID. In
+ * reality, as an optimization, it's only incremented on demand for the first
+ * deferred unwind request after a given entry-from-user.
+ *
+ * It's combined with the CPU id to make a systemwide-unique "context cookie".
+ */
+static DEFINE_PER_CPU(u32, unwind_ctx_ctr);
+
+/*
+ * The context cookie is a unique identifier that is assigned to a user
+ * space stacktrace. As the user space stacktrace remains the same while
+ * the task is in the kernel, the cookie is an identifier for the stacktrace.
+ * It is possible, however, for the stacktrace to get another cookie if another
+ * request is made after the cookie was cleared and before reentering user
+ * space.
+ */
+static u64 get_cookie(struct unwind_task_info *info)
+{
+ u32 cnt = 1;
+
+ if (info->id.cpu)
+ return info->id.id;
+
+ /* LSB is always set to ensure 0 is an invalid value */
+ cnt |= __this_cpu_read(unwind_ctx_ctr) + 2;
+ if (try_assign_cnt(info, cnt)) {
+ /* Update the per cpu counter */
+ __this_cpu_write(unwind_ctx_ctr, cnt);
+ }
+ /* Interrupts are disabled, the CPU will always be the same */
+ info->id.cpu = smp_processor_id() + 1; /* Must be non zero */
+
+ return info->id.id;
+}
+
+/**
+ * unwind_user_faultable - Produce a user stacktrace in faultable context
+ * @trace: The descriptor that will store the user stacktrace
+ *
+ * This must be called in a known faultable context (usually when entering
+ * or exiting user space). Depending on the available implementations
+ * the @trace will be loaded with the addresses of the user space stacktrace
+ * if it can be found.
+ *
+ * Return: 0 on success and negative on error
+ * On success @trace will contain the user space stacktrace
+ */
+int unwind_user_faultable(struct unwind_stacktrace *trace)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ struct unwind_cache *cache;
+
+ /* Should always be called from faultable context */
+ might_fault();
+
+ if (!current->mm)
+ return -EINVAL;
+
+ if (!info->cache) {
+ info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES),
+ GFP_KERNEL);
+ if (!info->cache)
+ return -ENOMEM;
+ }
+
+ cache = info->cache;
+ trace->entries = cache->entries;
+
+ if (cache->nr_entries) {
+ /*
+ * The user stack has already been previously unwound in this
+ * entry context. Skip the unwind and use the cache.
+ */
+ trace->nr = cache->nr_entries;
+ return 0;
+ }
+
+ trace->nr = 0;
+ unwind_user(trace, UNWIND_MAX_ENTRIES);
+
+ cache->nr_entries = trace->nr;
+
+ /* Clear nr_entries on way back to user space */
+ set_bit(UNWIND_USED_BIT, &info->unwind_mask);
+
+ return 0;
+}
+
+static void process_unwind_deferred(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+ struct unwind_stacktrace trace;
+ struct unwind_work *work;
+ unsigned long bits;
+ u64 cookie;
+
+ if (WARN_ON_ONCE(!unwind_pending(info)))
+ return;
+
+ /* Clear pending bit but make sure to have the current bits */
+ bits = atomic_long_fetch_andnot(UNWIND_PENDING,
+ (atomic_long_t *)&info->unwind_mask);
+ /*
+ * From here on out, the callback must always be called, even if it's
+ * just an empty trace.
+ */
+ trace.nr = 0;
+ trace.entries = NULL;
+
+ unwind_user_faultable(&trace);
+
+ if (info->cache)
+ bits &= ~(info->cache->unwind_completed);
+
+ cookie = info->id.id;
+
+ guard(srcu)(&unwind_srcu);
+ list_for_each_entry_srcu(work, &callbacks, list,
+ srcu_read_lock_held(&unwind_srcu)) {
+ if (test_bit(work->bit, &bits)) {
+ work->func(work, &trace, cookie);
+ if (info->cache)
+ info->cache->unwind_completed |= BIT(work->bit);
+ }
+ }
+}
+
+static void unwind_deferred_task_work(struct callback_head *head)
+{
+ process_unwind_deferred(current);
+}
+
+void unwind_deferred_task_exit(struct task_struct *task)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+
+ if (!unwind_pending(info))
+ return;
+
+ process_unwind_deferred(task);
+
+ task_work_cancel(task, &info->work);
+}
+
+/**
+ * unwind_deferred_request - Request a user stacktrace on task kernel exit
+ * @work: Unwind descriptor requesting the trace
+ * @cookie: The cookie of the first request made for this task
+ *
+ * Schedule a user space unwind to be done in task work before exiting the
+ * kernel.
+ *
+ * The returned @cookie output is the generated cookie of the very first
+ * request for a user space stacktrace for this task since it entered the
+ * kernel. It can be from a request by any caller of this infrastructure.
+ * Its value will also be passed to the callback function. It can be
+ * used to stitch kernel and user stack traces together in post-processing.
+ *
+ * It's valid to call this function multiple times for the same @work within
+ * the same task entry context. Each call will return the same cookie
+ * while the task hasn't left the kernel. If the callback is not pending
+ * because it has already been previously called for the same entry context,
+ * it will be called again with the same stack trace and cookie.
+ *
+ * Return: 0 if the callback was successfully queued.
+ * 1 if the callback is pending or was already executed.
+ * Negative if there's an error.
+ * @cookie holds the cookie of the first request by any user
+ */
+int unwind_deferred_request(struct unwind_work *work, u64 *cookie)
+{
+ struct unwind_task_info *info = &current->unwind_info;
+ unsigned long old, bits;
+ unsigned long bit;
+ int ret;
+
+ *cookie = 0;
+
+ if ((current->flags & (PF_KTHREAD | PF_EXITING)) ||
+ !user_mode(task_pt_regs(current)))
+ return -EINVAL;
+
+ /*
+ * NMI requires having safe cmpxchg operations.
+ * Trigger a warning to make it obvious that an architecture
+ * is using this in NMI when it should not be.
+ */
+ if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi()))
+ return -EINVAL;
+
+ /* Do not allow cancelled works to request again */
+ bit = READ_ONCE(work->bit);
+ if (WARN_ON_ONCE(bit < 0))
+ return -EINVAL;
+
+ /* Only need the mask now */
+ bit = BIT(bit);
+
+ guard(irqsave)();
+
+ *cookie = get_cookie(info);
+
+ old = READ_ONCE(info->unwind_mask);
+
+ /* Is this already queued or executed */
+ if (old & bit)
+ return 1;
+
+ /*
+ * This work's bit hasn't been set yet. Now set it with the PENDING
+ * bit and fetch the current value of unwind_mask. If either the
+ * work's bit or PENDING was already set, then this is already queued
+ * to have a callback.
+ */
+ bits = UNWIND_PENDING | bit;
+ old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask);
+ if (old & bits) {
+ /*
+ * If the work's bit was set, whatever set it had better
+ * have also set pending and queued a callback.
+ */
+ WARN_ON_ONCE(!(old & UNWIND_PENDING));
+ return old & bit;
+ }
+
+ /* The work has been claimed, now schedule it. */
+ ret = task_work_add(current, &info->work, TWA_RESUME);
+
+ if (WARN_ON_ONCE(ret))
+ WRITE_ONCE(info->unwind_mask, 0);
+
+ return ret;
+}
+
+void unwind_deferred_cancel(struct unwind_work *work)
+{
+ struct task_struct *g, *t;
+ int bit;
+
+ if (!work)
+ return;
+
+ bit = work->bit;
+
+ /* No work should be using a reserved bit */
+ if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS))
+ return;
+
+ guard(mutex)(&callback_mutex);
+ list_del_rcu(&work->list);
+
+ /* Do not allow any more requests and prevent callbacks */
+ work->bit = -1;
+
+ __clear_bit(bit, &unwind_mask);
+
+ synchronize_srcu(&unwind_srcu);
+
+ guard(rcu)();
+ /* Clear this bit from all threads */
+ for_each_process_thread(g, t) {
+ clear_bit(bit, &t->unwind_info.unwind_mask);
+ if (t->unwind_info.cache)
+ clear_bit(bit, &t->unwind_info.cache->unwind_completed);
+ }
+}
+
+int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func)
+{
+ memset(work, 0, sizeof(*work));
+
+ guard(mutex)(&callback_mutex);
+
+ /* See if there's a bit in the mask available */
+ if (unwind_mask == ~0UL)
+ return -EBUSY;
+
+ work->bit = ffz(unwind_mask);
+ __set_bit(work->bit, &unwind_mask);
+
+ list_add_rcu(&work->list, &callbacks);
+ work->func = func;
+ return 0;
+}
+
+void unwind_task_init(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+
+ memset(info, 0, sizeof(*info));
+ init_task_work(&info->work, unwind_deferred_task_work);
+ info->unwind_mask = 0;
+}
+
+void unwind_task_free(struct task_struct *task)
+{
+ struct unwind_task_info *info = &task->unwind_info;
+
+ kfree(info->cache);
+ task_work_cancel(task, &info->work);
+}
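For context, here is a sketch of how a tracer is expected to consume this API, pieced together from the functions above: register an unwind_work once with unwind_deferred_init(), call unwind_deferred_request() from the event path, and receive the user stacktrace in the callback shortly before the task returns to user space. All names are hypothetical and the callback signature is inferred from the work->func() call in process_unwind_deferred().

#include <linux/unwind_deferred.h>
#include <linux/printk.h>

static struct unwind_work example_unwind_work;

/* Runs from task work shortly before the task leaves the kernel. */
static void example_unwind_cb(struct unwind_work *work,
			      struct unwind_stacktrace *trace, u64 cookie)
{
	unsigned int i;

	pr_info("cookie %llu: %u user frames\n",
		(unsigned long long)cookie, (unsigned int)trace->nr);
	for (i = 0; i < trace->nr; i++)
		pr_info("  [%u] %lx\n", i, trace->entries[i]);
}

static int example_register(void)
{
	return unwind_deferred_init(&example_unwind_work, example_unwind_cb);
}

/* Called from the tracer's event path while current is in the kernel. */
static void example_on_event(void)
{
	u64 cookie;
	int ret;

	ret = unwind_deferred_request(&example_unwind_work, &cookie);
	/*
	 * ret == 0: callback queued for this kernel exit.
	 * ret == 1: already queued or executed for this entry context.
	 * ret <  0: request not possible (kthread, exiting task, NMI without
	 *           a safe cmpxchg, cancelled work, ...).
	 * @cookie identifies the entry context so the kernel and user parts
	 * of the trace can be stitched together later.
	 */
	(void)ret;
}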
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
new file mode 100644
index 000000000000..97a8415e3216
--- /dev/null
+++ b/kernel/unwind/user.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generic interfaces for unwinding user space
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/unwind_user.h>
+#include <linux/uaccess.h>
+
+static const struct unwind_user_frame fp_frame = {
+ ARCH_INIT_USER_FP_FRAME
+};
+
+#define for_each_user_frame(state) \
+ for (unwind_user_start(state); !(state)->done; unwind_user_next(state))
+
+static int unwind_user_next_fp(struct unwind_user_state *state)
+{
+ const struct unwind_user_frame *frame = &fp_frame;
+ unsigned long cfa, fp, ra;
+ unsigned int shift;
+
+ if (frame->use_fp) {
+ if (state->fp < state->sp)
+ return -EINVAL;
+ cfa = state->fp;
+ } else {
+ cfa = state->sp;
+ }
+
+ /* Get the Canonical Frame Address (CFA) */
+ cfa += frame->cfa_off;
+
+ /* stack going in wrong direction? */
+ if (cfa <= state->sp)
+ return -EINVAL;
+
+ /* Make sure that the address is word aligned */
+ shift = sizeof(long) == 4 ? 2 : 3;
+ if (cfa & ((1 << shift) - 1))
+ return -EINVAL;
+
+ /* Find the Return Address (RA) */
+ if (get_user(ra, (unsigned long *)(cfa + frame->ra_off)))
+ return -EINVAL;
+
+ if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off)))
+ return -EINVAL;
+
+ state->ip = ra;
+ state->sp = cfa;
+ if (frame->fp_off)
+ state->fp = fp;
+ return 0;
+}
+
+static int unwind_user_next(struct unwind_user_state *state)
+{
+ unsigned long iter_mask = state->available_types;
+ unsigned int bit;
+
+ if (state->done)
+ return -EINVAL;
+
+ for_each_set_bit(bit, &iter_mask, NR_UNWIND_USER_TYPE_BITS) {
+ enum unwind_user_type type = BIT(bit);
+
+ state->current_type = type;
+ switch (type) {
+ case UNWIND_USER_TYPE_FP:
+ if (!unwind_user_next_fp(state))
+ return 0;
+ continue;
+ default:
+ WARN_ONCE(1, "Undefined unwind bit %d", bit);
+ break;
+ }
+ break;
+ }
+
+ /* No successful unwind method. */
+ state->current_type = UNWIND_USER_TYPE_NONE;
+ state->done = true;
+ return -EINVAL;
+}
+
+static int unwind_user_start(struct unwind_user_state *state)
+{
+ struct pt_regs *regs = task_pt_regs(current);
+
+ memset(state, 0, sizeof(*state));
+
+ if ((current->flags & PF_KTHREAD) || !user_mode(regs)) {
+ state->done = true;
+ return -EINVAL;
+ }
+
+ if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
+ state->available_types |= UNWIND_USER_TYPE_FP;
+
+ state->ip = instruction_pointer(regs);
+ state->sp = user_stack_pointer(regs);
+ state->fp = frame_pointer(regs);
+
+ return 0;
+}
+
+int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries)
+{
+ struct unwind_user_state state;
+
+ trace->nr = 0;
+
+ if (!max_entries)
+ return -EINVAL;
+
+ if (current->flags & PF_KTHREAD)
+ return 0;
+
+ for_each_user_frame(&state) {
+ trace->entries[trace->nr++] = state.ip;
+ if (trace->nr >= max_entries)
+ break;
+ }
+
+ return 0;
+}
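unwind_user() only fills a caller-provided buffer. A minimal sketch of a helper built on it, assuming a faultable task context since the frame-pointer walker reads the user stack with get_user(); the helper name is hypothetical.

#include <linux/unwind_user.h>

static int example_capture_user_stack(unsigned long *ips, unsigned int max,
				      unsigned int *nr)
{
	struct unwind_stacktrace trace = {
		.entries = ips,		/* caller-provided storage */
	};
	int ret;

	ret = unwind_user(&trace, max);
	if (ret)
		return ret;

	*nr = trace.nr;			/* frames actually recorded */
	return 0;
}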
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
index 2f844c279a3e..bc738fa90c1d 100644
--- a/kernel/vhost_task.c
+++ b/kernel/vhost_task.c
@@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *),
tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
if (IS_ERR(tsk)) {
kfree(vtsk);
- return ERR_PTR(PTR_ERR(tsk));
+ return ERR_CAST(tsk);
}
vtsk->task = tsk;
diff --git a/kernel/watchdog_buddy.c b/kernel/watchdog_buddy.c
index 34dbfe091f4b..ee754d767c21 100644
--- a/kernel/watchdog_buddy.c
+++ b/kernel/watchdog_buddy.c
@@ -12,10 +12,7 @@ static unsigned int watchdog_next_cpu(unsigned int cpu)
{
unsigned int next_cpu;
- next_cpu = cpumask_next(cpu, &watchdog_cpus);
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(&watchdog_cpus);
-
+ next_cpu = cpumask_next_wrap(cpu, &watchdog_cpus);
if (next_cpu == cpu)
return nr_cpu_ids;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9f9148075828..c6b79b3675c3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -505,12 +505,16 @@ static struct kthread_worker *pwq_release_worker __ro_after_init;
struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
+struct workqueue_struct *system_percpu_wq __ro_after_init;
+EXPORT_SYMBOL(system_percpu_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
+struct workqueue_struct *system_dfl_wq __ro_after_init;
+EXPORT_SYMBOL_GPL(system_dfl_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
@@ -1686,17 +1690,14 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
int max = READ_ONCE(nna->max);
+ int old = atomic_read(&nna->nr);
- while (true) {
- int old, tmp;
-
- old = atomic_read(&nna->nr);
+ do {
if (old >= max)
return false;
- tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1);
- if (tmp == old)
- return true;
- }
+ } while (!atomic_try_cmpxchg_relaxed(&nna->nr, &old, old + 1));
+
+ return true;
}
/**
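The tryinc_node_nr_active() rewrite above uses the try_cmpxchg() idiom: on failure the helper refreshes the expected value in place, so the loop needs no explicit re-read. A generic sketch of the same pattern; inc_below() is hypothetical.

#include <linux/atomic.h>

/* Increment @v by one only while it stays below @limit. */
static bool inc_below(atomic_t *v, int limit)
{
	int old = atomic_read(v);

	do {
		if (old >= limit)
			return false;
	} while (!atomic_try_cmpxchg_relaxed(v, &old, old + 1));

	return true;
}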
@@ -2221,12 +2222,9 @@ static int wq_select_unbound_cpu(int cpu)
}
new_cpu = __this_cpu_read(wq_rr_cpu_last);
- new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
- if (unlikely(new_cpu >= nr_cpu_ids)) {
- new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
- if (unlikely(new_cpu >= nr_cpu_ids))
- return cpu;
- }
+ new_cpu = cpumask_next_and_wrap(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids))
+ return cpu;
__this_cpu_write(wq_rr_cpu_last, new_cpu);
return new_cpu;
@@ -4629,7 +4627,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs)
*
* Return: The allocated new workqueue_attr on success. %NULL on failure.
*/
-struct workqueue_attrs *alloc_workqueue_attrs(void)
+struct workqueue_attrs *alloc_workqueue_attrs_noprof(void)
{
struct workqueue_attrs *attrs;
@@ -5682,12 +5680,12 @@ static struct workqueue_struct *__alloc_workqueue(const char *fmt,
else
wq_size = sizeof(*wq);
- wq = kzalloc(wq_size, GFP_KERNEL);
+ wq = kzalloc_noprof(wq_size, GFP_KERNEL);
if (!wq)
return NULL;
if (flags & WQ_UNBOUND) {
- wq->unbound_attrs = alloc_workqueue_attrs();
+ wq->unbound_attrs = alloc_workqueue_attrs_noprof();
if (!wq->unbound_attrs)
goto err_free_wq;
}
@@ -5777,9 +5775,9 @@ err_destroy:
}
__printf(1, 4)
-struct workqueue_struct *alloc_workqueue(const char *fmt,
- unsigned int flags,
- int max_active, ...)
+struct workqueue_struct *alloc_workqueue_noprof(const char *fmt,
+ unsigned int flags,
+ int max_active, ...)
{
struct workqueue_struct *wq;
va_list args;
@@ -5794,7 +5792,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
return wq;
}
-EXPORT_SYMBOL_GPL(alloc_workqueue);
+EXPORT_SYMBOL_GPL(alloc_workqueue_noprof);
#ifdef CONFIG_LOCKDEP
__printf(1, 5)
@@ -6770,31 +6768,6 @@ long work_on_cpu_key(int cpu, long (*fn)(void *),
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_key);
-
-/**
- * work_on_cpu_safe_key - run a function in thread context on a particular cpu
- * @cpu: the cpu to run on
- * @fn: the function to run
- * @arg: the function argument
- * @key: The lock class key for lock debugging purposes
- *
- * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold
- * any locks which would prevent @fn from completing.
- *
- * Return: The value @fn returns.
- */
-long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
- void *arg, struct lock_class_key *key)
-{
- long ret = -ENODEV;
-
- cpus_read_lock();
- if (cpu_online(cpu))
- ret = work_on_cpu_key(cpu, fn, arg, key);
- cpus_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */
#ifdef CONFIG_FREEZER
@@ -7830,10 +7803,11 @@ void __init workqueue_init_early(void)
}
system_wq = alloc_workqueue("events", 0, 0);
+ system_percpu_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
- system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
- WQ_MAX_ACTIVE);
+ system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
+ system_dfl_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
@@ -7844,8 +7818,8 @@ void __init workqueue_init_early(void)
system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
WQ_BH | WQ_HIGHPRI, 0);
- BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
- !system_unbound_wq || !system_freezable_wq ||
+ BUG_ON(!system_wq || !system_percpu_wq || !system_highpri_wq || !system_long_wq ||
+ !system_unbound_wq || !system_freezable_wq || !system_dfl_wq ||
!system_power_efficient_wq ||
!system_freezable_power_efficient_wq ||
!system_bh_wq || !system_bh_highpri_wq);