7 files changed, 404 insertions, 0 deletions
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index 465be4e62a17..b4e9ef7edfef 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -758,6 +758,16 @@ include::intel-hybrid.txt[]
 	If the URLs is not specified, the value of DEBUGINFOD_URLS
 	system environment variable is used.
 
+--off-cpu::
+	Enable off-cpu profiling with BPF.  The BPF program will collect
+	task scheduling information with (user) stacktrace and save them
+	as sample data of a software event named "offcpu-time".  The
+	sample period will have the time the task slept in nanoseconds.
+
+	Note that BPF can collect stack traces using frame pointer ("fp")
+	only, as of now.  So the applications built without the frame
+	pointer might see bogus addresses.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-intel-pt[1]
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 6e5aded855cc..8f738e11356d 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -1038,6 +1038,7 @@ SKEL_TMP_OUT := $(abspath $(SKEL_OUT)/.tmp)
 SKELETONS := $(SKEL_OUT)/bpf_prog_profiler.skel.h
 SKELETONS += $(SKEL_OUT)/bperf_leader.skel.h $(SKEL_OUT)/bperf_follower.skel.h
 SKELETONS += $(SKEL_OUT)/bperf_cgroup.skel.h $(SKEL_OUT)/func_latency.skel.h
+SKELETONS += $(SKEL_OUT)/off_cpu.skel.h
 
 $(SKEL_TMP_OUT) $(LIBBPF_OUTPUT):
 	$(Q)$(MKDIR) -p $@
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index cf9a7ce429df..b76f57e3ec73 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -49,6 +49,7 @@
 #include "util/clockid.h"
 #include "util/pmu-hybrid.h"
 #include "util/evlist-hybrid.h"
+#include "util/off_cpu.h"
 #include "asm/bug.h"
 #include "perf.h"
 #include "cputopo.h"
@@ -162,6 +163,7 @@ struct record {
 	bool			buildid_mmap;
 	bool			timestamp_filename;
 	bool			timestamp_boundary;
+	bool			off_cpu;
 	struct switch_output	switch_output;
 	unsigned long long	samples;
 	unsigned long		output_max_size;	/* = 0: unlimited */
@@ -888,6 +890,11 @@ static int record__config_text_poke(struct evlist *evlist)
 	return 0;
 }
 
+static int record__config_off_cpu(struct record *rec)
+{
+	return off_cpu_prepare(rec->evlist);
+}
+
 static bool record__kcore_readable(struct machine *machine)
 {
 	char kcore[PATH_MAX];
@@ -2591,6 +2598,9 @@ out_free_threads:
 	} else
 		status = err;
 
+	if (rec->off_cpu)
+		rec->bytes_written += off_cpu_write(rec->session);
+
 	record__synthesize(rec, true);
 	/* this will be recalculated during process_buildids() */
 	rec->samples = 0;
@@ -3315,6 +3325,7 @@ static struct option __record_options[] = {
 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
 			    "write collected trace data into several data files using parallel threads",
 			    record__parse_threads),
+	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
 	OPT_END()
 };
 
@@ -3736,6 +3747,12 @@ int cmd_record(int argc, const char **argv)
 # undef REASON
 #endif
 
+#ifndef HAVE_BPF_SKEL
+# define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
+	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
+# undef set_nobuild
+#endif
+
 	rec->opts.affinity = PERF_AFFINITY_SYS;
 
 	rec->evlist = evlist__new();
@@ -3972,6 +3989,14 @@ int cmd_record(int argc, const char **argv)
 		}
 	}
 
+	if (rec->off_cpu) {
+		err = record__config_off_cpu(rec);
+		if (err) {
+			pr_err("record__config_off_cpu failed, error %d\n", err);
+			goto out;
+		}
+	}
+
 	if (record_opts__config(&rec->opts)) {
 		err = -EINVAL;
 		goto out;
diff --git a/tools/perf/util/Build b/tools/perf/util/Build
index 9a7209a99e16..a51267d88ca9 100644
--- a/tools/perf/util/Build
+++ b/tools/perf/util/Build
@@ -147,6 +147,7 @@ perf-$(CONFIG_LIBBPF) += bpf_map.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_counter_cgroup.o
 perf-$(CONFIG_PERF_BPF_SKEL) += bpf_ftrace.o
+perf-$(CONFIG_PERF_BPF_SKEL) += bpf_off_cpu.o
 perf-$(CONFIG_BPF_PROLOGUE) += bpf-prologue.o
 perf-$(CONFIG_LIBELF) += symbol-elf.o
 perf-$(CONFIG_LIBELF) += probe-file.o
diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c
new file mode 100644
index 000000000000..9ed7aca3f4ac
--- /dev/null
+++ b/tools/perf/util/bpf_off_cpu.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "util/bpf_counter.h"
+#include "util/debug.h"
+#include "util/evsel.h"
+#include "util/evlist.h"
+#include "util/off_cpu.h"
+#include "util/perf-hooks.h"
+#include "util/session.h"
+#include <bpf/bpf.h>
+
+#include "bpf_skel/off_cpu.skel.h"
+
+#define MAX_STACKS  32
+/* we don't need actual timestamp, just want to put the samples at last */
+#define OFF_CPU_TIMESTAMP  (~0ull << 32)
+
+static struct off_cpu_bpf *skel;
+
+struct off_cpu_key {
+	u32 pid;
+	u32 tgid;
+	u32 stack_id;
+	u32 state;
+};
+
+union off_cpu_data {
+	struct perf_event_header hdr;
+	u64 array[1024 / sizeof(u64)];
+};
+
+static int off_cpu_config(struct evlist *evlist)
+{
+	struct evsel *evsel;
+	struct perf_event_attr attr = {
+		.type	= PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+		.size	= sizeof(attr), /* to capture ABI version */
+	};
+	char *evname = strdup(OFFCPU_EVENT);
+
+	if (evname == NULL)
+		return -ENOMEM;
+
+	evsel = evsel__new(&attr);
+	if (!evsel) {
+		free(evname);
+		return -ENOMEM;
+	}
+
+	evsel->core.attr.freq = 1;
+	evsel->core.attr.sample_period = 1;
+	/* off-cpu analysis depends on stack trace */
+	evsel->core.attr.sample_type = PERF_SAMPLE_CALLCHAIN;
+
+	evlist__add(evlist, evsel);
+
+	free(evsel->name);
+	evsel->name = evname;
+
+	return 0;
+}
+
+static void off_cpu_start(void *arg __maybe_unused)
+{
+	skel->bss->enabled = 1;
+}
+
+static void off_cpu_finish(void *arg __maybe_unused)
+{
+	skel->bss->enabled = 0;
+	off_cpu_bpf__destroy(skel);
+}
+
+int off_cpu_prepare(struct evlist *evlist)
+{
+	int err;
+
+	if (off_cpu_config(evlist) < 0) {
+		pr_err("Failed to config off-cpu BPF event\n");
+		return -1;
+	}
+
+	set_max_rlimit();
+
+	skel = off_cpu_bpf__open_and_load();
+	if (!skel) {
+		pr_err("Failed to open off-cpu BPF skeleton\n");
+		return -1;
+	}
+
+	err = off_cpu_bpf__attach(skel);
+	if (err) {
+		pr_err("Failed to attach off-cpu BPF skeleton\n");
+		goto out;
+	}
+
+	if (perf_hooks__set_hook("record_start", off_cpu_start, NULL) ||
+	    perf_hooks__set_hook("record_end", off_cpu_finish, NULL)) {
+		pr_err("Failed to attach off-cpu skeleton\n");
+		goto out;
+	}
+
+	return 0;
+
+out:
+	off_cpu_bpf__destroy(skel);
+	return -1;
+}
+
+int off_cpu_write(struct perf_session *session)
+{
+	int bytes = 0, size;
+	int fd, stack;
+	u64 sample_type, val, sid = 0;
+	struct evsel *evsel;
+	struct perf_data_file *file = &session->data->file;
+	struct off_cpu_key prev, key;
+	union off_cpu_data data = {
+		.hdr = {
+			.type = PERF_RECORD_SAMPLE,
+			.misc = PERF_RECORD_MISC_USER,
+		},
+	};
+	u64 tstamp = OFF_CPU_TIMESTAMP;
+
+	skel->bss->enabled = 0;
+
+	evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
+	if (evsel == NULL) {
+		pr_err("%s evsel not found\n", OFFCPU_EVENT);
+		return 0;
+	}
+
+	sample_type = evsel->core.attr.sample_type;
+
+	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
+		if (evsel->core.id)
+			sid = evsel->core.id[0];
+	}
+
+	fd = bpf_map__fd(skel->maps.off_cpu);
+	stack = bpf_map__fd(skel->maps.stacks);
+	memset(&prev, 0, sizeof(prev));
+
+	while (!bpf_map_get_next_key(fd, &prev, &key)) {
+		int n = 1;  /* start from perf_event_header */
+		int ip_pos = -1;
+
+		bpf_map_lookup_elem(fd, &key, &val);
+
+		if (sample_type & PERF_SAMPLE_IDENTIFIER)
+			data.array[n++] = sid;
+		if (sample_type & PERF_SAMPLE_IP) {
+			ip_pos = n;
+			data.array[n++] = 0;  /* will be updated */
+		}
+		if (sample_type & PERF_SAMPLE_TID)
+			data.array[n++] = (u64)key.pid << 32 | key.tgid;
+		if (sample_type & PERF_SAMPLE_TIME)
+			data.array[n++] = tstamp;
+		if (sample_type & PERF_SAMPLE_ID)
+			data.array[n++] = sid;
+		if (sample_type & PERF_SAMPLE_CPU)
+			data.array[n++] = 0;
+		if (sample_type & PERF_SAMPLE_PERIOD)
+			data.array[n++] = val;
+		if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+			int len = 0;
+
+			/* data.array[n] is callchain->nr (updated later) */
+			data.array[n + 1] = PERF_CONTEXT_USER;
+			data.array[n + 2] = 0;
+
+			bpf_map_lookup_elem(stack, &key.stack_id, &data.array[n + 2]);
+			while (data.array[n + 2 + len])
+				len++;
+
+			/* update length of callchain */
+			data.array[n] = len + 1;
+
+			/* update sample ip with the first callchain entry */
+			if (ip_pos >= 0)
+				data.array[ip_pos] = data.array[n + 2];
+
+			/* calculate sample callchain data array length */
+			n += len + 2;
+		}
+		/* TODO: handle more sample types */
+
+		size = n * sizeof(u64);
+		data.hdr.size = size;
+		bytes += size;
+
+		if (perf_data_file__write(file, &data, size) < 0) {
+			pr_err("failed to write perf data, error: %m\n");
+			return bytes;
+		}
+
+		prev = key;
+		/* increase dummy timestamp to sort later samples */
+		tstamp++;
+	}
+	return bytes;
+}
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
new file mode 100644
index 000000000000..5173ed882fdf
--- /dev/null
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (c) 2022 Google
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+/* task->flags for off-cpu analysis */
+#define PF_KTHREAD   0x00200000  /* I am a kernel thread */
+
+/* task->state for off-cpu analysis */
+#define TASK_INTERRUPTIBLE	0x0001
+#define TASK_UNINTERRUPTIBLE	0x0002
+
+#define MAX_STACKS   32
+#define MAX_ENTRIES  102400
+
+struct tstamp_data {
+	__u32 stack_id;
+	__u32 state;
+	__u64 timestamp;
+};
+
+struct offcpu_key {
+	__u32 pid;
+	__u32 tgid;
+	__u32 stack_id;
+	__u32 state;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, MAX_STACKS * sizeof(__u64));
+	__uint(max_entries, MAX_ENTRIES);
+} stacks SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct tstamp_data);
+} tstamp SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(struct offcpu_key));
+	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, MAX_ENTRIES);
+} off_cpu SEC(".maps");
+
+/* old kernel task_struct definition */
+struct task_struct___old {
+	long state;
+} __attribute__((preserve_access_index));
+
+int enabled = 0;
+
+/*
+ * Old kernel used to call it task_struct->state and now it's '__state'.
+ * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
+ *
+ * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
+ */
+static inline int get_task_state(struct task_struct *t)
+{
+	if (bpf_core_field_exists(t->__state))
+		return BPF_CORE_READ(t, __state);
+
+	/* recast pointer to capture task_struct___old type for compiler */
+	struct task_struct___old *t_old = (void *)t;
+
+	/* now use old "state" name of the field */
+	return BPF_CORE_READ(t_old, state);
+}
+
+SEC("tp_btf/sched_switch")
+int on_switch(u64 *ctx)
+{
+	__u64 ts;
+	int state;
+	__u32 stack_id;
+	struct task_struct *prev, *next;
+	struct tstamp_data *pelem;
+
+	if (!enabled)
+		return 0;
+
+	prev = (struct task_struct *)ctx[1];
+	next = (struct task_struct *)ctx[2];
+	state = get_task_state(prev);
+
+	ts = bpf_ktime_get_ns();
+
+	if (prev->flags & PF_KTHREAD)
+		goto next;
+	if (state != TASK_INTERRUPTIBLE &&
+	    state != TASK_UNINTERRUPTIBLE)
+		goto next;
+
+	stack_id = bpf_get_stackid(ctx, &stacks,
+				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
+
+	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
+				     BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!pelem)
+		goto next;
+
+	pelem->timestamp = ts;
+	pelem->state = state;
+	pelem->stack_id = stack_id;
+
+next:
+	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
+
+	if (pelem && pelem->timestamp) {
+		struct offcpu_key key = {
+			.pid = next->pid,
+			.tgid = next->tgid,
+			.stack_id = pelem->stack_id,
+			.state = pelem->state,
+		};
+		__u64 delta = ts - pelem->timestamp;
+		__u64 *total;
+
+		total = bpf_map_lookup_elem(&off_cpu, &key);
+		if (total)
+			*total += delta;
+		else
+			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
+
+		/* prevent to reuse the timestamp later */
+		pelem->timestamp = 0;
+	}
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
diff --git a/tools/perf/util/off_cpu.h b/tools/perf/util/off_cpu.h
new file mode 100644
index 000000000000..375d03c424ea
--- /dev/null
+++ b/tools/perf/util/off_cpu.h
@@ -0,0 +1,24 @@
+#ifndef PERF_UTIL_OFF_CPU_H
+#define PERF_UTIL_OFF_CPU_H
+
+struct evlist;
+struct perf_session;
+
+#define OFFCPU_EVENT  "offcpu-time"
+
+#ifdef HAVE_BPF_SKEL
+int off_cpu_prepare(struct evlist *evlist);
+int off_cpu_write(struct perf_session *session);
+#else
+static inline int off_cpu_prepare(struct evlist *evlist __maybe_unused)
+{
+	return -1;
+}
+
+static inline int off_cpu_write(struct perf_session *session __maybe_unused)
+{
+	return -1;
+}
+#endif
+
+#endif  /* PERF_UTIL_OFF_CPU_H */