33 files changed, 6393 insertions, 950 deletions
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
new file mode 100644
index 000000000000..b558ab98719f
--- /dev/null
+++ b/tools/perf/bench/Build
@@ -0,0 +1,26 @@
+perf-bench-y += sched-messaging.o
+perf-bench-y += sched-pipe.o
+perf-bench-y += sched-seccomp-notify.o
+perf-bench-y += syscall.o
+perf-bench-y += mem-functions.o
+perf-bench-y += futex.o
+perf-bench-y += futex-hash.o
+perf-bench-y += futex-wake.o
+perf-bench-y += futex-wake-parallel.o
+perf-bench-y += futex-requeue.o
+perf-bench-y += futex-lock-pi.o
+perf-bench-y += epoll-wait.o
+perf-bench-y += epoll-ctl.o
+perf-bench-y += synthesize.o
+perf-bench-y += kallsyms-parse.o
+perf-bench-y += find-bit-bench.o
+perf-bench-y += inject-buildid.o
+perf-bench-y += evlist-open-close.o
+perf-bench-y += breakpoint.o
+perf-bench-y += pmu-scan.o
+perf-bench-y += uprobe.o
+
+perf-bench-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
+perf-bench-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o
+
+perf-bench-$(CONFIG_NUMA) += numa.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index 0fdc85269c4d..8519eb5a42fa 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -1,36 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #ifndef BENCH_H
 #define BENCH_H
 
+#include <sys/time.h>
+
+extern struct timeval bench__start, bench__end, bench__runtime;
+
 /*
  * The madvise transparent hugepage constants were added in glibc
  * 2.13. For compatibility with older versions of glibc, define these
  * tokens if they are not already defined.
- *
- * PA-RISC uses different madvise values from other architectures and
- * needs to be special-cased.
  */
-#ifdef __hppa__
-# ifndef MADV_HUGEPAGE
-#  define MADV_HUGEPAGE		67
-# endif
-# ifndef MADV_NOHUGEPAGE
-#  define MADV_NOHUGEPAGE	68
-# endif
-#else
 # ifndef MADV_HUGEPAGE
 #  define MADV_HUGEPAGE		14
 # endif
 # ifndef MADV_NOHUGEPAGE
 #  define MADV_NOHUGEPAGE	15
 # endif
-#endif
 
-extern int bench_numa(int argc, const char **argv, const char *prefix);
-extern int bench_sched_messaging(int argc, const char **argv, const char *prefix);
-extern int bench_sched_pipe(int argc, const char **argv, const char *prefix);
-extern int bench_mem_memcpy(int argc, const char **argv,
-			    const char *prefix __maybe_unused);
-extern int bench_mem_memset(int argc, const char **argv, const char *prefix);
+int bench_numa(int argc, const char **argv);
+int bench_sched_messaging(int argc, const char **argv);
+int bench_sched_pipe(int argc, const char **argv);
+int bench_sched_seccomp_notify(int argc, const char **argv);
+int bench_syscall_basic(int argc, const char **argv);
+int bench_syscall_getpgid(int argc, const char **argv);
+int bench_syscall_fork(int argc, const char **argv);
+int bench_syscall_execve(int argc, const char **argv);
+int bench_mem_memcpy(int argc, const char **argv);
+int bench_mem_memset(int argc, const char **argv);
+int bench_mem_mmap(int argc, const char **argv);
+int bench_mem_find_bit(int argc, const char **argv);
+int bench_futex_hash(int argc, const char **argv);
+int bench_futex_wake(int argc, const char **argv);
+int bench_futex_wake_parallel(int argc, const char **argv);
+int bench_futex_requeue(int argc, const char **argv);
+/* pi futexes */
+int bench_futex_lock_pi(int argc, const char **argv);
+int bench_epoll_wait(int argc, const char **argv);
+int bench_epoll_ctl(int argc, const char **argv);
+int bench_synthesize(int argc, const char **argv);
+int bench_kallsyms_parse(int argc, const char **argv);
+int bench_inject_build_id(int argc, const char **argv);
+int bench_evlist_open_close(int argc, const char **argv);
+int bench_breakpoint_thread(int argc, const char **argv);
+int bench_breakpoint_enable(int argc, const char **argv);
+int bench_uprobe_baseline(int argc, const char **argv);
+int bench_uprobe_empty(int argc, const char **argv);
+int bench_uprobe_trace_printk(int argc, const char **argv);
+int bench_uprobe_empty_ret(int argc, const char **argv);
+int bench_uprobe_trace_printk_ret(int argc, const char **argv);
+int bench_pmu_scan(int argc, const char **argv);
 
 #define BENCH_FORMAT_DEFAULT_STR	"default"
 #define BENCH_FORMAT_DEFAULT		0
@@ -40,5 +59,17 @@ extern int bench_mem_memset(int argc, const char **argv, const char *prefix);
 #define BENCH_FORMAT_UNKNOWN		-1
 
 extern int bench_format;
+extern unsigned int bench_repeat;
+
+#ifndef HAVE_PTHREAD_ATTR_SETAFFINITY_NP /* fallback used when the build does not define this */
+#include <pthread.h>
+#include <linux/compiler.h>
+static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr __maybe_unused,
+					      size_t cpusetsize __maybe_unused,
+					      cpu_set_t *cpuset __maybe_unused)
+{
+	return 0; /* no-op stub: ignores every argument and reports success */
+}
+#endif /* !HAVE_PTHREAD_ATTR_SETAFFINITY_NP */
 
 #endif
diff --git a/tools/perf/bench/breakpoint.c b/tools/perf/bench/breakpoint.c
new file mode 100644
index 000000000000..dfd18f5db97d
--- /dev/null
+++ b/tools/perf/bench/breakpoint.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <subcmd/parse-options.h>
+#include <linux/hw_breakpoint.h>
+#include <linux/perf_event.h>
+#include <linux/time64.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include "bench.h"
+#include "futex.h"
+
+struct {
+	unsigned int nbreakpoints; /* -b/--breakpoints: breakpoints opened before the run */
+	unsigned int nparallel; /* -p/--parallelism: concurrent breakpoint_thread() workers */
+	unsigned int nthreads; /* -t/--threads: threads created and joined per batch */
+} thread_params = {
+	.nbreakpoints = 1,
+	.nparallel = 1,
+	.nthreads = 1,
+};
+
+static const struct option thread_options[] = {
+	OPT_UINTEGER('b', "breakpoints", &thread_params.nbreakpoints,
+		"Specify amount of breakpoints"),
+	OPT_UINTEGER('p', "parallelism", &thread_params.nparallel, "Specify amount of parallelism"),
+	OPT_UINTEGER('t', "threads", &thread_params.nthreads, "Specify amount of threads"),
+	OPT_END()
+};
+
+static const char * const thread_usage[] = {
+	"perf bench breakpoint thread <options>",
+	NULL
+};
+
+struct breakpoint {
+	int fd; /* value returned by breakpoint_setup(): an event fd, or -errno */
+	char watched; /* byte whose address is handed to breakpoint_setup() */
+};
+
+/* Open a 1-byte read/write breakpoint event on addr; returns the fd, or -errno on failure. */
+static int breakpoint_setup(void *addr)
+{
+	struct perf_event_attr attr = {
+		.type = PERF_TYPE_BREAKPOINT,
+		.size = sizeof(attr),
+		.inherit = 1,
+		.exclude_kernel = 1,
+		.exclude_hv = 1,
+		.bp_addr = (unsigned long)addr,
+		.bp_type = HW_BREAKPOINT_RW,
+		.bp_len = HW_BREAKPOINT_LEN_1,
+	};
+	int fd;
+
+	/* Remaining arguments, in order: pid 0, cpu -1, group_fd -1, flags 0. */
+	fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
+
+	return fd < 0 ? -errno : fd;
+}
+
+/* Thread body: keep calling futex_wait() on the word at arg while it reads zero. */
+static void *passive_thread(void *arg)
+{
+	unsigned int *flag = arg;
+	while (__atomic_load_n(flag, __ATOMIC_RELAXED) == 0)
+		futex_wait(flag, 0, NULL, 0);
+	return NULL;
+}
+
+/* Thread body: busy-wait until the word at arg reads non-zero. */
+static void *active_thread(void *arg)
+{
+	while (__atomic_load_n((unsigned int *)arg, __ATOMIC_RELAXED) == 0)
+		continue;
+	return NULL;
+}
+
+/* Worker: for each iteration claimed from the counter at arg, spawn and join one thread batch. */
+static void *breakpoint_thread(void *arg)
+{
+	const unsigned int batch = thread_params.nthreads;
+	pthread_t *tids = calloc(batch, sizeof(*tids));
+	unsigned int n, release;
+	int *remaining = arg;
+
+	if (tids == NULL)
+		exit((perror("calloc"), EXIT_FAILURE));
+	while (__atomic_fetch_sub(remaining, 1, __ATOMIC_RELAXED) >= 1) {
+		release = 0;
+		for (n = 0; n != batch; n++)
+			if (pthread_create(tids + n, NULL, passive_thread, &release) != 0)
+				exit((perror("pthread_create"), EXIT_FAILURE));
+		/* Set the flag before waking, so woken threads see it non-zero. */
+		__atomic_store_n(&release, 1, __ATOMIC_RELAXED);
+		futex_wake(&release, batch, 0);
+		for (n = 0; n != batch; n++)
+			pthread_join(tids[n], NULL);
+	}
+	free(tids);
+	return NULL;
+}
+
+// The benchmark creates nbreakpoints inheritable breakpoints,
+// then starts nparallel threads which create and join bench_repeat batches of nthreads threads.
+int bench_breakpoint_thread(int argc, const char **argv)
+{
+	unsigned int i, nopen = 0;
+	int repeat = bench_repeat;
+	struct breakpoint *breakpoints;
+	pthread_t *parallel;
+	struct timeval start, stop, diff;
+	double result_usec;	/* double: unsigned int usecs would wrap after ~71 minutes */
+
+	if (parse_options(argc, argv, thread_options, thread_usage, 0)) {
+		usage_with_options(thread_usage, thread_options);
+		exit(EXIT_FAILURE);
+	}
+	breakpoints = calloc(thread_params.nbreakpoints, sizeof(breakpoints[0]));
+	parallel = calloc(thread_params.nparallel, sizeof(parallel[0]));
+	if (!breakpoints || !parallel)
+		exit((perror("calloc"), EXIT_FAILURE));
+
+	for (; nopen < thread_params.nbreakpoints; nopen++) {
+		breakpoints[nopen].fd = breakpoint_setup(&breakpoints[nopen].watched);
+		if (breakpoints[nopen].fd >= 0)
+			continue;
+		if (breakpoints[nopen].fd != -ENODEV)
+			exit((perror("perf_event_open"), EXIT_FAILURE));
+		/* Skipping is not an error, but still release what was set up so far. */
+		printf("Skipping perf bench breakpoint thread: No hardware support\n");
+		goto out;
+	}
+	gettimeofday(&start, NULL);
+	for (i = 0; i < thread_params.nparallel; i++) {
+		if (pthread_create(&parallel[i], NULL, breakpoint_thread, &repeat))
+			exit((perror("pthread_create"), EXIT_FAILURE));
+	}
+	for (i = 0; i < thread_params.nparallel; i++)
+		pthread_join(parallel[i], NULL);
+	gettimeofday(&stop, NULL);
+	timersub(&stop, &start, &diff);
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		printf("# Created/joined %u threads with %u breakpoints and %u parallelism\n",
+			bench_repeat, thread_params.nbreakpoints, thread_params.nparallel);
+		printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
+			(unsigned long)diff.tv_sec, (unsigned long)(diff.tv_usec / USEC_PER_MSEC));
+		result_usec = (double)diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+		printf(" %14lf usecs/op\n", result_usec / bench_repeat / thread_params.nthreads);
+		printf(" %14lf usecs/op/cpu\n",
+			result_usec / bench_repeat / thread_params.nthreads * thread_params.nparallel);
+		break;
+	case BENCH_FORMAT_SIMPLE:
+		printf("%lu.%03lu\n", (unsigned long)diff.tv_sec,
+			(unsigned long)(diff.tv_usec / USEC_PER_MSEC));
+		break;
+	default:
+		fprintf(stderr, "Unknown format: %d\n", bench_format);
+		exit(EXIT_FAILURE);
+	}
+out:
+	while (nopen--)
+		close(breakpoints[nopen].fd);
+	free(parallel);
+	free(breakpoints);
+	return 0;
+}
+
+struct {
+	unsigned int npassive; /* -p/--passive: threads started with passive_thread() */
+	unsigned int nactive; /* -a/--active: threads started with active_thread() */
+} enable_params = {
+	.nactive = 0,
+	.npassive = 0,
+};
+
+static const struct option enable_options[] = {
+	OPT_UINTEGER('p', "passive", &enable_params.npassive, "Specify amount of passive threads"),
+	OPT_UINTEGER('a', "active", &enable_params.nactive, "Specify amount of active threads"),
+	OPT_END()
+};
+
+static const char * const enable_usage[] = {
+	"perf bench breakpoint enable <options>",
+	NULL
+};
+
+// The benchmark creates an inheritable breakpoint,
+// then starts npassive threads that block and nactive threads that actively spin
+// and then disables and enables the breakpoint bench_repeat times.
+int bench_breakpoint_enable(int argc, const char **argv)
+{
+	unsigned int i, nthreads, done = 0;
+	char watched;
+	int fd;
+	pthread_t *threads;
+	struct timeval start, stop, diff;
+	double result_usec;	/* double: unsigned int usecs would wrap after ~71 minutes */
+
+	if (parse_options(argc, argv, enable_options, enable_usage, 0)) {
+		usage_with_options(enable_usage, enable_options);
+		exit(EXIT_FAILURE);
+	}
+	fd = breakpoint_setup(&watched);
+	if (fd == -ENODEV) {
+		printf("Skipping perf bench breakpoint enable: No hardware support\n");
+		return 0;
+	}
+	if (fd < 0)
+		exit((perror("perf_event_open"), EXIT_FAILURE));
+	nthreads = enable_params.npassive + enable_params.nactive;
+	threads = calloc(nthreads, sizeof(threads[0]));
+	if (!threads)
+		exit((perror("calloc"), EXIT_FAILURE));
+
+	for (i = 0; i < nthreads; i++) {
+		if (pthread_create(&threads[i], NULL,
+			i < enable_params.npassive ? passive_thread : active_thread, &done))
+			exit((perror("pthread_create"), EXIT_FAILURE));
+	}
+	usleep(10000);  // let the threads block
+	gettimeofday(&start, NULL);
+	for (i = 0; i < bench_repeat; i++) {
+		if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0))
+			exit((perror("ioctl(PERF_EVENT_IOC_DISABLE)"), EXIT_FAILURE));
+		if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0))
+			exit((perror("ioctl(PERF_EVENT_IOC_ENABLE)"), EXIT_FAILURE));
+	}
+	gettimeofday(&stop, NULL);
+	timersub(&stop, &start, &diff);
+	__atomic_store_n(&done, 1, __ATOMIC_RELAXED);
+	futex_wake(&done, enable_params.npassive, 0);
+	for (i = 0; i < nthreads; i++)
+		pthread_join(threads[i], NULL);
+	free(threads);
+	close(fd);
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		printf("# Enabled/disabled breakpoint %u time with %u passive and %u active threads\n",
+			bench_repeat, enable_params.npassive, enable_params.nactive);
+		printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
+			(unsigned long)diff.tv_sec, (unsigned long)(diff.tv_usec / USEC_PER_MSEC));
+		result_usec = (double)diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+		printf(" %14lf usecs/op\n", result_usec / bench_repeat);
+		break;
+	case BENCH_FORMAT_SIMPLE:
+		printf("%lu.%03lu\n", (unsigned long)diff.tv_sec,
+			(unsigned long)(diff.tv_usec / USEC_PER_MSEC));
+		break;
+	default:
+		fprintf(stderr, "Unknown format: %d\n", bench_format);
+		exit(EXIT_FAILURE);
+	}
+	return 0;
+}
diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c
new file mode 100644
index 000000000000..d66d852b90e4
--- /dev/null
+++ b/tools/perf/bench/epoll-ctl.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Davidlohr Bueso.
+ *
+ * Benchmark the various operations allowed for epoll_ctl(2).
+ * The idea is to concurrently stress a single epoll instance
+ */
+#ifdef HAVE_EVENTFD_SUPPORT
+/* For the CLR_() macros */
+#include <string.h>
+#include <pthread.h>
+
+#include <errno.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include <perf/cpumap.h>
+
+#include "../util/mutex.h"
+#include "../util/stat.h"
+#include <subcmd/parse-options.h>
+#include "bench.h"
+
+#include <err.h>
+
+#define printinfo(fmt, arg...) \
+	do { if (__verbose) printf(fmt, ## arg); } while (0)
+
+static unsigned int nthreads = 0;
+static unsigned int nsecs    = 8;
+static bool done, __verbose, randomize;
+
+/*
+ * epoll related shared variables.
+ */
+
+/* Maximum number of nesting allowed inside epoll sets */
+#define EPOLL_MAXNESTS 4
+
+enum {
+	OP_EPOLL_ADD,
+	OP_EPOLL_MOD,
+	OP_EPOLL_DEL,
+	EPOLL_NR_OPS,
+};
+
+static int epollfd;
+static int *epollfdp;
+static bool noaffinity;
+static unsigned int nested = 0;
+
+/* amount of fds to monitor, per thread */
+static unsigned int nfds = 64;
+
+static struct mutex thread_lock;
+static unsigned int threads_starting;
+static struct stats all_stats[EPOLL_NR_OPS];
+static struct cond thread_parent, thread_worker;
+
+struct worker {
+	int tid; /* index of this worker, assigned in do_threads() */
+	pthread_t thread;
+	unsigned long ops[EPOLL_NR_OPS]; /* epoll_ctl() calls that returned 0, per operation */
+	int *fdmap; /* nfds eventfd descriptors created for this worker */
+};
+
+static const struct option options[] = {
+	OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),
+	OPT_UINTEGER('r', "runtime", &nsecs,    "Specify runtime (in seconds)"),
+	OPT_UINTEGER('f', "nfds", &nfds, "Specify amount of file descriptors to monitor for each thread"),
+	OPT_BOOLEAN( 'n', "noaffinity",  &noaffinity,   "Disables CPU affinity"),
+	OPT_UINTEGER( 'N', "nested",  &nested,   "Nesting level epoll hierarchy (default is 0, no nesting)"),
+	OPT_BOOLEAN( 'R', "randomize", &randomize,   "Perform random operations on random fds"),
+	OPT_BOOLEAN( 'v', "verbose",  &__verbose,   "Verbose mode"),
+	OPT_END()
+};
+
+static const char * const bench_epoll_ctl_usage[] = {
+	"perf bench epoll ctl <options>",
+	NULL
+};
+
+static void toggle_done(int sig __maybe_unused,
+			siginfo_t *info __maybe_unused,
+			void *uc __maybe_unused)
+{
+	/* Raise the stop flag the worker loops test; also installed as the SIGINT handler. */
+	done = true;
+	gettimeofday(&bench__end, NULL);
+	timersub(&bench__end, &bench__start, &bench__runtime); /* runtime = end - start */
+}
+
+/* Build a chain of 'nested' (non-zero, capped) epoll instances hanging off the main epollfd. */
+static void nest_epollfd(void)
+{
+	unsigned int i;
+	struct epoll_event ev;
+
+	if (nested > EPOLL_MAXNESTS)
+		nested = EPOLL_MAXNESTS;
+	printinfo("Nesting level(s): %u\n", nested);
+
+	epollfdp = calloc(nested, sizeof(*epollfdp));
+	if (!epollfdp)
+		err(EXIT_FAILURE, "calloc");
+	for (i = 0; i < nested; i++) {
+		epollfdp[i] = epoll_create(1);
+		if (epollfdp[i] < 0) /* check the descriptor just created, not the main one */
+			err(EXIT_FAILURE, "epoll_create");
+	}
+
+	ev.events = EPOLLHUP; /* anything */
+	ev.data.u64 = i; /* any number */
+	/* Register each level in the one before it, deepest first ... */
+	for (i = nested - 1; i; i--) {
+		if (epoll_ctl(epollfdp[i - 1], EPOLL_CTL_ADD,
+			      epollfdp[i], &ev) < 0)
+			err(EXIT_FAILURE, "epoll_ctl");
+	}
+	/* ... then attach the first level to the main epoll instance. */
+	if (epoll_ctl(epollfd, EPOLL_CTL_ADD, *epollfdp, &ev) < 0)
+		err(EXIT_FAILURE, "epoll_ctl");
+}
+
+/*
+ * Issue one epoll_ctl(2) call of kind op for fd on the shared epollfd.
+ * Only calls that return 0 are counted in w->ops[op]; an unrecognised op
+ * performs no call and counts nothing.
+ */
+static inline void do_epoll_op(struct worker *w, int op, int fd)
+{
+	struct epoll_event ev = { .events = EPOLLIN, .data.u64 = fd };
+	int rc;
+
+	if (op == OP_EPOLL_ADD) {
+		rc = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev);
+	} else if (op == OP_EPOLL_MOD) {
+		/* MOD passes EPOLLOUT instead of the EPOLLIN mask used by ADD. */
+		ev.events = EPOLLOUT;
+		rc = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &ev);
+	} else if (op == OP_EPOLL_DEL) {
+		/* DEL is issued without an event argument. */
+		rc = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, NULL);
+	} else {
+		return;
+	}
+
+	/* A failing call is simply not counted. */
+	if (rc == 0)
+		w->ops[op]++;
+}
+
+/* Apply one randomly chosen operation to one randomly chosen fd of this worker. */
+static inline void do_random_epoll_op(struct worker *w)
+{
+	unsigned long fd_pick = random();
+	unsigned long op_pick = random();
+
+	/* The first draw selects the fd, the second the operation. */
+	do_epoll_op(w, (int)(op_pick % EPOLL_NR_OPS),
+		    w->fdmap[fd_pick % nfds]);
+}
+
+/* Worker thread: check in, wait for the start broadcast, then issue epoll_ctl() ops until done. */
+static void *workerfn(void *arg)
+{
+	struct worker *self = arg;
+	struct timespec pause = { .tv_sec = 0, .tv_nsec = 250 };
+	unsigned int idx;
+
+	/* Start-up handshake: the last worker to check in signals thread_parent. */
+	mutex_lock(&thread_lock);
+	if (--threads_starting == 0)
+		cond_signal(&thread_parent);
+	cond_wait(&thread_worker, &thread_lock);
+	mutex_unlock(&thread_lock);
+
+	for (;;) {
+		if (randomize) {
+			do_random_epoll_op(self);
+		} else {
+			/* Otherwise add, modify and delete every fd of the map in turn. */
+			for (idx = 0; idx < nfds; idx++) {
+				int fd = self->fdmap[idx];
+				do_epoll_op(self, OP_EPOLL_ADD, fd);
+				do_epoll_op(self, OP_EPOLL_MOD, fd);
+				do_epoll_op(self, OP_EPOLL_DEL, fd);
+			}
+		}
+		nanosleep(&pause, NULL);
+		if (done)
+			break;
+	}
+	return NULL;
+}
+
+/* Pre-register every (100/pct)-th fd of w->fdmap in epollfd for EPOLLIN; exits on failure. */
+static void init_fdmaps(struct worker *w, int pct)
+{
+	unsigned int i;
+	int inc;
+	struct epoll_event ev;
+
+	if (pct <= 0)
+		return;
+
+	inc = pct < 100 ? 100 / pct : 1; /* never 0, or the loop below would not advance */
+	for (i = 0; i < nfds; i += inc) {
+		ev.data.fd = w->fdmap[i];
+		ev.events = EPOLLIN;
+		if (epoll_ctl(epollfd, EPOLL_CTL_ADD, w->fdmap[i], &ev) < 0)
+			err(EXIT_FAILURE, "epoll_ctl");
+	}
+}
+
+/* Set up each worker's eventfds and start it; returns 0, or 1 if an fdmap cannot be allocated. */
+static int do_threads(struct worker *worker, struct perf_cpu_map *cpu)
+{
+	pthread_attr_t thread_attr, *attrp = NULL;
+	cpu_set_t *cpuset;
+	unsigned int i, j;
+	int ret = 0;
+	int nrcpus;
+	size_t size;
+
+	if (!noaffinity)
+		pthread_attr_init(&thread_attr);
+
+	nrcpus = cpu__max_cpu().cpu;
+	cpuset = CPU_ALLOC(nrcpus);
+	BUG_ON(!cpuset);
+	size = CPU_ALLOC_SIZE(nrcpus);
+
+	for (i = 0; i < nthreads; i++) {
+		struct worker *w = &worker[i];
+
+		w->tid = i;
+		w->fdmap = calloc(nfds, sizeof(int));
+		if (!w->fdmap) {
+			ret = 1; /* leave through the common cleanup below instead of leaking */
+			break;
+		}
+
+		for (j = 0; j < nfds; j++) {
+			w->fdmap[j] = eventfd(0, EFD_NONBLOCK);
+			if (w->fdmap[j] < 0)
+				err(EXIT_FAILURE, "eventfd");
+		}
+
+		/*
+		 * Lets add 50% of the fdmap to the epoll instance, and
+		 * do it before any threads are started; otherwise there is
+		 * an initial bias of the call failing  (mod and del ops).
+		 */
+		if (randomize)
+			init_fdmaps(w, 50);
+
+		if (!noaffinity) {
+			CPU_ZERO_S(size, cpuset);
+			CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu,
+					size, cpuset);
+			ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset);
+			if (ret) {
+				CPU_FREE(cpuset);
+				errno = ret; /* pthread calls return the error; err() reads errno */
+				err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+			}
+			attrp = &thread_attr;
+		}
+
+		ret = pthread_create(&w->thread, attrp, workerfn, w);
+		if (ret) {
+			CPU_FREE(cpuset);
+			errno = ret; /* as above: make err() print the actual failure */
+			err(EXIT_FAILURE, "pthread_create");
+		}
+	}
+	CPU_FREE(cpuset);
+	if (!noaffinity)
+		pthread_attr_destroy(&thread_attr);
+	return ret;
+}
+
+/*
+ * Print one line per operation type (ADD, MOD, DEL) with the average taken
+ * from all_stats[] and its relative standard deviation in percent.
+ * The averages are truncated to unsigned long before being printed.
+ */
+static void print_summary(void)
+{
+	unsigned long add = avg_stats(&all_stats[OP_EPOLL_ADD]);
+	double add_dev = stddev_stats(&all_stats[OP_EPOLL_ADD]);
+	unsigned long mod = avg_stats(&all_stats[OP_EPOLL_MOD]);
+	double mod_dev = stddev_stats(&all_stats[OP_EPOLL_MOD]);
+	unsigned long del = avg_stats(&all_stats[OP_EPOLL_DEL]);
+	double del_dev = stddev_stats(&all_stats[OP_EPOLL_DEL]);
+
+	printf("\nAveraged %ld ADD operations (+- %.2f%%)\n",
+	       add, rel_stddev_stats(add_dev, add));
+	printf("Averaged %ld MOD operations (+- %.2f%%)\n",
+	       mod, rel_stddev_stats(mod_dev, mod));
+	printf("Averaged %ld DEL operations (+- %.2f%%)\n",
+	       del, rel_stddev_stats(del_dev, del));
+}
+
+/* 'perf bench epoll ctl': stress one epoll instance with nthreads workers for nsecs seconds. */
+int bench_epoll_ctl(int argc, const char **argv)
+{
+	int j, ret = 0;
+	struct sigaction act;
+	struct worker *worker = NULL;
+	struct perf_cpu_map *cpu;
+	struct rlimit rl, prevrl;
+	unsigned int i;
+
+	argc = parse_options(argc, argv, options, bench_epoll_ctl_usage, 0);
+	if (argc) {
+		usage_with_options(bench_epoll_ctl_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	/* SIGINT raises the same done flag that the timed path sets below. */
+	memset(&act, 0, sizeof(act));
+	sigfillset(&act.sa_mask);
+	act.sa_sigaction = toggle_done;
+	sigaction(SIGINT, &act, NULL);
+
+	cpu = perf_cpu_map__new_online_cpus();
+	if (!cpu)
+		goto errmem;
+
+	/* a single, main epoll instance */
+	epollfd = epoll_create(1);
+	if (epollfd < 0)
+		err(EXIT_FAILURE, "epoll_create");
+
+	/* Deal with nested epolls, if any. */
+	if (nested)
+		nest_epollfd();
+
+	/* default to the number of CPUs */
+	if (!nthreads)
+		nthreads = perf_cpu_map__nr(cpu);
+
+	worker = calloc(nthreads, sizeof(*worker));
+	if (!worker)
+		goto errmem;
+
+	if (getrlimit(RLIMIT_NOFILE, &prevrl))
+		err(EXIT_FAILURE, "getrlimit");
+	rl.rlim_cur = rl.rlim_max = nfds * nthreads * 2 + 50;
+	printinfo("Setting RLIMIT_NOFILE rlimit from %" PRIu64 " to: %" PRIu64 "\n",
+		  (uint64_t)prevrl.rlim_max, (uint64_t)rl.rlim_max);
+	if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
+		err(EXIT_FAILURE, "setrlimit");
+
+	printf("Run summary [PID %d]: %u threads doing epoll_ctl ops "
+	       "%u file-descriptors for %u secs.\n\n",
+	       getpid(), nthreads, nfds, nsecs);
+
+	for (i = 0; i < EPOLL_NR_OPS; i++)
+		init_stats(&all_stats[i]);
+
+	mutex_init(&thread_lock);
+	cond_init(&thread_parent);
+	cond_init(&thread_worker);
+
+	threads_starting = nthreads;
+	gettimeofday(&bench__start, NULL);
+
+	/* A non-zero return means an fdmap allocation failed; the wait below would never end. */
+	if (do_threads(worker, cpu))
+		goto errmem;
+	/* Wait until every worker has checked in, then release them together. */
+	mutex_lock(&thread_lock);
+	while (threads_starting)
+		cond_wait(&thread_parent, &thread_lock);
+	cond_broadcast(&thread_worker);
+	mutex_unlock(&thread_lock);
+
+	sleep(nsecs);
+	toggle_done(0, NULL, NULL);
+	printinfo("main thread: toggling done\n");
+
+	for (i = 0; i < nthreads; i++) {
+		ret = pthread_join(worker[i].thread, NULL);
+		if (ret)
+			err(EXIT_FAILURE, "pthread_join");
+	}
+
+	/* cleanup & report results */
+	cond_destroy(&thread_parent);
+	cond_destroy(&thread_worker);
+	mutex_destroy(&thread_lock);
+
+	for (i = 0; i < nthreads; i++) {
+		unsigned long t[EPOLL_NR_OPS];
+
+		for (j = 0; j < EPOLL_NR_OPS; j++) {
+			t[j] = worker[i].ops[j];
+			update_stats(&all_stats[j], t[j]);
+		}
+
+		if (nfds == 1)
+			printf("[thread %2d] fdmap: %p [ add: %04lu; mod: %04lu; del: %04lu ops ]\n",
+			       worker[i].tid, &worker[i].fdmap[0],
+			       t[OP_EPOLL_ADD], t[OP_EPOLL_MOD], t[OP_EPOLL_DEL]);
+		else
+			printf("[thread %2d] fdmap: %p ... %p [ add: %04lu ops; mod: %04lu ops; del: %04lu ops ]\n",
+			       worker[i].tid, &worker[i].fdmap[0],
+			       &worker[i].fdmap[nfds-1],
+			       t[OP_EPOLL_ADD], t[OP_EPOLL_MOD], t[OP_EPOLL_DEL]);
+	}
+
+	print_summary();
+
+	close(epollfd);
+	perf_cpu_map__put(cpu);
+	for (i = 0; i < nthreads; i++)
+		free(worker[i].fdmap);
+	free(worker);
+	return ret;
+errmem:
+	err(EXIT_FAILURE, "calloc");
+}
+#endif // HAVE_EVENTFD_SUPPORT
diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c
new file mode 100644
index 000000000000..20fe4f72b4af
--- /dev/null
+++ b/tools/perf/bench/epoll-wait.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifdef HAVE_EVENTFD_SUPPORT
+/*
+ * Copyright (C) 2018 Davidlohr Bueso.
+ *
+ * This program benchmarks concurrent epoll_wait(2) monitoring multiple
+ * file descriptors under one or two load balancing models. The first,
+ * and default, is the single/combined queueing (which refers to a single
+ * epoll instance for N worker threads):
+ *
+ *                          |---> [worker A]
+ *                          |---> [worker B]
+ *        [combined queue]  .---> [worker C]
+ *                          |---> [worker D]
+ *                          |---> [worker E]
+ *
+ * While the second model, enabled via --multiq option, uses multiple
+ * queueing (which refers to one epoll instance per worker). For example,
+ * short lived tcp connections in a high throughput httpd server will
+ * distribute the accept()'ing  connections across CPUs. In this case each
+ * worker does a limited  amount of processing.
+ *
+ *             [queue A]  ---> [worker]
+ *             [queue B]  ---> [worker]
+ *             [queue C]  ---> [worker]
+ *             [queue D]  ---> [worker]
+ *             [queue E]  ---> [worker]
+ *
+ * Naturally, the single queue will enforce more concurrency on the epoll
+ * instance, and can therefore scale poorly compared to multiple queues.
+ * However, this benchmark produces raw data that must be taken with a grain
+ * of salt when choosing how to make use of sys_epoll.
+
+ * Each thread has a number of private, nonblocking file descriptors,
+ * referred to as fdmap. A writer thread will constantly be writing to
+ * the fdmaps of all threads, minimizing each thread's chances of
+ * epoll_wait not finding any ready read events and blocking as this
+ * is not what we want to stress. The size of the fdmap can be adjusted
+ * by the user; enlarging the value will increase the chances of
+ * epoll_wait(2) blocking as the linear writer thread will take "longer",
+ * at least at a high level.
+ *
+ * Note that because fds are private to each thread, this workload does
+ * not stress scenarios where multiple tasks are awoken per ready IO; ie:
+ * EPOLLEXCLUSIVE semantics.
+ *
+ * The end result/metric is throughput: number of ops/second where an
+ * operation consists of:
+ *
+ *   epoll_wait(2) + [others]
+ *
+ *        ... where [others] is the cost of re-adding the fd (EPOLLET),
+ *            or rearming it (EPOLLONESHOT).
+ *
+ *
+ * The purpose of this program is to be useful for measuring
+ * kernel-related changes to sys_epoll, not for comparing different
+ * IO polling methods, for example. Hence everything is very ad hoc and
+ * outputs raw microbenchmark numbers. Also, this uses eventfd; similar
+ * tools tend to use pipes or sockets, but the result is the same.
+ */
+
+/* For the CLR_() macros */
+#include <string.h>
+#include <pthread.h>
+#include <unistd.h>
+
+#include <errno.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/epoll.h>
+#include <sys/eventfd.h>
+#include <sys/types.h>
+#include <perf/cpumap.h>
+
+#include "../util/stat.h"
+#include "../util/mutex.h"
+#include <subcmd/parse-options.h>
+#include "bench.h"
+
+#include <err.h>
+
+#define printinfo(fmt, arg...) \
+	do { if (__verbose) { printf(fmt, ## arg); fflush(stdout); } } while (0)
+
+static unsigned int nthreads = 0;
+static unsigned int nsecs    = 8;
+static bool wdone, done, __verbose, randomize, nonblocking;
+
+/*
+ * epoll related shared variables.
+ */
+
+/* Maximum number of nesting allowed inside epoll sets */
+#define EPOLL_MAXNESTS 4
+
+static int epollfd;
+static int *epollfdp;
+static bool noaffinity;
+static unsigned int nested = 0;
+static bool et; /* edge-trigger */
+static bool oneshot;
+static bool multiq; /* use an epoll instance per thread */
+
+/* amount of fds to monitor, per thread */
+static unsigned int nfds = 64;
+
+static struct mutex thread_lock;
+static unsigned int threads_starting;
+static struct stats throughput_stats;
+static struct cond thread_parent, thread_worker;
+
+struct worker {
+	int tid; /* index of this worker, assigned in do_threads() */
+	int epollfd; /* for --multiq */
+	pthread_t thread;
+	unsigned long ops; /* loop iterations counted by workerfn() */
+	int *fdmap; /* nfds eventfd descriptors created for this worker */
+};
+
+static const struct option options[] = {
+	/* general benchmark options */
+	OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),
+	OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"),
+	OPT_UINTEGER('f', "nfds",    &nfds,  "Specify amount of file descriptors to monitor for each thread"),
+	OPT_BOOLEAN( 'n', "noaffinity",  &noaffinity,   "Disables CPU affinity"),
+	OPT_BOOLEAN('R', "randomize", &randomize,   "Enable random write behaviour (default is lineal)"),
+	OPT_BOOLEAN( 'v', "verbose", &__verbose, "Verbose mode"),
+
+	/* epoll specific options */
+	OPT_BOOLEAN( 'm', "multiq",  &multiq,   "Use multiple epoll instances (one per thread)"),
+	OPT_BOOLEAN( 'B', "nonblocking", &nonblocking, "Nonblocking epoll_wait(2) behaviour"),
+	OPT_UINTEGER( 'N', "nested",  &nested,   "Nesting level epoll hierarchy (default is 0, no nesting)"),
+	OPT_BOOLEAN( 'S', "oneshot",  &oneshot,   "Use EPOLLONESHOT semantics"),
+	OPT_BOOLEAN( 'E', "edge",  &et,   "Use Edge-triggered interface (default is LT)"),
+
+	OPT_END()
+};
+
+static const char * const bench_epoll_wait_usage[] = {
+	"perf bench epoll wait <options>",
+	NULL
+};
+
+
+/*
+ * Arrange the N elements of ARRAY (each SIZE bytes) in random order using
+ * rand(). Only effective if N is much smaller than RAND_MAX; if this may
+ * not be the case, use a better random number generator. -- Ben Pfaff.
+ * Exits via err() if the one-element scratch buffer cannot be allocated.
+ */
+static void shuffle(void *array, size_t n, size_t size)
+{
+	char *carray = array;
+	void *aux;
+	size_t i;
+
+	if (n <= 1)
+		return;
+
+	aux = calloc(1, size);
+	if (!aux)
+		err(EXIT_FAILURE, "calloc");
+
+	for (i = 0; i < n - 1; ++i) { /* start at 0 so the first element can move too */
+		size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
+
+		if (j == i)
+			continue; /* memcpy() must not be handed overlapping regions */
+		memcpy(aux, &carray[j * size], size);
+		memcpy(&carray[j * size], &carray[i * size], size);
+		memcpy(&carray[i * size], aux, size);
+	}
+	free(aux);
+}
+
+
+static void *workerfn(void *arg)
+{
+	int fd, ret, r;
+	struct worker *w = (struct worker *) arg;
+	unsigned long ops = w->ops;
+	struct epoll_event ev;
+	uint64_t val;
+	int to = nonblocking? 0 : -1; /* timeout handed to epoll_wait() */
+	int efd = multiq ? w->epollfd : epollfd; /* per-worker instance or the shared one */
+	/* Start-up handshake: the last worker to check in signals thread_parent. */
+	mutex_lock(&thread_lock);
+	threads_starting--;
+	if (!threads_starting)
+		cond_signal(&thread_parent);
+	cond_wait(&thread_worker, &thread_lock);
+	mutex_unlock(&thread_lock);
+
+	do {
+		/*
+		 * Wait for the IN event (indefinitely, unless the
+		 * nonblocking option selected a zero timeout). In order
+		 * to stress the epoll_wait(2) syscall, call it event per
+		 * event, instead of a larger batch (max)limit.
+		 */
+		do {
+			ret = epoll_wait(efd, &ev, 1, to);
+		} while (ret < 0 && errno == EINTR);
+		if (ret < 0)
+			err(EXIT_FAILURE, "epoll_wait");
+		/* NOTE(review): if epoll_wait() returns 0, ev looks stale or unset here - confirm. */
+		fd = ev.data.fd;
+		/* Read the 8-byte value, retrying while read() fails with EAGAIN and done is unset. */
+		do {
+			r = read(fd, &val, sizeof(val));
+		} while (!done && (r < 0 && errno == EAGAIN));
+
+		if (et) {
+			ev.events = EPOLLIN | EPOLLET;
+			ret = epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev); /* result is not checked */
+		}
+
+		if (oneshot) {
+			/* rearm the file descriptor with a new event mask */
+			ev.events |= EPOLLIN | EPOLLONESHOT;
+			ret = epoll_ctl(efd, EPOLL_CTL_MOD, fd, &ev); /* result is not checked */
+		}
+
+		ops++;
+	}  while (!done);
+
+	if (multiq)
+		close(w->epollfd);
+	/* Store the locally kept count back into the worker slot. */
+	w->ops = ops;
+	return NULL;
+}
+
+/* Hang a chain of 'nested' epoll instances off w->epollfd (multiq) or the shared epollfd. */
+static void nest_epollfd(struct worker *w)
+{
+	struct epoll_event ev;
+	unsigned int lvl;
+	int root;
+
+	if (nested > EPOLL_MAXNESTS)
+		nested = EPOLL_MAXNESTS;
+
+	epollfdp = calloc(nested, sizeof(*epollfdp));
+	if (epollfdp == NULL)
+		err(EXIT_FAILURE, "calloc");
+
+	for (lvl = 0; lvl < nested; lvl++) {
+		epollfdp[lvl] = epoll_create(1);
+		if (epollfdp[lvl] < 0)
+			err(EXIT_FAILURE, "epoll_create");
+	}
+	ev.events = EPOLLHUP; /* anything */
+	ev.data.u64 = lvl; /* any number */
+	/* Register each level in the one before it, deepest first. */
+	for (lvl = nested - 1; lvl > 0; lvl--) {
+		if (epoll_ctl(epollfdp[lvl - 1], EPOLL_CTL_ADD, epollfdp[lvl], &ev) < 0)
+			err(EXIT_FAILURE, "epoll_ctl");
+	}
+	/* Finally attach the first level to the root instance. */
+	root = multiq ? w->epollfd : epollfd;
+	if (epoll_ctl(root, EPOLL_CTL_ADD, epollfdp[0], &ev) < 0)
+		err(EXIT_FAILURE, "epoll_ctl");
+}
+
+static void toggle_done(int sig __maybe_unused,
+			siginfo_t *info __maybe_unused,
+			void *uc __maybe_unused)
+{
+	/* Raise the stop flag tested by the worker loops, then record the elapsed time. */
+	done = true;
+	gettimeofday(&bench__end, NULL);
+	timersub(&bench__end, &bench__start, &bench__runtime); /* runtime = end - start */
+}
+
+static void print_summary(void)
+{
+	/* the mean is truncated to an integer before the relative stddev is derived */
+	unsigned long mean = avg_stats(&throughput_stats);
+	double rel = rel_stddev_stats(stddev_stats(&throughput_stats), mean);
+
+	printf("\nAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n",
+	       mean, rel, (int)bench__runtime.tv_sec);
+}
+
+/*
+ * Create every worker's eventfds, register them with epoll and start its thread.
+ * Any failure ends the process via err()/BUG_ON(), so this only ever returns 0.
+ */
+static int do_threads(struct worker *worker, struct perf_cpu_map *cpu)
+{
+	pthread_attr_t thread_attr, *attrp = NULL;
+	cpu_set_t *cpuset;
+	unsigned int i, j;
+	int ret = 0, events = EPOLLIN, nrcpus;
+	size_t size;
+
+	if (oneshot)
+		events |= EPOLLONESHOT;
+	if (et)
+		events |= EPOLLET;
+
+	printinfo("starting worker/consumer %sthreads%s\n",
+		  noaffinity ?  "":"CPU affinity ", nonblocking ? " (nonblocking)":"");
+	if (!noaffinity)
+		pthread_attr_init(&thread_attr);
+
+	nrcpus = cpu__max_cpu().cpu;
+	cpuset = CPU_ALLOC(nrcpus);
+	BUG_ON(!cpuset);
+	size = CPU_ALLOC_SIZE(nrcpus);
+
+	for (i = 0; i < nthreads; i++) {
+		struct worker *w = &worker[i];
+
+		if (multiq) {
+			w->epollfd = epoll_create(1);
+			if (w->epollfd < 0)
+				err(EXIT_FAILURE, "epoll_create");
+
+			if (nested)
+				nest_epollfd(w);
+		}
+
+		w->tid = i;
+		w->fdmap = calloc(nfds, sizeof(int));
+		if (!w->fdmap) /* bailing out quietly would leave the parent waiting forever */
+			err(EXIT_FAILURE, "calloc");
+
+		for (j = 0; j < nfds; j++) {
+			int efd = multiq ? w->epollfd : epollfd;
+			struct epoll_event ev;
+
+			w->fdmap[j] = eventfd(0, EFD_NONBLOCK);
+			if (w->fdmap[j] < 0)
+				err(EXIT_FAILURE, "eventfd");
+
+			ev.data.fd = w->fdmap[j];
+			ev.events = events;
+
+			ret = epoll_ctl(efd, EPOLL_CTL_ADD, w->fdmap[j], &ev);
+			if (ret < 0)
+				err(EXIT_FAILURE, "epoll_ctl");
+		}
+
+		if (!noaffinity) {
+			CPU_ZERO_S(size, cpuset);
+			CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu,
+					size, cpuset);
+
+			ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset);
+			if (ret) {
+				CPU_FREE(cpuset);
+				err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+			}
+
+			attrp = &thread_attr;
+		}
+
+		ret = pthread_create(&w->thread, attrp, workerfn, w);
+		if (ret) {
+			CPU_FREE(cpuset);
+			err(EXIT_FAILURE, "pthread_create");
+		}
+	}
+
+	CPU_FREE(cpuset);
+	if (!noaffinity)
+		pthread_attr_destroy(&thread_attr);
+
+	return ret;
+}
+
+/*
+ * Writer thread: bump every eventfd of every worker in @p until wdone is set.
+ */
+static void *writerfn(void *p)
+{
+	struct worker *worker = p;
+	size_t i, j, iter;
+	const uint64_t val = 1;
+	ssize_t sz;
+	struct timespec ts = { .tv_sec = 0, .tv_nsec = 500 };
+
+	printinfo("starting writer-thread: doing %s writes ...\n",
+		  randomize? "random":"lineal");
+
+	for (iter = 0; !wdone; iter++) {
+		if (randomize)
+			shuffle((void *)worker, nthreads, sizeof(*worker));
+
+		for (i = 0; i < nthreads; i++) {
+			struct worker *w = &worker[i];
+
+			if (randomize)
+				shuffle((void *)w->fdmap, nfds, sizeof(int));
+
+			for (j = 0; j < nfds; j++) {
+				do {
+					sz = write(w->fdmap[j], &val, sizeof(val));
+				} while (!wdone && (sz < 0 && errno == EAGAIN));
+			}
+		}
+
+		nanosleep(&ts, NULL);
+	}
+
+	printinfo("exiting writer-thread (total full-loops: %zu)\n", iter);
+	return NULL;
+}
+
+/* qsort() comparator: orders workers by ascending tid. */
+static int cmpworker(const void *p1, const void *p2)
+{
+	const struct worker *lhs = p1;
+	const struct worker *rhs = p2;
+
+	if (lhs->tid == rhs->tid)
+		return 0;
+
+	/* explicit three-way result rather than a subtraction */
+	return lhs->tid > rhs->tid ? 1 : -1;
+}
+
+/*
+ * Run the epoll_wait benchmark for nsecs seconds and print per-thread throughput.
+ */
+int bench_epoll_wait(int argc, const char **argv)
+{
+	int ret = 0;
+	struct sigaction act;
+	unsigned int i;
+	struct worker *worker = NULL;
+	struct perf_cpu_map *cpu;
+	pthread_t wthread;
+	struct rlimit rl, prevrl;
+
+	argc = parse_options(argc, argv, options, bench_epoll_wait_usage, 0);
+	if (argc) {
+		usage_with_options(bench_epoll_wait_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	memset(&act, 0, sizeof(act));
+	sigfillset(&act.sa_mask);
+	act.sa_sigaction = toggle_done;
+	sigaction(SIGINT, &act, NULL);
+
+	cpu = perf_cpu_map__new_online_cpus();
+	if (!cpu)
+		goto errmem;
+
+	/* a single, main epoll instance */
+	if (!multiq) {
+		epollfd = epoll_create(1);
+		if (epollfd < 0)
+			err(EXIT_FAILURE, "epoll_create");
+
+		/* Deal with nested epolls, if any. */
+		if (nested)
+			nest_epollfd(NULL);
+	}
+
+	printinfo("Using %s queue model\n", multiq ? "multi" : "single");
+	printinfo("Nesting level(s): %d\n", nested);
+
+	/* default to the number of CPUs and leave one for the writer pthread */
+	if (!nthreads)
+		nthreads = perf_cpu_map__nr(cpu) - 1;
+
+	worker = calloc(nthreads, sizeof(*worker));
+	if (!worker)
+		goto errmem;
+
+	if (getrlimit(RLIMIT_NOFILE, &prevrl))
+		err(EXIT_FAILURE, "getrlimit");
+	rl.rlim_cur = rl.rlim_max = nfds * nthreads * 2 + 50;
+	printinfo("Setting RLIMIT_NOFILE rlimit from %" PRIu64 " to: %" PRIu64 "\n",
+		  (uint64_t)prevrl.rlim_max, (uint64_t)rl.rlim_max);
+	if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
+		err(EXIT_FAILURE, "setrlimit");
+
+	printf("Run summary [PID %d]: %d threads monitoring%s on "
+	       "%d file-descriptors for %d secs.\n\n",
+	       getpid(), nthreads, oneshot ? " (EPOLLONESHOT semantics)": "", nfds, nsecs);
+
+	init_stats(&throughput_stats);
+	mutex_init(&thread_lock);
+	cond_init(&thread_parent);
+	cond_init(&thread_worker);
+
+	threads_starting = nthreads;
+
+	gettimeofday(&bench__start, NULL);
+	if (do_threads(worker, cpu)) /* nonzero means a worker's fdmap allocation failed */
+		goto errmem;
+
+	mutex_lock(&thread_lock);
+	while (threads_starting)
+		cond_wait(&thread_parent, &thread_lock);
+	cond_broadcast(&thread_worker);
+	mutex_unlock(&thread_lock);
+
+	/*
+	 * At this point the workers should be blocked waiting for read events
+	 * to become ready. Launch the writer which will constantly be writing
+	 * to each thread's fdmap.
+	 */
+	ret = pthread_create(&wthread, NULL, writerfn, worker);
+	if (ret)
+		err(EXIT_FAILURE, "pthread_create");
+
+	sleep(nsecs);
+	toggle_done(0, NULL, NULL);
+	printinfo("main thread: toggling done\n");
+
+	sleep(1); /* meh */
+	wdone = true;
+	ret = pthread_join(wthread, NULL);
+	if (ret)
+		err(EXIT_FAILURE, "pthread_join");
+
+	/* cleanup & report results */
+	cond_destroy(&thread_parent);
+	cond_destroy(&thread_worker);
+	mutex_destroy(&thread_lock);
+
+	/* sort the array back before reporting */
+	if (randomize)
+		qsort(worker, nthreads, sizeof(struct worker), cmpworker);
+
+	for (i = 0; i < nthreads; i++) {
+		unsigned long t = bench__runtime.tv_sec > 0 ?
+			worker[i].ops / bench__runtime.tv_sec : 0;
+
+		update_stats(&throughput_stats, t);
+
+		if (nfds == 1)
+			printf("[thread %2d] fdmap: %p [ %04ld ops/sec ]\n",
+			       worker[i].tid, &worker[i].fdmap[0], t);
+		else
+			printf("[thread %2d] fdmap: %p ... %p [ %04ld ops/sec ]\n",
+			       worker[i].tid, &worker[i].fdmap[0],
+			       &worker[i].fdmap[nfds-1], t);
+	}
+
+	print_summary();
+
+	if (!multiq) /* epollfd is only opened in the single-queue model */
+		close(epollfd);
+	perf_cpu_map__put(cpu);
+	for (i = 0; i < nthreads; i++)
+		free(worker[i].fdmap);
+
+	free(worker);
+	return ret;
+errmem:
+	err(EXIT_FAILURE, "calloc");
+}
+#endif // HAVE_EVENTFD_SUPPORT
diff --git a/tools/perf/bench/evlist-open-close.c b/tools/perf/bench/evlist-open-close.c
new file mode 100644
index 000000000000..faf9c34b4a5d
--- /dev/null
+++ b/tools/perf/bench/evlist-open-close.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "bench.h"
+#include "../util/debug.h"
+#include "../util/stat.h"
+#include "../util/evlist.h"
+#include "../util/evsel.h"
+#include "../util/strbuf.h"
+#include "../util/record.h"
+#include "../util/parse-events.h"
+#include "internal/threadmap.h"
+#include "internal/cpumap.h"
+#include <linux/perf_event.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+#include <linux/string.h>
+#include <subcmd/parse-options.h>
+
+#define MMAP_FLUSH_DEFAULT 1 /* initial value of opts.mmap_flush */
+
+static int iterations = 100;			/* -i/--iterations */
+static int nr_events = 1;			/* -n/--nr-events */
+static const char *event_string = "dummy";	/* -e/--event */
+
+static inline u64 timeval2usec(struct timeval *tv)
+{
+	return tv->tv_usec + tv->tv_sec * USEC_PER_SEC; /* seconds scaled to usec, plus the rest */
+}
+
+static struct record_opts opts = { /* shared by every evlist this benchmark builds */
+	.sample_time	     = true,
+	.mmap_pages	     = UINT_MAX, /* handed to evlist__mmap() */
+	.user_freq	     = UINT_MAX,
+	.user_interval	     = ULLONG_MAX,
+	.freq		     = 4000,
+	.target		     = { /* further fields are set by -a/-C/-p/-t/--per-thread */
+		.uses_mmap   = true,
+		.default_per_cpu = true,
+	},
+	.mmap_flush          = MMAP_FLUSH_DEFAULT,
+	.nr_threads_synthesize = 1,
+	.ctl_fd              = -1,
+	.ctl_fd_ack          = -1,
+};
+
+/* Sum, over every evsel in @evlist, of its thread count times its CPU count. */
+static int evlist__count_evsel_fds(struct evlist *evlist)
+{
+	struct evsel *pos;
+	int total = 0;
+
+	evlist__for_each_entry(evlist, pos)
+		total += pos->core.threads->nr * perf_cpu_map__nr(pos->core.cpus);
+	return total;
+}
+
+/* Build and configure an evlist for @evstr; returns NULL on any failure. */
+static struct evlist *bench__create_evlist(char *evstr, const char *uid_str)
+{
+	struct parse_events_error err;
+	struct evlist *evlist = evlist__new();
+	int ret;
+
+	if (!evlist) {
+		pr_err("Not enough memory to create evlist\n");
+		return NULL;
+	}
+
+	parse_events_error__init(&err);
+	ret = parse_events(evlist, evstr, &err);
+	if (ret) {
+		parse_events_error__print(&err, evstr);
+		parse_events_error__exit(&err);
+		pr_err("Run 'perf list' for a list of valid events\n");
+		goto out_delete_evlist;
+	}
+	parse_events_error__exit(&err);
+
+	if (uid_str) {
+		uid_t uid = parse_uid(uid_str);
+
+		if (uid == UINT_MAX) {
+			pr_err("Invalid User: %s\n", uid_str);
+			goto out_delete_evlist;
+		}
+		ret = parse_uid_filter(evlist, uid);
+		if (ret)
+			goto out_delete_evlist;
+	}
+	ret = evlist__create_maps(evlist, &opts.target);
+	if (ret < 0) {
+		pr_err("Not enough memory to create thread/cpu maps\n");
+		goto out_delete_evlist;
+	}
+
+	evlist__config(evlist, &opts, NULL);
+
+	return evlist;
+
+out_delete_evlist:
+	evlist__delete(evlist);
+	return NULL;
+}
+
+/* One benchmark sample: open, mmap, enable, disable, munmap and close @evlist. */
+static int bench__do_evlist_open_close(struct evlist *evlist)
+{
+	char msg[STRERR_BUFSIZE];
+	int rc = evlist__open(evlist);
+
+	if (rc < 0) {
+		pr_err("evlist__open: %s\n", str_error_r(errno, msg, sizeof(msg)));
+		return rc;
+	}
+
+	rc = evlist__mmap(evlist, opts.mmap_pages);
+	if (rc < 0) {
+		pr_err("evlist__mmap: %s\n", str_error_r(errno, msg, sizeof(msg)));
+		return rc;
+	}
+
+	evlist__enable(evlist);
+	evlist__disable(evlist);
+	evlist__munmap(evlist);
+	evlist__close(evlist);
+	return 0;
+}
+
+static int bench_evlist_open_close__run(char *evstr, const char *uid_str)
+{
+	// throwaway evlist: used to print statistics only, deleted before the timed loop
+	struct evlist *evlist = bench__create_evlist(evstr, uid_str);
+	double time_average, time_stddev;
+	struct timeval start, end, diff;
+	struct stats time_stats;
+	u64 runtime_us;
+	int i, err;
+
+	if (!evlist)
+		return -ENOMEM;
+
+	init_stats(&time_stats);
+
+	printf("  Number of cpus:\t%d\n", perf_cpu_map__nr(evlist->core.user_requested_cpus));
+	printf("  Number of threads:\t%d\n", evlist->core.threads->nr);
+	printf("  Number of events:\t%d (%d fds)\n",
+		evlist->core.nr_entries, evlist__count_evsel_fds(evlist));
+	printf("  Number of iterations:\t%d\n", iterations);
+
+	evlist__delete(evlist);
+
+	for (i = 0; i < iterations; i++) {
+		pr_debug("Started iteration %d\n", i);
+		evlist = bench__create_evlist(evstr, uid_str); /* created outside the timed region */
+		if (!evlist)
+			return -ENOMEM;
+
+		gettimeofday(&start, NULL);
+		err = bench__do_evlist_open_close(evlist);
+		if (err) {
+			evlist__delete(evlist);
+			return err;
+		}
+
+		gettimeofday(&end, NULL);
+		timersub(&end, &start, &diff);
+		runtime_us = timeval2usec(&diff);
+		update_stats(&time_stats, runtime_us); /* one sample per iteration */
+
+		evlist__delete(evlist); /* deletion is not timed either */
+		pr_debug("Iteration %d took:\t%" PRIu64 "us\n", i, runtime_us);
+	}
+
+	time_average = avg_stats(&time_stats);
+	time_stddev = stddev_stats(&time_stats);
+	printf("  Average open-close took: %.3f usec (+- %.3f usec)\n", time_average, time_stddev);
+
+	return 0;
+}
+
+/* Build "evstr,evstr,...,evstr" holding @n copies; the caller frees the result. */
+static char *bench__repeat_event_string(const char *evstr, int n)
+{
+	char sbuf[STRERR_BUFSIZE];
+	struct strbuf buf;
+	int len = strlen(evstr), rc, idx;
+
+	rc = strbuf_init(&buf, len * n + n);
+	if (rc) {
+		pr_err("strbuf_init: %s\n", str_error_r(rc, sbuf, sizeof(sbuf)));
+		goto out_error;
+	}
+
+	for (idx = 0; idx < n; idx++) {
+		rc = strbuf_add(&buf, evstr, len);
+		if (rc) {
+			pr_err("strbuf_add: %s\n", str_error_r(rc, sbuf, sizeof(sbuf)));
+			goto out_error;
+		}
+		/* a comma after every copy except the last, which gets the terminator */
+		rc = strbuf_addch(&buf, idx == n-1 ? '\0' : ',');
+		if (rc) {
+			pr_err("strbuf_addch: %s\n", str_error_r(rc, sbuf, sizeof(sbuf)));
+			goto out_error;
+		}
+	}
+
+	return strbuf_detach(&buf, NULL);
+
+out_error:
+	strbuf_release(&buf);
+	return NULL;
+}
+
+
+int bench_evlist_open_close(int argc, const char **argv)
+{
+	const char *uid_str = NULL; /* -u/--uid, NULL when not given */
+	const struct option options[] = {
+		OPT_STRING('e', "event", &event_string, "event",
+			   "event selector. use 'perf list' to list available events"),
+		OPT_INTEGER('n', "nr-events", &nr_events,
+			    "number of dummy events to create (default 1). If used with -e, it clones those events n times (1 = no change)"),
+		OPT_INTEGER('i', "iterations", &iterations,
+			    "Number of iterations used to compute average (default=100)"),
+		OPT_BOOLEAN('a', "all-cpus", &opts.target.system_wide,
+			    "system-wide collection from all CPUs"),
+		OPT_STRING('C', "cpu", &opts.target.cpu_list, "cpu",
+			   "list of cpus where to open events"),
+		OPT_STRING('p', "pid", &opts.target.pid, "pid",
+			   "record events on existing process id"),
+		OPT_STRING('t', "tid", &opts.target.tid, "tid",
+			   "record events on existing thread id"),
+		OPT_STRING('u', "uid", &uid_str, "user", "user to profile"),
+		OPT_BOOLEAN(0, "per-thread", &opts.target.per_thread, "use per-thread mmaps"),
+		OPT_END()
+	};
+	const char *const bench_usage[] = {
+		"perf bench internals evlist-open-close <options>",
+		NULL
+	};
+	char *evstr, errbuf[BUFSIZ];
+	int err;
+
+	argc = parse_options(argc, argv, options, bench_usage, 0);
+	if (argc) { /* leftover arguments are a usage error */
+		usage_with_options(bench_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	err = target__validate(&opts.target);
+	if (err) {
+		target__strerror(&opts.target, err, errbuf, sizeof(errbuf));
+		pr_err("%s\n", errbuf);
+		goto out;
+	}
+
+	/* Enable ignoring missing threads when -p option is defined. */
+	opts.ignore_missing_thread = opts.target.pid;
+
+	evstr = bench__repeat_event_string(event_string, nr_events); /* -e string repeated -n times */
+	if (!evstr) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	err = bench_evlist_open_close__run(evstr, uid_str);
+
+	free(evstr);
+out:
+	return err;
+}
diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c
new file mode 100644
index 000000000000..e697c20951bc
--- /dev/null
+++ b/tools/perf/bench/find-bit-bench.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Benchmark find_next_bit and related bit operations.
+ *
+ * Copyright 2020 Google LLC.
+ */
+#include <stdlib.h>
+#include "bench.h"
+#include "../util/stat.h"
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/time64.h>
+#include <subcmd/parse-options.h>
+
+static unsigned int outer_iterations = 5;	/* timed samples taken per configuration */
+static unsigned int inner_iterations = 100000;	/* bitmap walks inside one timed sample */
+
+static const struct option options[] = { /* command-line switches for the two counts above */
+	OPT_UINTEGER('i', "outer-iterations", &outer_iterations,
+		"Number of outer iterations used"),
+	OPT_UINTEGER('j', "inner-iterations", &inner_iterations,
+		"Number of inner iterations used"),
+	OPT_END()
+};
+
+static const char *const bench_usage[] = { /* passed to parse_options()/usage_with_options() */
+	"perf bench mem find_bit <options>",
+	NULL
+};
+
+static unsigned int accumulator;	/* number of workload() calls so far */
+static unsigned int use_of_val;		/* running sum of the values passed in */
+/* Per-bit payload of the benchmark loops: count the call and consume @val. */
+static noinline void workload(int val)
+{
+	accumulator += 1;
+	use_of_val += val;
+}
+
+#if defined(__i386__) || defined(__x86_64__) /* x86: hand-written bit test */
+static bool asm_test_bit(long nr, const unsigned long *addr)
+{
+	bool oldbit; /* receives the tested bit */
+
+	asm volatile("bt %2,%1"
+		     : "=@ccc" (oldbit) /* flag-output operand: carry flag after bt */
+		     : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
+
+	return oldbit;
+}
+#else /* every other architecture falls back to test_bit */
+#define asm_test_bit test_bit
+#endif
+
+/*
+ * Time for_each_set_bit() vs. a test_bit loop for each power-of-two bit count.
+ */
+static int do_for_each_set_bit(unsigned int num_bits)
+{
+	unsigned long *to_test = bitmap_zalloc(num_bits);
+	struct timeval start, end, diff;
+	u64 runtime_us;
+	struct stats fb_time_stats, tb_time_stats;
+	unsigned int bit, i, j, set_bits, skip;
+
+	if (!to_test)
+		return -1; /* bitmap allocation failed */
+
+	for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) {
+		init_stats(&fb_time_stats); /* per-configuration stats, not cumulative */
+		init_stats(&tb_time_stats);
+
+		bitmap_zero(to_test, num_bits);
+		skip = num_bits / set_bits;
+		for (i = 0; i < num_bits; i += skip)
+			__set_bit(i, to_test);
+
+		for (i = 0; i < outer_iterations; i++) {
+#ifndef NDEBUG
+			unsigned int old = accumulator;
+#endif
+
+			gettimeofday(&start, NULL);
+			for (j = 0; j < inner_iterations; j++) {
+				for_each_set_bit(bit, to_test, num_bits)
+					workload(bit);
+			}
+			gettimeofday(&end, NULL);
+			assert(old + (inner_iterations * set_bits) == accumulator);
+			timersub(&end, &start, &diff);
+			runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+			update_stats(&fb_time_stats, runtime_us);
+
+#ifndef NDEBUG
+			old = accumulator;
+#endif
+			gettimeofday(&start, NULL);
+			for (j = 0; j < inner_iterations; j++) {
+				for (bit = 0; bit < num_bits; bit++) {
+					if (asm_test_bit(bit, to_test))
+						workload(bit);
+				}
+			}
+			gettimeofday(&end, NULL);
+			assert(old + (inner_iterations * set_bits) == accumulator);
+			timersub(&end, &start, &diff);
+			runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+			update_stats(&tb_time_stats, runtime_us);
+		}
+
+		printf("%d operations %d bits set of %d bits\n",
+			inner_iterations, set_bits, num_bits);
+		printf("  Average for_each_set_bit took: %.3f usec (+- %.3f usec)\n",
+			avg_stats(&fb_time_stats), stddev_stats(&fb_time_stats));
+		printf("  Average test_bit loop took:    %.3f usec (+- %.3f usec)\n",
+			avg_stats(&tb_time_stats), stddev_stats(&tb_time_stats));
+
+		if (use_of_val == accumulator)  /* Try to avoid compiler tricks. */
+			printf("\n");
+	}
+	bitmap_free(to_test);
+	return 0;
+}
+
+/* Entry point: run the comparison for bitmap sizes 1, 2, 4, ... 2048 bits. */
+int bench_mem_find_bit(int argc, const char **argv)
+{
+	int err = 0, i;
+
+	argc = parse_options(argc, argv, options, bench_usage, 0);
+	if (argc) {
+		usage_with_options(bench_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	for (i = 1; i <= 2048 && !err; i <<= 1)
+		err = do_for_each_set_bit(i); /* stop at the first size that fails */
+	return err;
+}
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
new file mode 100644
index 000000000000..7e29f04da744
--- /dev/null
+++ b/tools/perf/bench/futex-hash.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013  Davidlohr Bueso <davidlohr@hp.com>
+ *
+ * futex-hash: Stress the hell out of the Linux kernel futex uaddr hashing.
+ *
+ * This program is particularly useful for measuring the kernel's futex hash
+ * table/function implementation. In order for it to make sense, use with as
+ * many threads and futexes as possible.
+ */
+
+/* For the CLR_() macros */
+#include <string.h>
+#include <pthread.h>
+
+#include <errno.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/zalloc.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <perf/cpumap.h>
+
+#include "../util/mutex.h"
+#include "../util/stat.h"
+#include <subcmd/parse-options.h>
+#include "bench.h"
+#include "futex.h"
+
+#include <err.h>
+
+static bool done = false;	/* set by toggle_done(); ends the workers' loops */
+static int futex_flag = 0;	/* FUTEX_PRIVATE_FLAG unless -S/--shared is given */
+
+struct timeval bench__start, bench__end, bench__runtime; /* the externs declared in bench.h */
+static struct mutex thread_lock;		/* guards threads_starting */
+static unsigned int threads_starting;	/* workers that have not checked in yet */
+static struct stats throughput_stats;	/* per-thread ops/sec samples */
+static struct cond thread_parent, thread_worker; /* start-up handshake between main and workers */
+
+struct worker {
+	int tid;		/* index of the worker, set at creation */
+	u_int32_t *futex;	/* array of params.nfutexes futex words */
+	pthread_t thread;
+	unsigned long ops;	/* futex_wait() calls issued, written back at exit */
+};
+
+static struct bench_futex_parameters params = {
+	.nfutexes = 1024,
+	.runtime  = 10,
+	.nbuckets = -1,
+};
+
+static const struct option options[] = {
+	OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
+	OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"),
+	OPT_UINTEGER('r', "runtime", &params.runtime, "Specify runtime (in seconds)"),
+	OPT_UINTEGER('f', "futexes", &params.nfutexes, "Specify amount of futexes per threads"),
+	OPT_BOOLEAN( 's', "silent",  &params.silent, "Silent mode: do not display data/details"),
+	OPT_BOOLEAN( 'S', "shared",  &params.fshared, "Use shared futexes instead of private ones"),
+	OPT_BOOLEAN( 'm', "mlockall", &params.mlockall, "Lock all current and future memory"),
+	OPT_END()
+};
+
+static const char * const bench_futex_hash_usage[] = {
+	"perf bench futex hash <options>",
+	NULL
+};
+
+/* Worker thread: call futex_wait() on each of this worker's futexes until done. */
+static void *workerfn(void *arg)
+{
+	struct worker *w = (struct worker *) arg;
+	unsigned int i;
+	unsigned long ops = w->ops; /* avoid cacheline bouncing */
+
+	mutex_lock(&thread_lock);
+	threads_starting--;
+	if (!threads_starting)
+		cond_signal(&thread_parent);
+	cond_wait(&thread_worker, &thread_lock);
+	mutex_unlock(&thread_lock);
+
+	do {
+		for (i = 0; i < params.nfutexes; i++, ops++) {
+			/*
+			 * We want the futex calls to fail in order to stress the hashing of
+			 * uaddr and not measure other steps, such as internal waitqueue
+			 * handling, thus enlarging the critical region protected by hb->lock.
+			 */
+			int ret = futex_wait(&w->futex[i], 1234, NULL, futex_flag);
+			/* warn on success, or on an errno that is neither of the two codes */
+			if (!params.silent &&
+			    (!ret || (errno != EAGAIN && errno != EWOULDBLOCK)))
+				warn("Non-expected futex return call");
+		}
+	}  while (!done);
+
+	w->ops = ops;
+	return NULL;
+}
+
+static void toggle_done(int sig __maybe_unused, siginfo_t *info __maybe_unused,
+			void *uc __maybe_unused)
+{
+	/* SIGINT handler, also called directly once the run time has elapsed */
+	done = true;
+	/* freeze the elapsed wall-clock time that the report divides by */
+	gettimeofday(&bench__end, NULL);
+	timersub(&bench__end, &bench__start, &bench__runtime);
+}
+
+static void print_summary(void)
+{
+	/* the mean is truncated to an integer before the relative stddev is derived */
+	unsigned long mean = avg_stats(&throughput_stats);
+	double rel = rel_stddev_stats(stddev_stats(&throughput_stats), mean);
+
+	printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n",
+	       !params.silent ? "\n" : "", mean, rel, (int)bench__runtime.tv_sec);
+	futex_print_nbuckets(&params);
+}
+
+/*
+ * Run the futex hash benchmark: one CPU-pinned thread per worker issues failing
+ * futex_wait() calls for params.runtime seconds, then per-thread ops/sec is printed.
+ */
+int bench_futex_hash(int argc, const char **argv)
+{
+	int ret = 0;
+	cpu_set_t *cpuset;
+	struct sigaction act;
+	unsigned int i;
+	pthread_attr_t thread_attr;
+	struct worker *worker = NULL;
+	struct perf_cpu_map *cpu;
+	int nrcpus;
+	size_t size;
+
+	argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
+	if (argc) {
+		usage_with_options(bench_futex_hash_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	cpu = perf_cpu_map__new_online_cpus();
+	if (!cpu)
+		goto errmem;
+
+	memset(&act, 0, sizeof(act));
+	sigfillset(&act.sa_mask);
+	act.sa_sigaction = toggle_done;
+	sigaction(SIGINT, &act, NULL);
+
+	if (params.mlockall && mlockall(MCL_CURRENT | MCL_FUTURE))
+		err(EXIT_FAILURE, "mlockall");
+
+	if (!params.nthreads) /* default to the number of CPUs */
+		params.nthreads = perf_cpu_map__nr(cpu);
+
+	worker = calloc(params.nthreads, sizeof(*worker));
+	if (!worker)
+		goto errmem;
+
+	if (!params.fshared)
+		futex_flag = FUTEX_PRIVATE_FLAG;
+	futex_set_nbuckets_param(&params);
+
+	printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n",
+	       getpid(), params.nthreads, params.nfutexes, params.fshared ? "shared":"private", params.runtime);
+
+	init_stats(&throughput_stats);
+	mutex_init(&thread_lock);
+	cond_init(&thread_parent);
+	cond_init(&thread_worker);
+
+	threads_starting = params.nthreads;
+	pthread_attr_init(&thread_attr);
+	gettimeofday(&bench__start, NULL);
+
+	nrcpus = cpu__max_cpu().cpu;
+	cpuset = CPU_ALLOC(nrcpus);
+	BUG_ON(!cpuset);
+	size = CPU_ALLOC_SIZE(nrcpus);
+
+	for (i = 0; i < params.nthreads; i++) {
+		worker[i].tid = i;
+		worker[i].futex = calloc(params.nfutexes, sizeof(*worker[i].futex));
+		if (!worker[i].futex)
+			goto errmem;
+
+		CPU_ZERO_S(size, cpuset);
+
+		CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset);
+		ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset);
+		if (ret) {
+			CPU_FREE(cpuset);
+			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+		}
+		ret = pthread_create(&worker[i].thread, &thread_attr, workerfn, &worker[i]);
+		if (ret) {
+			CPU_FREE(cpuset);
+			err(EXIT_FAILURE, "pthread_create");
+		}
+	}
+	CPU_FREE(cpuset);
+	pthread_attr_destroy(&thread_attr);
+
+	mutex_lock(&thread_lock);
+	while (threads_starting)
+		cond_wait(&thread_parent, &thread_lock);
+	cond_broadcast(&thread_worker);
+	mutex_unlock(&thread_lock);
+
+	sleep(params.runtime);
+	toggle_done(0, NULL, NULL);
+
+	for (i = 0; i < params.nthreads; i++) {
+		ret = pthread_join(worker[i].thread, NULL);
+		if (ret)
+			err(EXIT_FAILURE, "pthread_join");
+	}
+
+	/* cleanup & report results */
+	cond_destroy(&thread_parent);
+	cond_destroy(&thread_worker);
+	mutex_destroy(&thread_lock);
+
+	for (i = 0; i < params.nthreads; i++) {
+		unsigned long t = bench__runtime.tv_sec > 0 ?
+			worker[i].ops / bench__runtime.tv_sec : 0;
+		update_stats(&throughput_stats, t);
+		if (!params.silent) {
+			if (params.nfutexes == 1)
+				printf("[thread %2d] futex: %p [ %ld ops/sec ]\n",
+				       worker[i].tid, &worker[i].futex[0], t);
+			else
+				printf("[thread %2d] futexes: %p ... %p [ %ld ops/sec ]\n",
+				       worker[i].tid, &worker[i].futex[0],
+				       &worker[i].futex[params.nfutexes-1], t);
+		}
+
+		zfree(&worker[i].futex);
+	}
+
+	print_summary();
+
+	free(worker);
+	perf_cpu_map__put(cpu); /* release like the sibling benchmarks, not free() */
+	return ret;
+errmem:
+	err(EXIT_FAILURE, "calloc");
+}
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
new file mode 100644
index 000000000000..40640b674427
--- /dev/null
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2015 Davidlohr Bueso.
+ */
+
+/* For the CLR_() macros */
+#include <string.h>
+#include <pthread.h>
+
+#include <signal.h>
+#include "../util/mutex.h"
+#include "../util/stat.h"
+#include <subcmd/parse-options.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/zalloc.h>
+#include <errno.h>
+#include <perf/cpumap.h>
+#include "bench.h"
+#include "futex.h"
+
+#include <err.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+
+struct worker {
+	int tid;		/* index of the worker, set at creation */
+	u_int32_t *futex;	/* own futex with -M/--multi, else &global_futex */
+	pthread_t thread;
+	unsigned long ops;	/* lock/unlock pairs completed, written back at exit */
+};
+
+static u_int32_t global_futex = 0;	/* the single contended lock when -M is not given */
+static struct worker *worker;		/* array of params.nthreads workers */
+static bool done = false;		/* set by toggle_done(); ends the workers' loops */
+static int futex_flag = 0;		/* FUTEX_PRIVATE_FLAG unless -S/--shared is given */
+static struct mutex thread_lock;	/* guards threads_starting */
+static unsigned int threads_starting;	/* workers that have not checked in yet */
+static struct stats throughput_stats;	/* per-thread ops/sec samples */
+static struct cond thread_parent, thread_worker; /* start-up handshake between main and workers */
+
+static struct bench_futex_parameters params = {
+	.nbuckets = -1,
+	.runtime  = 10,
+};
+
+static const struct option options[] = {
+	OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
+	OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"),
+	OPT_UINTEGER('r', "runtime", &params.runtime, "Specify runtime (in seconds)"),
+	OPT_BOOLEAN( 'M', "multi",   &params.multi, "Use multiple futexes"),
+	OPT_BOOLEAN( 's', "silent",  &params.silent, "Silent mode: do not display data/details"),
+	OPT_BOOLEAN( 'S', "shared",  &params.fshared, "Use shared futexes instead of private ones"),
+	OPT_BOOLEAN( 'm', "mlockall", &params.mlockall, "Lock all current and future memory"),
+	OPT_END()
+};
+
+static const char * const bench_futex_lock_pi_usage[] = {
+	"perf bench futex lock-pi <options>",
+	NULL
+};
+
+static void print_summary(void)
+{
+	/* the mean is truncated to an integer before the relative stddev is derived */
+	unsigned long mean = avg_stats(&throughput_stats);
+	double rel = rel_stddev_stats(stddev_stats(&throughput_stats), mean);
+
+	printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n",
+	       !params.silent ? "\n" : "", mean, rel, (int)bench__runtime.tv_sec);
+	futex_print_nbuckets(&params);
+}
+
+static void toggle_done(int sig __maybe_unused, siginfo_t *info __maybe_unused,
+			void *uc __maybe_unused)
+{
+	/* SIGINT handler, also called directly once the run time has elapsed */
+	done = true;
+	/* freeze the elapsed wall-clock time that the report divides by */
+	gettimeofday(&bench__end, NULL);
+	timersub(&bench__end, &bench__start, &bench__runtime);
+}
+
+static void *workerfn(void *arg)
+{
+	struct worker *w = (struct worker *) arg;
+	unsigned long ops = w->ops; /* counted locally, written back once at exit */
+
+	mutex_lock(&thread_lock);
+	threads_starting--;
+	if (!threads_starting)
+		cond_signal(&thread_parent); /* last worker to check in wakes the parent */
+	cond_wait(&thread_worker, &thread_lock); /* wait for the parent's broadcast */
+	mutex_unlock(&thread_lock);
+
+	do {
+		int ret;
+	again:
+		ret = futex_lock_pi(w->futex, NULL, futex_flag);
+
+		if (ret) { /* handle lock acquisition */
+			if (!params.silent)
+				warn("thread %d: Could not lock pi-lock for %p (%d)",
+				     w->tid, w->futex, ret);
+			if (done)
+				break; /* leaves the do/while without counting an op */
+
+			goto again; /* retry the lock until it succeeds or done is set */
+		}
+
+		usleep(1); /* sleep briefly while holding the lock */
+		ret = futex_unlock_pi(w->futex, futex_flag);
+		if (ret && !params.silent)
+			warn("thread %d: Could not unlock pi-lock for %p (%d)",
+			     w->tid, w->futex, ret);
+		ops++; /* account for thread's share of work */
+	}  while (!done);
+
+	w->ops = ops;
+	return NULL;
+}
+
+/* Start params.nthreads pinned workers from @w, spread round-robin over @cpu. */
+static void create_threads(struct worker *w, struct perf_cpu_map *cpu)
+{
+	cpu_set_t *cpuset;
+	unsigned int i;
+	int nrcpus =  cpu__max_cpu().cpu;
+	size_t size;
+
+	threads_starting = params.nthreads;
+
+	cpuset = CPU_ALLOC(nrcpus);
+	BUG_ON(!cpuset);
+	size = CPU_ALLOC_SIZE(nrcpus);
+
+	for (i = 0; i < params.nthreads; i++) {
+		pthread_attr_t thread_attr;
+
+		pthread_attr_init(&thread_attr);
+		w[i].tid = i; /* always the caller's array, never the file-scope one */
+
+		if (params.multi) {
+			w[i].futex = calloc(1, sizeof(*w[i].futex));
+			if (!w[i].futex)
+				err(EXIT_FAILURE, "calloc");
+		} else
+			w[i].futex = &global_futex;
+
+		CPU_ZERO_S(size, cpuset);
+		CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset);
+		if (pthread_attr_setaffinity_np(&thread_attr, size, cpuset)) {
+			CPU_FREE(cpuset);
+			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+		}
+
+		if (pthread_create(&w[i].thread, &thread_attr, workerfn, &w[i])) {
+			CPU_FREE(cpuset);
+			err(EXIT_FAILURE, "pthread_create");
+		}
+		pthread_attr_destroy(&thread_attr);
+	}
+	CPU_FREE(cpuset);
+}
+
+int bench_futex_lock_pi(int argc, const char **argv)
+{
+	int ret = 0;
+	unsigned int i;
+	struct sigaction act;
+	struct perf_cpu_map *cpu;
+
+	argc = parse_options(argc, argv, options, bench_futex_lock_pi_usage, 0);
+	if (argc) /* leftover arguments are a usage error */
+		goto err;
+
+	cpu = perf_cpu_map__new_online_cpus();
+	if (!cpu)
+		err(EXIT_FAILURE, "calloc");
+
+	memset(&act, 0, sizeof(act));
+	sigfillset(&act.sa_mask);
+	act.sa_sigaction = toggle_done; /* SIGINT ends the run early */
+	sigaction(SIGINT, &act, NULL);
+
+	if (params.mlockall) {
+		if (mlockall(MCL_CURRENT | MCL_FUTURE))
+			err(EXIT_FAILURE, "mlockall");
+	}
+
+	if (!params.nthreads) /* default to one thread per online CPU */
+		params.nthreads = perf_cpu_map__nr(cpu);
+
+	worker = calloc(params.nthreads, sizeof(*worker));
+	if (!worker)
+		err(EXIT_FAILURE, "calloc");
+
+	if (!params.fshared)
+		futex_flag = FUTEX_PRIVATE_FLAG;
+
+	printf("Run summary [PID %d]: %d threads doing pi lock/unlock pairing for %d secs.\n\n",
+	       getpid(), params.nthreads, params.runtime);
+
+	init_stats(&throughput_stats);
+	mutex_init(&thread_lock);
+	cond_init(&thread_parent);
+	cond_init(&thread_worker);
+	futex_set_nbuckets_param(&params);
+
+	threads_starting = params.nthreads;
+	gettimeofday(&bench__start, NULL);
+
+	create_threads(worker, cpu);
+
+	mutex_lock(&thread_lock);
+	while (threads_starting) /* wait until every worker has checked in */
+		cond_wait(&thread_parent, &thread_lock);
+	cond_broadcast(&thread_worker); /* release all workers at once */
+	mutex_unlock(&thread_lock);
+
+	sleep(params.runtime);
+	toggle_done(0, NULL, NULL); /* same path as the SIGINT handler */
+
+	for (i = 0; i < params.nthreads; i++) {
+		ret = pthread_join(worker[i].thread, NULL);
+		if (ret)
+			err(EXIT_FAILURE, "pthread_join");
+	}
+
+	/* cleanup & report results */
+	cond_destroy(&thread_parent);
+	cond_destroy(&thread_worker);
+	mutex_destroy(&thread_lock);
+
+	for (i = 0; i < params.nthreads; i++) {
+		unsigned long t = bench__runtime.tv_sec > 0 ?
+			worker[i].ops / bench__runtime.tv_sec : 0; /* 0 for runs under one second */
+
+		update_stats(&throughput_stats, t);
+		if (!params.silent)
+			printf("[thread %3d] futex: %p [ %ld ops/sec ]\n",
+			       worker[i].tid, worker[i].futex, t);
+
+		if (params.multi) /* only per-thread futexes were heap allocated */
+			zfree(&worker[i].futex);
+	}
+
+	print_summary();
+
+	free(worker);
+	perf_cpu_map__put(cpu);
+	return ret;
+err:
+	usage_with_options(bench_futex_lock_pi_usage, options);
+	exit(EXIT_FAILURE);
+}
diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c
new file mode 100644
index 000000000000..0748b0fd689e
--- /dev/null
+++ b/tools/perf/bench/futex-requeue.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013  Davidlohr Bueso <davidlohr@hp.com>
+ *
+ * futex-requeue: Block a bunch of threads on futex1 and requeue them
+ *                on futex2, N at a time.
+ *
+ * This program is particularly useful to measure the latency of nthread
+ * requeues without waking up any tasks (in the non-pi case) -- thus
+ * mimicking a regular futex_wait.
+ */
+
+/* For the CLR_() macros */
+#include <string.h>
+#include <pthread.h>
+
+#include <signal.h>
+#include "../util/mutex.h"
+#include "../util/stat.h"
+#include <subcmd/parse-options.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+#include <errno.h>
+#include <perf/cpumap.h>
+#include "bench.h"
+#include "futex.h"
+
+#include <err.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+
+static u_int32_t futex1 = 0, futex2 = 0;	/* requeue source (futex1) and target (futex2) words */
+
+static pthread_t *worker;	/* handles of the blocked threads, params.nthreads entries */
+static bool done = false;	/* raised by toggle_done(); stops the repeat loop */
+static struct mutex thread_lock;	/* held around threads_starting updates and the condition waits */
+static struct cond thread_parent, thread_worker;	/* parent waits on the first, workers on the second */
+static struct stats requeuetime_stats, requeued_stats;	/* per-run requeue time and requeue count */
+static unsigned int threads_starting;	/* workers that have not checked in yet */
+static int futex_flag = 0;	/* FUTEX_PRIVATE_FLAG unless --shared was given */
+
+static struct bench_futex_parameters params = {
+	.nbuckets = -1,	/* negative: futex_set_nbuckets_param() changes nothing */
+	/*
+	 * How many tasks to requeue at a time.
+	 * Default to 1 in order to make the kernel work more.
+	 */
+	.nrequeue = 1,
+};
+
+static const struct option options[] = {	/* handed to parse_options() by bench_futex_requeue() */
+	OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
+	OPT_UINTEGER('t', "threads",  &params.nthreads, "Specify amount of threads"),
+	OPT_UINTEGER('q', "nrequeue", &params.nrequeue, "Specify amount of threads to requeue at once"),
+	OPT_BOOLEAN( 's', "silent",   &params.silent, "Silent mode: do not display data/details"),
+	OPT_BOOLEAN( 'S', "shared",   &params.fshared, "Use shared futexes instead of private ones"),
+	OPT_BOOLEAN( 'm', "mlockall", &params.mlockall, "Lock all current and future memory"),
+	OPT_BOOLEAN( 'B', "broadcast", &params.broadcast, "Requeue all threads at once"),
+	OPT_BOOLEAN( 'p', "pi", &params.pi, "Use PI-aware variants of FUTEX_CMP_REQUEUE"),
+
+	OPT_END()
+};
+
+static const char * const bench_futex_requeue_usage[] = {	/* shown by usage_with_options() */
+	"perf bench futex requeue <options>",
+	NULL
+};
+
+/*
+ * Print the averages accumulated over all runs (threads requeued, time taken
+ * and its relative standard deviation), followed by the futex hash mode.
+ */
+static void print_summary(void)
+{
+	const double avg_usec = avg_stats(&requeuetime_stats);
+	const double dev_usec = stddev_stats(&requeuetime_stats);
+	/* the average count is truncated to a whole number of threads */
+	const unsigned int nr_moved = avg_stats(&requeued_stats);
+	const double avg_msec = avg_usec / USEC_PER_MSEC;
+
+	printf("Requeued %d of %d threads in %.4f ms (+-%.2f%%)\n",
+	       nr_moved, params.nthreads, avg_msec,
+	       rel_stddev_stats(dev_usec, avg_usec));
+	futex_print_nbuckets(&params);
+}
+
+/*
+ * Blocked thread: check in with the parent, wait for the go-ahead, then sleep
+ * on futex1 until woken. With --pi the thread waits with
+ * futex_wait_requeue_pi() and releases futex2 once it has been handed over.
+ */
+static void *workerfn(void *arg __maybe_unused)
+{
+	mutex_lock(&thread_lock);
+	/* the last thread to check in tells the parent everybody has started */
+	if (--threads_starting == 0)
+		cond_signal(&thread_parent);
+	cond_wait(&thread_worker, &thread_lock);
+	mutex_unlock(&thread_lock);
+
+	for (;;) {
+		int rc;
+
+		if (params.pi) {
+			rc = futex_wait_requeue_pi(&futex1, 0, &futex2,
+						   NULL, futex_flag);
+			if (rc == 0) {
+				/* got the lock at futex2 */
+				futex_unlock_pi(&futex2, futex_flag);
+				return NULL;
+			}
+		} else {
+			rc = futex_wait(&futex1, 0, NULL, futex_flag);
+			if (rc == 0)
+				return NULL;
+		}
+
+		/* the wait failed: only EAGAIN leads to another attempt */
+		if (errno == EAGAIN)
+			continue;
+
+		if (!params.silent) {
+			if (params.pi)
+				warnx("futex_wait_requeue_pi");
+			else
+				warnx("futex_wait");
+		}
+		return NULL;
+	}
+}
+
+/*
+ * Create params.nthreads worker threads and store their handles in @w.
+ *
+ * Worker i is pinned, through its thread attributes, to entry
+ * (i % perf_cpu_map__nr(@cpu)) of @cpu. threads_starting is reset first so
+ * that the caller can wait until every worker has checked in. Any failure
+ * terminates the process.
+ */
+static void block_threads(pthread_t *w, struct perf_cpu_map *cpu)
+{
+	cpu_set_t *cpuset;
+	unsigned int i;
+	int nrcpus = cpu__max_cpu().cpu;
+	size_t size;
+
+	threads_starting = params.nthreads;
+
+	cpuset = CPU_ALLOC(nrcpus);
+	BUG_ON(!cpuset);
+	size = CPU_ALLOC_SIZE(nrcpus);
+
+	/* create and block all threads */
+	for (i = 0; i < params.nthreads; i++) {
+		pthread_attr_t thread_attr;
+		int ret;
+
+		pthread_attr_init(&thread_attr);
+		CPU_ZERO_S(size, cpuset);
+		CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset);
+
+		/*
+		 * pthread functions return the error number instead of setting
+		 * errno, so store it there (after CPU_FREE()) for err() to report.
+		 */
+		ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset);
+		if (ret) {
+			CPU_FREE(cpuset);
+			errno = ret;
+			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+		}
+
+		ret = pthread_create(&w[i], &thread_attr, workerfn, NULL);
+		if (ret) {
+			CPU_FREE(cpuset);
+			errno = ret;
+			err(EXIT_FAILURE, "pthread_create");
+		}
+		pthread_attr_destroy(&thread_attr);
+	}
+	CPU_FREE(cpuset);
+}
+
+static void toggle_done(int sig __maybe_unused,
+			siginfo_t *info __maybe_unused,
+			void *uc __maybe_unused)
+{
+	done = true;	/* installed for SIGINT: only raises the flag checked by the repeat loop */
+}
+
+/*
+ * Entry point of "perf bench futex requeue".
+ *
+ * Each of the bench_repeat runs blocks params.nthreads workers on futex1,
+ * measures how long it takes to requeue all of them to futex2,
+ * params.nrequeue at a time, and then lets the workers finish. Per-run
+ * results are accumulated and reported by print_summary(). SIGINT stops the
+ * benchmark after the current run.
+ *
+ * Returns 0; every failure terminates the process instead.
+ */
+int bench_futex_requeue(int argc, const char **argv)
+{
+	int ret = 0;
+	unsigned int i, j;
+	struct sigaction act;
+	struct perf_cpu_map *cpu;
+
+	argc = parse_options(argc, argv, options, bench_futex_requeue_usage, 0);
+	if (argc)
+		goto err;
+
+	cpu = perf_cpu_map__new_online_cpus();
+	if (!cpu)
+		err(EXIT_FAILURE, "cpu_map__new");
+
+	memset(&act, 0, sizeof(act));
+	sigfillset(&act.sa_mask);
+	act.sa_sigaction = toggle_done;
+	sigaction(SIGINT, &act, NULL);
+
+	if (params.mlockall) {
+		if (mlockall(MCL_CURRENT | MCL_FUTURE))
+			err(EXIT_FAILURE, "mlockall");
+	}
+
+	/* default: one thread per entry of the online CPU map */
+	if (!params.nthreads)
+		params.nthreads = perf_cpu_map__nr(cpu);
+
+	worker = calloc(params.nthreads, sizeof(*worker));
+	if (!worker)
+		err(EXIT_FAILURE, "calloc");
+
+	if (!params.fshared)
+		futex_flag = FUTEX_PRIVATE_FLAG;
+
+	/* never ask for more threads per call than there are threads */
+	if (params.nrequeue > params.nthreads)
+		params.nrequeue = params.nthreads;
+
+	if (params.broadcast)
+		params.nrequeue = params.nthreads;
+
+	futex_set_nbuckets_param(&params);
+
+	printf("Run summary [PID %d]: Requeuing %d threads (from [%s] %p to %s%p), "
+	       "%d at a time.\n\n",  getpid(), params.nthreads,
+	       params.fshared ? "shared":"private", &futex1,
+	       params.pi ? "PI ": "", &futex2, params.nrequeue);
+
+	init_stats(&requeued_stats);
+	init_stats(&requeuetime_stats);
+	mutex_init(&thread_lock);
+	cond_init(&thread_parent);
+	cond_init(&thread_worker);
+
+	for (j = 0; j < bench_repeat && !done; j++) {
+		unsigned int nrequeued = 0, wakeups = 0;
+		struct timeval start, end, runtime;
+
+		/* create, launch & block all threads */
+		block_threads(worker, cpu);
+
+		/* make sure all threads are already blocked */
+		mutex_lock(&thread_lock);
+		while (threads_starting)
+			cond_wait(&thread_parent, &thread_lock);
+		cond_broadcast(&thread_worker);
+		mutex_unlock(&thread_lock);
+
+		/* NOTE(review): presumably lets the workers reach their futex wait - confirm */
+		usleep(100000);
+
+		/* Ok, all threads are patiently blocked, start requeueing */
+		gettimeofday(&start, NULL);
+		while (nrequeued < params.nthreads) {
+			int r;
+
+			/*
+			 * For the regular non-pi case, do not wakeup any tasks
+			 * blocked on futex1, allowing us to really measure
+			 * futex_wait functionality. For the PI case the first
+			 * waiter is always awoken.
+			 */
+			if (!params.pi) {
+				r = futex_cmp_requeue(&futex1, 0, &futex2, 0,
+						      params.nrequeue,
+						      futex_flag);
+			} else {
+				r = futex_cmp_requeue_pi(&futex1, 0, &futex2,
+							 params.nrequeue,
+							 futex_flag);
+				wakeups++; /* assume no error */
+			}
+
+			if (r < 0)
+				err(EXIT_FAILURE, "couldn't requeue from %p to %p",
+				    &futex1, &futex2);
+
+			nrequeued += r;
+		}
+
+		gettimeofday(&end, NULL);
+		timersub(&end, &start, &runtime);
+
+		update_stats(&requeued_stats, nrequeued);
+		/* NOTE(review): only tv_usec is recorded, a run of 1s or more is under-reported */
+		update_stats(&requeuetime_stats, runtime.tv_usec);
+
+		if (!params.silent) {
+			if (!params.pi)
+				printf("[Run %d]: Requeued %d of %d threads in "
+				       "%.4f ms\n", j + 1, nrequeued,
+				       params.nthreads,
+				       runtime.tv_usec / (double)USEC_PER_MSEC);
+			else {
+				/* report woken and requeued threads separately */
+				nrequeued -= wakeups;
+				printf("[Run %d]: Awoke and Requeued (%d+%d) of "
+				       "%d threads in %.4f ms\n",
+				       j + 1, wakeups, nrequeued,
+				       params.nthreads,
+				       runtime.tv_usec / (double)USEC_PER_MSEC);
+			}
+
+		}
+
+		if (!params.pi) {
+			/* everybody should be blocked on futex2, wake'em up */
+			nrequeued = futex_wake(&futex2, nrequeued, futex_flag);
+			if (params.nthreads != nrequeued)
+				warnx("couldn't wakeup all tasks (%d/%d)",
+				      nrequeued, params.nthreads);
+		}
+
+		for (i = 0; i < params.nthreads; i++) {
+			ret = pthread_join(worker[i], NULL);
+			if (ret) {
+				/* pthread_join() returns the error, errno is not set */
+				errno = ret;
+				err(EXIT_FAILURE, "pthread_join");
+			}
+		}
+	}
+
+	/* cleanup & report results */
+	cond_destroy(&thread_parent);
+	cond_destroy(&thread_worker);
+	mutex_destroy(&thread_lock);
+
+	print_summary();
+
+	free(worker);
+	perf_cpu_map__put(cpu);
+	return ret;
+err:
+	/* parse_options() returned non-zero: print the usage text and give up */
+	usage_with_options(bench_futex_requeue_usage, options);
+	exit(EXIT_FAILURE);
+}
diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c
new file mode 100644
index 000000000000..6aede7c46b33
--- /dev/null
+++ b/tools/perf/bench/futex-wake-parallel.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2015 Davidlohr Bueso.
+ *
+ * Block a bunch of threads and let parallel waker threads wakeup an
+ * equal amount of them. The program output reflects the avg latency
+ * for each individual thread to service its share of work. Ultimately
+ * it can be used to measure futex_wake() changes.
+ */
+#include "bench.h"
+#include <linux/compiler.h>
+#include "../util/debug.h"
+#include "../util/mutex.h"
+
+#ifndef HAVE_PTHREAD_BARRIER
+int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe_unused)
+{
+	pr_err("%s: pthread_barrier_t unavailable, disabling this test...\n", __func__);
+	return 0;	/* stub built without HAVE_PTHREAD_BARRIER: reports the test as disabled, returns 0 */
+}
+#else /* HAVE_PTHREAD_BARRIER */
+/* For the CLR_() macros */
+#include <string.h>
+#include <pthread.h>
+
+#include <signal.h>
+#include "../util/stat.h"
+#include <subcmd/parse-options.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+#include <errno.h>
+#include "futex.h"
+#include <perf/cpumap.h>
+
+#include <err.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+
+struct thread_data {	/* one entry per waker thread */
+	pthread_t worker;	/* handle of the waker thread */
+	unsigned int nwoken;	/* result of its futex_wake() call */
+	struct timeval runtime;	/* time spent around that futex_wake() call */
+};
+
+static unsigned int nwakes = 1;	/* waiters each waker wakes: params.nthreads / params.nwakes */
+
+/* all threads will block on the same futex -- hash bucket chaos ;) */
+static u_int32_t futex = 0;
+
+static pthread_t *blocked_worker;	/* handles of the blocked threads, params.nthreads entries */
+static bool done = false;	/* raised by toggle_done(); stops the repeat loop */
+static struct mutex thread_lock;	/* held around threads_starting updates and the condition waits */
+static struct cond thread_parent, thread_worker;	/* parent waits on the first, blocked threads on the second */
+static pthread_barrier_t barrier;	/* releases all wakers at once, see wakeup_threads() */
+static struct stats waketime_stats, wakeup_stats;	/* accumulated over all runs by do_run_stats() */
+static unsigned int threads_starting;	/* blocked threads that have not checked in yet */
+static int futex_flag = 0;	/* FUTEX_PRIVATE_FLAG unless --shared was given */
+
+static struct bench_futex_parameters params = {
+	.nbuckets = -1,	/* negative: futex_set_nbuckets_param() changes nothing */
+};
+
+static const struct option options[] = {	/* handed to parse_options() by bench_futex_wake_parallel() */
+	OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
+	OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"),
+	OPT_UINTEGER('w', "nwakers", &params.nwakes, "Specify amount of waking threads"),
+	OPT_BOOLEAN( 's', "silent",  &params.silent, "Silent mode: do not display data/details"),
+	OPT_BOOLEAN( 'S', "shared",  &params.fshared, "Use shared futexes instead of private ones"),
+	OPT_BOOLEAN( 'm', "mlockall", &params.mlockall, "Lock all current and future memory"),
+
+	OPT_END()
+};
+
+static const char * const bench_futex_wake_parallel_usage[] = {	/* shown by usage_with_options() */
+	"perf bench futex wake-parallel <options>",
+	NULL
+};
+
+/*
+ * Waker thread: meet the other wakers and the parent at the barrier, then
+ * issue a single futex_wake() for nwakes waiters. The number of tasks woken
+ * and the elapsed time are stored in @arg, a struct thread_data.
+ */
+static void *waking_workerfn(void *arg)
+{
+	struct thread_data *self = arg;
+	struct timeval t0, t1;
+
+	pthread_barrier_wait(&barrier);
+
+	gettimeofday(&t0, NULL);
+	self->nwoken = futex_wake(&futex, nwakes, futex_flag);
+	if (self->nwoken != nwakes) {
+		/* note: the warning is emitted inside the timed section */
+		warnx("couldn't wakeup all tasks (%d/%d)",
+		      self->nwoken, nwakes);
+	}
+	gettimeofday(&t1, NULL);
+
+	timersub(&t1, &t0, &self->runtime);
+
+	pthread_exit(NULL);
+	return NULL;
+}
+
+/*
+ * Start params.nwakes waker threads, release them all at once through the
+ * barrier and wait for them to finish. Each waker leaves its result in its
+ * own entry of @td. Any pthread failure terminates the process.
+ */
+static void wakeup_threads(struct thread_data *td)
+{
+	unsigned int i;
+	int ret;
+	pthread_attr_t thread_attr;
+
+	pthread_attr_init(&thread_attr);
+	pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE);
+
+	/*
+	 * pthread functions return the error number instead of setting errno,
+	 * so store it there for err() to report. The barrier has one extra
+	 * participant: this thread, which releases the wakers.
+	 */
+	ret = pthread_barrier_init(&barrier, NULL, params.nwakes + 1);
+	if (ret) {
+		errno = ret;
+		err(EXIT_FAILURE, "pthread_barrier_init");
+	}
+
+	/* create and block all threads */
+	for (i = 0; i < params.nwakes; i++) {
+		/*
+		 * Thread creation order will impact per-thread latency
+		 * as it will affect the order to acquire the hb spinlock.
+		 * For now let the scheduler decide.
+		 */
+		ret = pthread_create(&td[i].worker, &thread_attr,
+				     waking_workerfn, (void *)&td[i]);
+		if (ret) {
+			errno = ret;
+			err(EXIT_FAILURE, "pthread_create");
+		}
+	}
+
+	pthread_barrier_wait(&barrier);
+
+	for (i = 0; i < params.nwakes; i++) {
+		ret = pthread_join(td[i].worker, NULL);
+		if (ret) {
+			errno = ret;
+			err(EXIT_FAILURE, "pthread_join");
+		}
+	}
+
+	pthread_barrier_destroy(&barrier);
+	pthread_attr_destroy(&thread_attr);
+}
+
+/*
+ * Blocked thread: check in with the parent, wait to be released, then sleep
+ * on the shared futex until a waker thread wakes it up.
+ */
+static void *blocked_workerfn(void *arg __maybe_unused)
+{
+	mutex_lock(&thread_lock);
+	threads_starting--;
+	if (!threads_starting)
+		cond_signal(&thread_parent);
+	cond_wait(&thread_worker, &thread_lock);
+	mutex_unlock(&thread_lock);
+
+	while (1) { /* go back to sleep if a signal interrupted the wait */
+		/*
+		 * futex_wait() is a raw syscall wrapper: on failure it returns
+		 * -1 with the reason in errno, it never returns EINTR itself.
+		 */
+		if (!futex_wait(&futex, 0, NULL, futex_flag) || errno != EINTR)
+			break;
+	}
+
+	pthread_exit(NULL);
+	return NULL;
+}
+
+/*
+ * Create params.nthreads blocked threads and store their handles in @w.
+ *
+ * Thread i is pinned, through its thread attributes, to entry
+ * (i % perf_cpu_map__nr(@cpu)) of @cpu. threads_starting is reset first so
+ * that the caller can wait until every thread has checked in. Any failure
+ * terminates the process.
+ */
+static void block_threads(pthread_t *w, struct perf_cpu_map *cpu)
+{
+	cpu_set_t *cpuset;
+	unsigned int i;
+	int nrcpus = cpu__max_cpu().cpu;
+	size_t size;
+
+	threads_starting = params.nthreads;
+
+	cpuset = CPU_ALLOC(nrcpus);
+	BUG_ON(!cpuset);
+	size = CPU_ALLOC_SIZE(nrcpus);
+
+	/* create and block all threads */
+	for (i = 0; i < params.nthreads; i++) {
+		pthread_attr_t thread_attr;
+		int ret;
+
+		pthread_attr_init(&thread_attr);
+		CPU_ZERO_S(size, cpuset);
+		CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset);
+
+		/*
+		 * pthread functions return the error number instead of setting
+		 * errno, so store it there (after CPU_FREE()) for err() to report.
+		 */
+		ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset);
+		if (ret) {
+			CPU_FREE(cpuset);
+			errno = ret;
+			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+		}
+
+		ret = pthread_create(&w[i], &thread_attr, blocked_workerfn, NULL);
+		if (ret) {
+			CPU_FREE(cpuset);
+			errno = ret;
+			err(EXIT_FAILURE, "pthread_create");
+		}
+		pthread_attr_destroy(&thread_attr);
+	}
+	CPU_FREE(cpuset);
+}
+
+/*
+ * Print the result of a single run: the latency and wake count averaged over
+ * the params.nwakes entries of @waking_worker. @run_num is zero based and is
+ * printed one based.
+ */
+static void print_run(struct thread_data *waking_worker, unsigned int run_num)
+{
+	struct stats latency, woken;
+	double avg_usec, dev_usec;
+	unsigned int k, avg_woken;
+
+	init_stats(&woken);
+	init_stats(&latency);
+
+	for (k = 0; k < params.nwakes; k++) {
+		struct thread_data *w = &waking_worker[k];
+
+		update_stats(&latency, w->runtime.tv_usec);
+		update_stats(&woken, w->nwoken);
+	}
+
+	avg_usec = avg_stats(&latency);
+	dev_usec = stddev_stats(&latency);
+	avg_woken = avg_stats(&woken);	/* truncated to a whole number */
+
+	printf("[Run %d]: Avg per-thread latency (waking %d/%d threads) "
+	       "in %.4f ms (+-%.2f%%)\n", run_num + 1, avg_woken,
+	       params.nthreads, avg_usec / USEC_PER_MSEC,
+	       rel_stddev_stats(dev_usec, avg_usec));
+}
+
+/*
+ * Print the latency and wake count averaged over every waker of every run,
+ * followed by the futex hash mode.
+ */
+static void print_summary(void)
+{
+	const double avg_usec = avg_stats(&waketime_stats);
+	const double dev_usec = stddev_stats(&waketime_stats);
+	/* the average count is truncated to a whole number of threads */
+	const unsigned int avg_woken = avg_stats(&wakeup_stats);
+
+	printf("Avg per-thread latency (waking %d/%d threads) in %.4f ms (+-%.2f%%)\n",
+	       avg_woken, params.nthreads, avg_usec / USEC_PER_MSEC,
+	       rel_stddev_stats(dev_usec, avg_usec));
+	futex_print_nbuckets(&params);
+}
+
+
+/* Fold the results of one run's params.nwakes wakers into the global stats. */
+static void do_run_stats(struct thread_data *waking_worker)
+{
+	struct thread_data *w = waking_worker;
+	unsigned int left;
+
+	for (left = params.nwakes; left; left--, w++) {
+		update_stats(&waketime_stats, w->runtime.tv_usec);
+		update_stats(&wakeup_stats, w->nwoken);
+	}
+}
+
+static void toggle_done(int sig __maybe_unused,
+			siginfo_t *info __maybe_unused,
+			void *uc __maybe_unused)
+{
+	done = true;	/* installed for SIGINT: only raises the flag checked by the repeat loop */
+}
+
+/*
+ * Entry point of "perf bench futex wake-parallel".
+ *
+ * Each of the bench_repeat runs blocks params.nthreads threads on one futex
+ * and lets params.nwakes waker threads wake them in parallel, every waker
+ * taking an equal share. Per-waker latencies are accumulated and reported by
+ * print_summary(). SIGINT stops the benchmark after the current run.
+ *
+ * Returns 0; every failure terminates the process instead.
+ */
+int bench_futex_wake_parallel(int argc, const char **argv)
+{
+	int ret = 0;
+	unsigned int i, j;
+	struct sigaction act;
+	struct thread_data *waking_worker;
+	struct perf_cpu_map *cpu;
+
+	argc = parse_options(argc, argv, options,
+			     bench_futex_wake_parallel_usage, 0);
+	if (argc) {
+		usage_with_options(bench_futex_wake_parallel_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	memset(&act, 0, sizeof(act));
+	sigfillset(&act.sa_mask);
+	act.sa_sigaction = toggle_done;
+	sigaction(SIGINT, &act, NULL);
+
+	if (params.mlockall) {
+		if (mlockall(MCL_CURRENT | MCL_FUTURE))
+			err(EXIT_FAILURE, "mlockall");
+	}
+
+	cpu = perf_cpu_map__new_online_cpus();
+	if (!cpu)
+		err(EXIT_FAILURE, "cpu_map__new");
+
+	/* default: one blocked thread per entry of the online CPU map */
+	if (!params.nthreads)
+		params.nthreads = perf_cpu_map__nr(cpu);
+
+	/* some sanity checks */
+	if (params.nwakes > params.nthreads ||
+	    !params.nwakes)
+		params.nwakes = params.nthreads;
+
+	if (params.nthreads % params.nwakes)
+		errx(EXIT_FAILURE, "Must be perfectly divisible");
+	/*
+	 * Each thread will wakeup nwakes tasks in
+	 * a single futex_wake call.
+	 */
+	nwakes = params.nthreads/params.nwakes;
+
+	blocked_worker = calloc(params.nthreads, sizeof(*blocked_worker));
+	if (!blocked_worker)
+		err(EXIT_FAILURE, "calloc");
+
+	if (!params.fshared)
+		futex_flag = FUTEX_PRIVATE_FLAG;
+
+	futex_set_nbuckets_param(&params);
+
+	printf("Run summary [PID %d]: blocking on %d threads (at [%s] "
+	       "futex %p), %d threads waking up %d at a time.\n\n",
+	       getpid(), params.nthreads, params.fshared ? "shared":"private",
+	       &futex, params.nwakes, nwakes);
+
+	init_stats(&wakeup_stats);
+	init_stats(&waketime_stats);
+
+	mutex_init(&thread_lock);
+	cond_init(&thread_parent);
+	cond_init(&thread_worker);
+
+	for (j = 0; j < bench_repeat && !done; j++) {
+		/* fresh, zeroed per-waker results for this run */
+		waking_worker = calloc(params.nwakes, sizeof(*waking_worker));
+		if (!waking_worker)
+			err(EXIT_FAILURE, "calloc");
+
+		/* create, launch & block all threads */
+		block_threads(blocked_worker, cpu);
+
+		/* make sure all threads are already blocked */
+		mutex_lock(&thread_lock);
+		while (threads_starting)
+			cond_wait(&thread_parent, &thread_lock);
+		cond_broadcast(&thread_worker);
+		mutex_unlock(&thread_lock);
+
+		/* NOTE(review): presumably lets the threads reach futex_wait() - confirm */
+		usleep(200000);
+
+		/* Ok, all threads are patiently blocked, start waking folks up */
+		wakeup_threads(waking_worker);
+
+		for (i = 0; i < params.nthreads; i++) {
+			ret = pthread_join(blocked_worker[i], NULL);
+			if (ret) {
+				/* pthread_join() returns the error, errno is not set */
+				errno = ret;
+				err(EXIT_FAILURE, "pthread_join");
+			}
+		}
+
+		do_run_stats(waking_worker);
+		if (!params.silent)
+			print_run(waking_worker, j);
+
+		free(waking_worker);
+	}
+
+	/* cleanup & report results */
+	cond_destroy(&thread_parent);
+	cond_destroy(&thread_worker);
+	mutex_destroy(&thread_lock);
+
+	print_summary();
+
+	free(blocked_worker);
+	perf_cpu_map__put(cpu);
+	return ret;
+}
+#endif /* HAVE_PTHREAD_BARRIER */
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
new file mode 100644
index 000000000000..a31fc1563862
--- /dev/null
+++ b/tools/perf/bench/futex-wake.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2013  Davidlohr Bueso <davidlohr@hp.com>
+ *
+ * futex-wake: Block a bunch of threads on a futex and wake'em up, N at a time.
+ *
+ * This program is particularly useful to measure the latency of nthread wakeups
+ * in non-error situations:  all waiters are queued and all wake calls wakeup
+ * one or more tasks, and thus the waitqueue is never empty.
+ */
+
+/* For the CLR_() macros */
+#include <string.h>
+#include <pthread.h>
+
+#include <signal.h>
+#include "../util/mutex.h"
+#include "../util/stat.h"
+#include <subcmd/parse-options.h>
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+#include <errno.h>
+#include <perf/cpumap.h>
+#include "bench.h"
+#include "futex.h"
+
+#include <err.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+
+/* all threads will block on the same futex */
+static u_int32_t futex1 = 0;
+
+static pthread_t *worker;	/* handles of the blocked threads, params.nthreads entries */
+static bool done = false;	/* raised by toggle_done(); stops the repeat loop */
+static struct mutex thread_lock;	/* held around threads_starting updates and the condition waits */
+static struct cond thread_parent, thread_worker;	/* parent waits on the first, workers on the second */
+static struct stats waketime_stats, wakeup_stats;	/* per-run wake time and wake count */
+static unsigned int threads_starting;	/* workers that have not checked in yet */
+static int futex_flag = 0;	/* FUTEX_PRIVATE_FLAG unless --shared was given */
+
+static struct bench_futex_parameters params = {
+	.nbuckets = -1,	/* NOTE(review): bench_futex_wake() in this file never calls futex_set_nbuckets_param() - confirm intended */
+	/*
+	 * How many wakeups to do at a time.
+	 * Default to 1 in order to make the kernel work more.
+	 */
+	.nwakes  = 1,
+};
+
+static const struct option options[] = {	/* handed to parse_options() by bench_futex_wake() */
+	OPT_INTEGER( 'b', "buckets", &params.nbuckets, "Specify amount of hash buckets"),
+	OPT_UINTEGER('t', "threads", &params.nthreads, "Specify amount of threads"),
+	OPT_UINTEGER('w', "nwakes",  &params.nwakes, "Specify amount of threads to wake at once"),
+	OPT_BOOLEAN( 's', "silent",  &params.silent, "Silent mode: do not display data/details"),
+	OPT_BOOLEAN( 'S', "shared",  &params.fshared, "Use shared futexes instead of private ones"),
+	OPT_BOOLEAN( 'm', "mlockall", &params.mlockall, "Lock all current and future memory"),
+
+	OPT_END()
+};
+
+static const char * const bench_futex_wake_usage[] = {	/* shown by usage_with_options() */
+	"perf bench futex wake <options>",
+	NULL
+};
+
+/*
+ * Blocked thread: check in with the parent, wait to be released, then sleep
+ * on futex1 until the parent wakes it up.
+ */
+static void *workerfn(void *arg __maybe_unused)
+{
+	mutex_lock(&thread_lock);
+	threads_starting--;
+	if (!threads_starting)
+		cond_signal(&thread_parent);
+	cond_wait(&thread_worker, &thread_lock);
+	mutex_unlock(&thread_lock);
+
+	while (1) {
+		/*
+		 * futex_wait() is a raw syscall wrapper: on failure it returns
+		 * -1 with the reason in errno, it never returns EINTR itself.
+		 * Only go back to sleep when a signal interrupted the wait.
+		 */
+		if (!futex_wait(&futex1, 0, NULL, futex_flag) || errno != EINTR)
+			break;
+	}
+
+	pthread_exit(NULL);
+	return NULL;
+}
+
+/*
+ * Print the averages accumulated over all runs (threads woken, time taken and
+ * its relative standard deviation), followed by the futex hash mode.
+ */
+static void print_summary(void)
+{
+	const double avg_usec = avg_stats(&waketime_stats);
+	const double dev_usec = stddev_stats(&waketime_stats);
+	/* the average count is truncated to a whole number of threads */
+	const unsigned int avg_woken = avg_stats(&wakeup_stats);
+	const double avg_msec = avg_usec / USEC_PER_MSEC;
+
+	printf("Wokeup %d of %d threads in %.4f ms (+-%.2f%%)\n",
+	       avg_woken, params.nthreads, avg_msec,
+	       rel_stddev_stats(dev_usec, avg_usec));
+	futex_print_nbuckets(&params);
+}
+
+/*
+ * Create params.nthreads worker threads and store their handles in @w.
+ *
+ * Worker i is pinned, through its thread attributes, to entry
+ * (i % perf_cpu_map__nr(@cpu)) of @cpu. threads_starting is reset first so
+ * that the caller can wait until every worker has checked in. Any failure
+ * terminates the process.
+ */
+static void block_threads(pthread_t *w, struct perf_cpu_map *cpu)
+{
+	cpu_set_t *cpuset;
+	unsigned int i;
+	size_t size;
+	int nrcpus = cpu__max_cpu().cpu;
+
+	threads_starting = params.nthreads;
+
+	cpuset = CPU_ALLOC(nrcpus);
+	BUG_ON(!cpuset);
+	size = CPU_ALLOC_SIZE(nrcpus);
+
+	/* create and block all threads */
+	for (i = 0; i < params.nthreads; i++) {
+		pthread_attr_t thread_attr;
+		int ret;
+
+		pthread_attr_init(&thread_attr);
+		CPU_ZERO_S(size, cpuset);
+		CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset);
+
+		/*
+		 * pthread functions return the error number instead of setting
+		 * errno, so store it there (after CPU_FREE()) for err() to report.
+		 */
+		ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset);
+		if (ret) {
+			CPU_FREE(cpuset);
+			errno = ret;
+			err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+		}
+
+		ret = pthread_create(&w[i], &thread_attr, workerfn, NULL);
+		if (ret) {
+			CPU_FREE(cpuset);
+			errno = ret;
+			err(EXIT_FAILURE, "pthread_create");
+		}
+		pthread_attr_destroy(&thread_attr);
+	}
+	CPU_FREE(cpuset);
+}
+
+static void toggle_done(int sig __maybe_unused,
+			siginfo_t *info __maybe_unused,
+			void *uc __maybe_unused)
+{
+	done = true;	/* installed for SIGINT: only raises the flag checked by the repeat loop */
+}
+
+/*
+ * Entry point of "perf bench futex wake".
+ *
+ * Each of the bench_repeat runs blocks params.nthreads workers on futex1 and
+ * measures how long it takes to wake all of them, params.nwakes per
+ * futex_wake() call. Per-run results are accumulated and reported by
+ * print_summary(). SIGINT stops the benchmark after the current run.
+ *
+ * Returns 0; every failure terminates the process instead.
+ */
+int bench_futex_wake(int argc, const char **argv)
+{
+	int ret = 0;
+	unsigned int i, j;
+	struct sigaction act;
+	struct perf_cpu_map *cpu;
+
+	argc = parse_options(argc, argv, options, bench_futex_wake_usage, 0);
+	if (argc) {
+		usage_with_options(bench_futex_wake_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	cpu = perf_cpu_map__new_online_cpus();
+	if (!cpu)
+		err(EXIT_FAILURE, "cpu_map__new");
+
+	memset(&act, 0, sizeof(act));
+	sigfillset(&act.sa_mask);
+	act.sa_sigaction = toggle_done;
+	sigaction(SIGINT, &act, NULL);
+
+	if (params.mlockall) {
+		if (mlockall(MCL_CURRENT | MCL_FUTURE))
+			err(EXIT_FAILURE, "mlockall");
+	}
+
+	/* default: one thread per entry of the online CPU map */
+	if (!params.nthreads)
+		params.nthreads = perf_cpu_map__nr(cpu);
+
+	worker = calloc(params.nthreads, sizeof(*worker));
+	if (!worker)
+		err(EXIT_FAILURE, "calloc");
+
+	if (!params.fshared)
+		futex_flag = FUTEX_PRIVATE_FLAG;
+
+	printf("Run summary [PID %d]: blocking on %d threads (at [%s] futex %p), "
+	       "waking up %d at a time.\n\n",
+	       getpid(), params.nthreads, params.fshared ? "shared":"private",
+	       &futex1, params.nwakes);
+
+	init_stats(&wakeup_stats);
+	init_stats(&waketime_stats);
+	mutex_init(&thread_lock);
+	cond_init(&thread_parent);
+	cond_init(&thread_worker);
+
+	for (j = 0; j < bench_repeat && !done; j++) {
+		unsigned int nwoken = 0;
+		struct timeval start, end, runtime;
+
+		/* create, launch & block all threads */
+		block_threads(worker, cpu);
+
+		/* make sure all threads are already blocked */
+		mutex_lock(&thread_lock);
+		while (threads_starting)
+			cond_wait(&thread_parent, &thread_lock);
+		cond_broadcast(&thread_worker);
+		mutex_unlock(&thread_lock);
+
+		/* NOTE(review): presumably lets the workers reach futex_wait() - confirm */
+		usleep(100000);
+
+		/* Ok, all threads are patiently blocked, start waking folks up */
+		gettimeofday(&start, NULL);
+		while (nwoken != params.nthreads)
+			nwoken += futex_wake(&futex1,
+					     params.nwakes, futex_flag);
+		gettimeofday(&end, NULL);
+		timersub(&end, &start, &runtime);
+
+		update_stats(&wakeup_stats, nwoken);
+		/* NOTE(review): only tv_usec is recorded, a run of 1s or more is under-reported */
+		update_stats(&waketime_stats, runtime.tv_usec);
+
+		if (!params.silent) {
+			printf("[Run %d]: Wokeup %d of %d threads in %.4f ms\n",
+			       j + 1, nwoken, params.nthreads,
+			       runtime.tv_usec / (double)USEC_PER_MSEC);
+		}
+
+		for (i = 0; i < params.nthreads; i++) {
+			ret = pthread_join(worker[i], NULL);
+			if (ret) {
+				/* pthread_join() returns the error, errno is not set */
+				errno = ret;
+				err(EXIT_FAILURE, "pthread_join");
+			}
+		}
+
+	}
+
+	/* cleanup & report results */
+	cond_destroy(&thread_parent);
+	cond_destroy(&thread_worker);
+	mutex_destroy(&thread_lock);
+
+	print_summary();
+
+	free(worker);
+	perf_cpu_map__put(cpu);
+	return ret;
+}
diff --git a/tools/perf/bench/futex.c b/tools/perf/bench/futex.c
new file mode 100644
index 000000000000..1968c9d00b5b
--- /dev/null
+++ b/tools/perf/bench/futex.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+
+#include "futex.h"
+
+#ifndef PR_FUTEX_HASH	/* fallback values, used only when the included headers do not define them */
+#define PR_FUTEX_HASH                   78
+# define PR_FUTEX_HASH_SET_SLOTS        1
+# define PR_FUTEX_HASH_GET_SLOTS        2
+#endif // PR_FUTEX_HASH
+
+/*
+ * Apply the requested number of futex hash buckets.
+ *
+ * Passes params->nbuckets to prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS).
+ * A negative value means that nothing was requested and nothing is changed.
+ * A failing prctl() terminates the process.
+ */
+void futex_set_nbuckets_param(struct bench_futex_parameters *params)
+{
+	int ret;
+
+	if (params->nbuckets < 0)
+		return;
+
+	ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, params->nbuckets, 0);
+	if (ret) {
+		/* printf() may modify errno: keep the prctl() error for err() */
+		int saved_errno = errno;
+
+		printf("Requesting %d hash buckets failed: %d/%m\n",
+		       params->nbuckets, ret);
+		errno = saved_errno;
+		err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)");
+	}
+}
+
+/*
+ * Print one line describing the futex hash currently in use.
+ *
+ * The bucket count is queried with prctl(PR_FUTEX_HASH,
+ * PR_FUTEX_HASH_GET_SLOTS). If a specific count was requested
+ * (params->nbuckets >= 0) and the query fails or reports a different count,
+ * the process is terminated. Without a request, a result <= 0 is reported
+ * as the global hash.
+ */
+void futex_print_nbuckets(struct bench_futex_parameters *params)
+{
+	char *futex_hash_mode;
+	int ret;
+
+	ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS);
+	if (params->nbuckets >= 0) {
+		if (ret != params->nbuckets) {
+			if (ret < 0) {
+				/* printf() may modify errno: keep the prctl() error */
+				int saved_errno = errno;
+
+				printf("Can't query number of buckets: %m\n");
+				errno = saved_errno;
+				err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)");
+			}
+			printf("Requested number of hash buckets is not currently used.\n");
+			printf("Requested: %d in usage: %d\n", params->nbuckets, ret);
+			/* prctl() succeeded here, errno carries no information: use errx() */
+			errx(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)");
+		}
+		if (params->nbuckets == 0)
+			ret = asprintf(&futex_hash_mode, "Futex hashing: global hash");
+		else
+			ret = asprintf(&futex_hash_mode, "Futex hashing: %d hash buckets",
+				       params->nbuckets);
+	} else {
+		if (ret <= 0) {
+			ret = asprintf(&futex_hash_mode, "Futex hashing: global hash");
+		} else {
+			ret = asprintf(&futex_hash_mode, "Futex hashing: auto resized to %d buckets",
+				       ret);
+		}
+	}
+	/* from here on ret is the asprintf() result */
+	if (ret < 0)
+		err(EXIT_FAILURE, "ENOMEM, futex_hash_mode");
+	printf("%s\n", futex_hash_mode);
+	free(futex_hash_mode);
+}
diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h
new file mode 100644
index 000000000000..fcb72d682cf8
--- /dev/null
+++ b/tools/perf/bench/futex.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Glibc independent futex library for testing kernel functionality.
+ * Shamelessly stolen from Darren Hart <dvhltc@us.ibm.com>
+ *    http://git.kernel.org/cgit/linux/kernel/git/dvhart/futextest.git/
+ */
+
+#ifndef _FUTEX_H
+#define _FUTEX_H
+
+#include <stdbool.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <linux/futex.h>
+
+struct bench_futex_parameters {	/* command line settings shared by the futex benchmarks */
+	bool silent;	/* do not print per-run / per-thread details */
+	bool fshared;	/* shared futexes; otherwise FUTEX_PRIVATE_FLAG is used */
+	bool mlockall;	/* call mlockall(MCL_CURRENT | MCL_FUTURE) before running */
+	bool multi; /* lock-pi */
+	bool pi; /* requeue-pi */
+	bool broadcast; /* requeue */
+	unsigned int runtime; /* seconds*/
+	unsigned int nthreads;	/* number of threads; 0 selects one per online CPU */
+	unsigned int nfutexes;
+	unsigned int nwakes;	/* wake: tasks woken per futex_wake() call; wake-parallel: number of waker threads */
+	unsigned int nrequeue;	/* requeue: tasks requeued per call */
+	int nbuckets;	/* requested futex hash buckets; negative means no request */
+};
+
+/**
+ * futex_syscall() - SYS_futex syscall wrapper
+ * @uaddr:	address of first futex
+ * @op:		futex op code
+ * @val:	typically expected value of uaddr, but varies by op
+ * @timeout:	typically an absolute struct timespec (except where noted
+ *		otherwise). Overloaded by some ops
+ * @uaddr2:	address of second futex for some ops
+ * @val3:	varies by op
+ * @opflags:	flags to be bitwise OR'd with op, such as FUTEX_PRIVATE_FLAG
+ *
+ * Every futex op wrapper below ends up here (the requeue ones through
+ * futex_syscall_nr_requeue()). It can also be called directly for misuse and
+ * abuse testing, but the specific op wrappers are generally the better choice.
+ *
+ * The argument descriptions above are the defaults for all like-named
+ * arguments of the following wrappers, except where noted below.
+ *
+ * Return: whatever syscall() returns.
+ */
+static inline int
+futex_syscall(volatile u_int32_t *uaddr, int op, u_int32_t val, struct timespec *timeout,
+	      volatile u_int32_t *uaddr2, int val3, int opflags)
+{
+	const int futex_op = op | opflags;
+
+	return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr2, val3);
+}
+
+/*
+ * Variant of futex_syscall() for the requeue ops: @nr_requeue is passed in the
+ * syscall argument slot that futex_syscall() uses for the timeout.
+ */
+static inline int
+futex_syscall_nr_requeue(volatile u_int32_t *uaddr, int op, u_int32_t val, int nr_requeue,
+			 volatile u_int32_t *uaddr2, int val3, int opflags)
+{
+	const int futex_op = op | opflags;
+
+	return syscall(SYS_futex, uaddr, futex_op, val, nr_requeue, uaddr2, val3);
+}
+
+/**
+ * futex_wait() - block on uaddr with optional timeout
+ * @timeout:	relative timeout
+ *
+ * Issues FUTEX_WAIT with no second futex and val3 set to 0.
+ */
+static inline int
+futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags)
+{
+	const int op = FUTEX_WAIT;
+
+	return futex_syscall(uaddr, op, val, timeout, NULL, 0, opflags);
+}
+
+/**
+ * futex_wake() - wake one or more tasks blocked on uaddr
+ * @nr_wake:	wake up to this many tasks
+ *
+ * Issues FUTEX_WAKE; @nr_wake travels in the val argument of the syscall.
+ */
+static inline int
+futex_wake(u_int32_t *uaddr, int nr_wake, int opflags)
+{
+	const int op = FUTEX_WAKE;
+
+	return futex_syscall(uaddr, op, nr_wake, NULL, NULL, 0, opflags);
+}
+
+/**
+ * futex_lock_pi() - block on uaddr as a PI mutex
+ *
+ * Issues FUTEX_LOCK_PI with val and val3 set to 0 and no second futex.
+ */
+static inline int
+futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int opflags)
+{
+	const int op = FUTEX_LOCK_PI;
+
+	return futex_syscall(uaddr, op, 0, timeout, NULL, 0, opflags);
+}
+
+/**
+ * futex_unlock_pi() - release uaddr as a PI mutex, waking the top waiter
+ *
+ * Issues FUTEX_UNLOCK_PI with every other syscall argument zero or NULL.
+ */
+static inline int
+futex_unlock_pi(u_int32_t *uaddr, int opflags)
+{
+	const int op = FUTEX_UNLOCK_PI;
+
+	return futex_syscall(uaddr, op, 0, NULL, NULL, 0, opflags);
+}
+
+/**
+ * futex_cmp_requeue() - requeue tasks from uaddr to uaddr2
+ * @nr_wake:	wake up to this many tasks
+ * @nr_requeue:	requeue up to this many tasks
+ *
+ * Issues FUTEX_CMP_REQUEUE. Note the argument shuffle: @nr_wake is passed in
+ * the val slot of the syscall and @val is passed as val3.
+ */
+static inline int
+futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wake,
+		 int nr_requeue, int opflags)
+{
+	const int op = FUTEX_CMP_REQUEUE;
+
+	return futex_syscall_nr_requeue(uaddr, op, nr_wake, nr_requeue, uaddr2,
+					val, opflags);
+}
+
+/**
+ * futex_wait_requeue_pi() - block on uaddr and prepare to requeue to uaddr2
+ * @uaddr:	non-PI futex source
+ * @uaddr2:	PI futex target
+ *
+ * First half of the requeue_pi mechanism; it is meant to be paired with
+ * futex_cmp_requeue_pi(). Issues FUTEX_WAIT_REQUEUE_PI with val3 set to 0.
+ */
+static inline int
+futex_wait_requeue_pi(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2,
+		      struct timespec *timeout, int opflags)
+{
+	const int op = FUTEX_WAIT_REQUEUE_PI;
+
+	return futex_syscall(uaddr, op, val, timeout, uaddr2, 0, opflags);
+}
+
+/**
+ * futex_cmp_requeue_pi() - requeue tasks from uaddr to uaddr2
+ * @uaddr:	non-PI futex source
+ * @uaddr2:	PI futex target
+ * @nr_requeue:	requeue up to this many tasks
+ *
+ * Second half of the requeue_pi mechanism; it is meant to be paired with
+ * futex_wait_requeue_pi(). The first waiter is always awoken: the wake
+ * count is fixed to 1, and @val is passed as val3 of the syscall.
+ */
+static inline int
+futex_cmp_requeue_pi(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2,
+		     int nr_requeue, int opflags)
+{
+	const int op = FUTEX_CMP_REQUEUE_PI;
+	const int nr_wake = 1;
+
+	return futex_syscall_nr_requeue(uaddr, op, nr_wake, nr_requeue, uaddr2,
+					val, opflags);
+}
+
+void futex_set_nbuckets_param(struct bench_futex_parameters *params);
+void futex_print_nbuckets(struct bench_futex_parameters *params);
+
+#endif /* _FUTEX_H */
diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c
new file mode 100644
index 000000000000..aad572a78d7f
--- /dev/null
+++ b/tools/perf/bench/inject-buildid.c
@@ -0,0 +1,486 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdlib.h>
+#include <stddef.h>
+#include <ftw.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <linux/kernel.h>
+#include <linux/time64.h>
+#include <linux/list.h>
+#include <linux/err.h>
+#include <linux/zalloc.h>
+#include <internal/lib.h>
+#include <subcmd/parse-options.h>
+
+#include "bench.h"
+#include "util/data.h"
+#include "util/stat.h"
+#include "util/debug.h"
+#include "util/symbol.h"
+#include "util/session.h"
+#include "util/build-id.h"
+#include "util/sample.h"
+#include "util/synthetic-events.h"
+
+#define MMAP_DEV_MAJOR  8	/* device major stored in every synthesized MMAP2 event */
+#define DSO_MMAP_RATIO  4	/* collect this many times more DSOs than nr_mmaps */
+
+static unsigned int iterations = 100;	/* -i: runs per benchmark variant */
+static unsigned int nr_mmaps   = 100;	/* -m: MMAP2 events per iteration */
+static unsigned int nr_samples = 100;  /* samples per mmap */
+
+static u64 bench_sample_type;	/* sample_type used for the attr and every sample */
+static u16 bench_id_hdr_size;	/* bytes of sample id header added to FORK/MMAP2 events */
+
+struct bench_data {	/* state of one injection run */
+	int			pid;		/* fork() result: the child running perf inject */
+	int			input_pipe[2];	/* parent writes [1], child reads it as stdin */
+	int			output_pipe[2];	/* child's stdout is [1], data_reader() drains [0] */
+	pthread_t		th;		/* the data_reader() thread */
+};
+
+struct bench_dso {	/* one file with a build-id, found by add_dso() */
+	struct list_head	list;	/* NOTE(review): not used anywhere in this file */
+	char			*name;	/* realpath() result, freed in release_dso() */
+	int			ino;	/* index in dsos[], doubles as the fake inode number */
+};
+
+static int nr_dsos;		/* number of valid entries in dsos[] */
+static struct bench_dso *dsos;	/* array allocated by collect_dso() */
+
+extern int main(int argc, const char **argv);	/* re-entered by the forked child in setup_injection() */
+
+static const struct option options[] = {
+	OPT_UINTEGER('i', "iterations", &iterations,
+		     "Number of iterations used to compute average (default: 100)"),
+	OPT_UINTEGER('m', "nr-mmaps", &nr_mmaps,
+		     "Number of mmap events for each iteration (default: 100)"),
+	OPT_UINTEGER('n', "nr-samples", &nr_samples,
+		     "Number of sample events per mmap event (default: 100)"),
+	OPT_INCR('v', "verbose", &verbose,
+		 "be more verbose (show iteration count, DSO name, etc)"),
+	OPT_END()
+};
+
+static const char *const bench_usage[] = {
+	"perf bench internals inject-build-id <options>",
+	NULL
+};
+
+/*
+ * nftw() callback for collect_dso(): stores the given file in the dsos[]
+ * array if it contains a build-id.  Returns 1 to stop the walk once
+ * DSO_MMAP_RATIO * nr_mmaps entries are stored (4 times more than we
+ * need for MMAP2 events), or -1 if realpath() fails.
+ */
+static int add_dso(const char *fpath, const struct stat *sb __maybe_unused,
+		   int typeflag, struct FTW *ftwbuf __maybe_unused)
+{
+	struct bench_dso *dso = &dsos[nr_dsos];
+	struct build_id bid = { .size = 0, };
+
+	/* never write past the end of dsos[], e.g. when nr_mmaps is 0 */
+	if ((unsigned int)nr_dsos >= DSO_MMAP_RATIO * nr_mmaps)
+		return 1;
+
+	if (typeflag == FTW_D || typeflag == FTW_SL ||
+	    filename__read_build_id(fpath, &bid) < 0)
+		return 0;
+
+	dso->name = realpath(fpath, NULL);
+	if (dso->name == NULL)
+		return -1;
+
+	dso->ino = nr_dsos++;
+	pr_debug2("  Adding DSO: %s\n", fpath);
+
+	/* stop if we collected enough DSOs */
+	return (unsigned int)nr_dsos >= DSO_MMAP_RATIO * nr_mmaps;
+}
+
+/* Walk /usr/lib/ and let add_dso() record files with a build-id in dsos[] */
+static void collect_dso(void)
+{
+	dsos = calloc(nr_mmaps * DSO_MMAP_RATIO, sizeof(*dsos));
+	if (!dsos) {
+		printf("  Memory allocation failed\n");
+		exit(1);
+	}
+
+	/* negative: nftw() itself failed or add_dso() returned -1 */
+	if (nftw("/usr/lib/", add_dso, 10, FTW_PHYS) >= 0)
+		pr_debug("  Collected %d DSOs\n", nr_dsos);
+}
+
+static void release_dso(void)
+{
+	int idx = 0;
+
+	/* the names came from realpath(); drop each one before the array */
+	while (idx < nr_dsos) {
+		zfree(&dsos[idx].name);
+		idx++;
+	}
+	free(dsos);
+}
+
+/* Fake per-DSO address (8192 bytes apart) shared by the mmap and sample events */
+static u64 dso_map_addr(struct bench_dso *dso)
+{
+	return dso->ino * 8192ULL + 0x400000ULL;
+}
+
+/* Send a PERF_RECORD_HEADER_ATTR record for the benchmark event to the child */
+static ssize_t synthesize_attr(struct bench_data *data)
+{
+	union perf_event event;
+
+	memset(&event, 0, sizeof(event.attr) + sizeof(u64));
+	event.header.size = sizeof(event.attr) + sizeof(u64);
+	event.header.type = PERF_RECORD_HEADER_ATTR;
+
+	event.attr.attr.sample_type = bench_sample_type;
+	event.attr.attr.sample_id_all = 1;
+	event.attr.attr.exclude_kernel = 1;
+	event.attr.attr.config = PERF_COUNT_SW_TASK_CLOCK;
+	event.attr.attr.type = PERF_TYPE_SOFTWARE;
+
+	return writen(data->input_pipe[1], &event, event.header.size);
+}
+
+/* Send a PERF_RECORD_FORK record for the child (pid and tid both data->pid) */
+static ssize_t synthesize_fork(struct bench_data *data)
+{
+	union perf_event event;
+
+	memset(&event, 0, sizeof(event.fork) + bench_id_hdr_size);
+	event.header.size = sizeof(event.fork) + bench_id_hdr_size;
+	event.header.misc = PERF_RECORD_MISC_FORK_EXEC;
+	event.header.type = PERF_RECORD_FORK;
+
+	event.fork.pid = data->pid;
+	event.fork.tid = data->pid;
+	event.fork.ppid = 1;
+	event.fork.ptid = 1;
+
+	return writen(data->input_pipe[1], &event, event.header.size);
+}
+
+static ssize_t synthesize_mmap(struct bench_data *data, struct bench_dso *dso, u64 timestamp)
+{
+	union perf_event event;
+	size_t len = offsetof(struct perf_record_mmap2, filename);
+	u64 *id_hdr_ptr = (void *)&event;	/* the event buffer viewed as u64 slots */
+	int ts_idx;
+
+	len += roundup(strlen(dso->name) + 1, 8) + bench_id_hdr_size;	/* padded name + id header */
+
+	memset(&event, 0, min(len, sizeof(event.mmap2)));
+
+	event.header.type = PERF_RECORD_MMAP2;
+	event.header.misc = PERF_RECORD_MISC_USER;
+	event.header.size = len;
+
+	event.mmap2.pid = data->pid;
+	event.mmap2.tid = data->pid;
+	event.mmap2.maj = MMAP_DEV_MAJOR;
+	event.mmap2.ino = dso->ino;
+
+	strcpy(event.mmap2.filename, dso->name);
+
+	event.mmap2.start = dso_map_addr(dso);
+	event.mmap2.len = 4096;
+	event.mmap2.prot = PROT_EXEC;
+
+	if (len > sizeof(event.mmap2)) {
+		/* the id header does not fit inside event.mmap2: write the mmap2 part first */
+		if (writen(data->input_pipe[1], &event, len - bench_id_hdr_size) < 0)
+			return -1;
+		/* reuse the start of the buffer as a zero-filled sample id header */
+		memset(id_hdr_ptr, 0, bench_id_hdr_size);
+		/* put timestamp in the second-to-last u64 of the id header */
+		ts_idx = (bench_id_hdr_size / sizeof(u64)) - 2;
+		id_hdr_ptr[ts_idx] = timestamp;
+		if (writen(data->input_pipe[1], id_hdr_ptr, bench_id_hdr_size) < 0)
+			return -1;
+
+		return len;
+	}
+
+	ts_idx = (len / sizeof(u64)) - 2;	/* second-to-last u64 of the whole record */
+	id_hdr_ptr[ts_idx] = timestamp;
+	return writen(data->input_pipe[1], &event, len);
+}
+
+/* Send one PERF_RECORD_SAMPLE whose ip is the fake address of @dso */
+static ssize_t synthesize_sample(struct bench_data *data, struct bench_dso *dso, u64 timestamp)
+{
+	union perf_event event;
+	struct perf_sample sample = {
+		.time = timestamp,
+		.ip = dso_map_addr(dso),
+		.pid = data->pid,
+		.tid = data->pid,
+	};
+
+	event.header.size = perf_event__sample_event_size(&sample, bench_sample_type, 0);
+	event.header.misc = PERF_RECORD_MISC_USER;
+	event.header.type = PERF_RECORD_SAMPLE;
+	perf_event__synthesize_sample(&event, bench_sample_type, 0, &sample);
+
+	return writen(data->input_pipe[1], &event, event.header.size);
+}
+
+/* Send a header-only PERF_RECORD_FINISHED_ROUND record to the child */
+static ssize_t synthesize_flush(struct bench_data *data)
+{
+	struct perf_event_header flush = {
+		.type = PERF_RECORD_FINISHED_ROUND,
+		.size = sizeof(flush),
+	};
+	return writen(data->input_pipe[1], &flush, flush.size);
+}
+
+static void *data_reader(void *arg)	/* pthread body started by setup_injection() */
+{
+	struct bench_data *data = arg;
+	char buf[8192];
+	int flag;
+	int n;
+
+	flag = fcntl(data->output_pipe[0], F_GETFL);
+	fcntl(data->output_pipe[0], F_SETFL, flag | O_NONBLOCK);	/* read() must not block */
+
+	/* drain and discard the child's output until EOF or a hard error */
+	while (true) {
+		n = read(data->output_pipe[0], buf, sizeof(buf));
+		if (n > 0)
+			continue;
+		if (n == 0)
+			break;
+
+		if (errno != EINTR && errno != EAGAIN)
+			break;
+
+		usleep(100);	/* EINTR/EAGAIN: wait briefly, then retry */
+	}
+
+	close(data->output_pipe[0]);
+	return NULL;
+}
+
+static int setup_injection(struct bench_data *data, bool build_id_all)
+{
+	int ready_pipe[2];	/* closed by the child to tell the parent it is ready */
+	int dev_null_fd;
+	char buf;
+
+	if (pipe(ready_pipe) < 0)
+		return -1;
+
+	if (pipe(data->input_pipe) < 0)
+		return -1;	/* NOTE(review): pipes created above stay open on this path */
+
+	if (pipe(data->output_pipe) < 0)
+		return -1;	/* NOTE(review): same here */
+
+	data->pid = fork();
+	if (data->pid < 0)
+		return -1;
+
+	if (data->pid == 0) {	/* child: turns into "perf inject -b" */
+		const char **inject_argv;
+		int inject_argc = 3;
+
+		close(data->input_pipe[1]);
+		close(data->output_pipe[0]);
+		close(ready_pipe[0]);
+
+		dup2(data->input_pipe[0], STDIN_FILENO);	/* events arrive on stdin */
+		close(data->input_pipe[0]);
+		dup2(data->output_pipe[1], STDOUT_FILENO);	/* injected stream leaves on stdout */
+		close(data->output_pipe[1]);
+
+		dev_null_fd = open("/dev/null", O_WRONLY);
+		if (dev_null_fd < 0)
+			exit(1);
+
+		dup2(dev_null_fd, STDERR_FILENO);	/* silence the child's stderr */
+
+		if (build_id_all)
+			inject_argc++;
+
+		inject_argv = calloc(inject_argc + 1, sizeof(*inject_argv));
+		if (inject_argv == NULL)
+			exit(1);
+
+		inject_argv[0] = strdup("perf");
+		inject_argv[1] = strdup("inject");
+		inject_argv[2] = strdup("-b");
+		if (build_id_all)
+			inject_argv[3] = strdup("--buildid-all");
+
+		/* signal that we're ready to go: the parent's read() below sees EOF */
+		close(ready_pipe[1]);
+
+		main(inject_argc, inject_argv);	/* run perf's own main() with the argv built above */
+
+		exit(0);
+	}
+
+	pthread_create(&data->th, NULL, data_reader, data);	/* NOTE(review): result not checked */
+
+	close(ready_pipe[1]);
+	close(data->input_pipe[0]);
+	close(data->output_pipe[1]);
+
+	/* wait for child ready */
+	if (read(ready_pipe[0], &buf, 1) < 0)
+		return -1;
+	close(ready_pipe[0]);
+
+	return 0;
+}
+
+static int inject_build_id(struct bench_data *data, u64 *max_rss)
+{
+	int status;
+	unsigned int i, k;
+	struct rusage rusage;
+
+	/* writing the pipe header lets the child start running */
+	if (perf_header__write_pipe(data->input_pipe[1]) < 0)
+		return -1;
+
+	if (synthesize_attr(data) < 0)
+		return -1;
+
+	if (synthesize_fork(data) < 0)
+		return -1;
+
+	for (i = 0; i < nr_mmaps; i++) {
+		int idx = rand() % nr_dsos;	/* caller has checked nr_dsos != 0 */
+		struct bench_dso *dso = &dsos[idx];
+		u64 timestamp = rand() % 1000000;
+
+		pr_debug2("   [%d] injecting: %s\n", i+1, dso->name);
+		if (synthesize_mmap(data, dso, timestamp) < 0)
+			return -1;
+
+		for (k = 0; k < nr_samples; k++) {
+			if (synthesize_sample(data, dso, timestamp + k * 1000) < 0)
+				return -1;
+		}
+
+		if ((i + 1) % 10 == 0) {	/* a FINISHED_ROUND after every 10 mmaps */
+			if (synthesize_flush(data) < 0)
+				return -1;
+		}
+	}
+
+	/* closing our write end lets the child finish */
+	close(data->input_pipe[1]);
+
+	wait4(data->pid, &status, 0, &rusage);	/* NOTE(review): return value not checked */
+	*max_rss = rusage.ru_maxrss;
+
+	pr_debug("   Child %d exited with %d\n", data->pid, status);
+
+	return 0;
+}
+
+static void do_inject_loop(struct bench_data *data, bool build_id_all)
+{
+	unsigned int i;
+	struct stats time_stats, mem_stats;	/* per-iteration runtime (usec) and child max RSS */
+	double time_average, time_stddev;
+	double mem_average, mem_stddev;
+
+	init_stats(&time_stats);
+	init_stats(&mem_stats);
+
+	pr_debug("  Build-id%s injection benchmark\n", build_id_all ? "-all" : "");
+
+	for (i = 0; i < iterations; i++) {
+		struct timeval start, end, diff;
+		u64 runtime_us, max_rss;
+
+		pr_debug("  Iteration #%d\n", i+1);
+
+		if (setup_injection(data, build_id_all) < 0) {
+			printf("  Build-id injection setup failed\n");
+			break;
+		}
+
+		gettimeofday(&start, NULL);	/* timed part: feeding events until the child exits */
+		if (inject_build_id(data, &max_rss) < 0) {
+			printf("  Build-id injection failed\n");
+			break;	/* NOTE(review): the reader thread is not joined on this path */
+		}
+
+		gettimeofday(&end, NULL);
+		timersub(&end, &start, &diff);
+		runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+		update_stats(&time_stats, runtime_us);
+		update_stats(&mem_stats, max_rss);
+
+		pthread_join(data->th, NULL);
+	}
+
+	time_average = avg_stats(&time_stats) / USEC_PER_MSEC;
+	time_stddev = stddev_stats(&time_stats) / USEC_PER_MSEC;
+	printf("  Average build-id%s injection took: %.3f msec (+- %.3f msec)\n",
+	       build_id_all ? "-all" : "", time_average, time_stddev);
+
+	/* for each mmap the child processes MMAP2 + BUILD_ID + nr_samples * SAMPLE */
+	time_average = avg_stats(&time_stats) / (nr_mmaps * (nr_samples + 2));
+	time_stddev = stddev_stats(&time_stats) / (nr_mmaps * (nr_samples + 2));
+	printf("  Average time per event: %.3f usec (+- %.3f usec)\n",
+		time_average, time_stddev);
+
+	mem_average = avg_stats(&mem_stats);
+	mem_stddev = stddev_stats(&mem_stats);
+	printf("  Average memory usage: %.0f KB (+- %.0f KB)\n",
+		mem_average, mem_stddev);
+}
+
+/* Collect DSOs once, then run the plain and the --buildid-all benchmark */
+static int do_inject_loops(struct bench_data *data)
+{
+	srand(time(NULL));
+	symbol__init(NULL);
+
+	bench_sample_type  = PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP;
+	bench_sample_type |= PERF_SAMPLE_TID | PERF_SAMPLE_TIME;
+	bench_id_hdr_size  = 32;
+
+	collect_dso();
+	if (nr_dsos == 0) {
+		printf("  Cannot collect DSOs for injection\n");
+		release_dso();	/* the (empty) dsos[] array was still allocated */
+		return -1;
+	}
+
+	do_inject_loop(data, false);
+	do_inject_loop(data, true);
+	release_dso();
+	return 0;
+}
+
+/* Entry point of the "perf bench internals inject-build-id" benchmark */
+int bench_inject_build_id(int argc, const char **argv)
+{
+	struct bench_data data;
+
+	/* parse_options() returned a non-zero argc: treat it as a usage error */
+	if (parse_options(argc, argv, options, bench_usage, 0)) {
+		usage_with_options(bench_usage, options);
+		exit(EXIT_FAILURE);
+	}
+	return do_inject_loops(&data);
+}
+
diff --git a/tools/perf/bench/kallsyms-parse.c b/tools/perf/bench/kallsyms-parse.c
new file mode 100644
index 000000000000..2b0d0f980ae9
--- /dev/null
+++ b/tools/perf/bench/kallsyms-parse.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Benchmark of /proc/kallsyms parsing.
+ *
+ * Copyright 2020 Google LLC.
+ */
+#include <stdlib.h>
+#include "bench.h"
+#include "../util/stat.h"
+#include <linux/time64.h>
+#include <subcmd/parse-options.h>
+#include <symbol/kallsyms.h>
+
+static unsigned int iterations = 100;	/* -i: number of timed kallsyms__parse() runs */
+
+static const struct option options[] = {
+	OPT_UINTEGER('i', "iterations", &iterations,
+		"Number of iterations used to compute average"),
+	OPT_END()
+};
+
+static const char *const bench_usage[] = {
+	"perf bench internals kallsyms-parse <options>",
+	NULL
+};
+
+static int bench_process_symbol(void *arg __maybe_unused,
+				const char *name __maybe_unused,
+				char type __maybe_unused,
+				u64 start __maybe_unused)
+{
+	return 0;	/* callback handed to kallsyms__parse(): ignores every symbol */
+}
+
+/* Time 'iterations' runs of kallsyms__parse() and print the mean and stddev */
+static int do_kallsyms_parse(void)
+{
+	struct stats elapsed_stats;
+	unsigned int run;
+
+	init_stats(&elapsed_stats);
+
+	for (run = 0; run < iterations; run++) {
+		struct timeval t0, t1, delta;
+		u64 elapsed_us;
+		int ret;
+
+		gettimeofday(&t0, NULL);
+		ret = kallsyms__parse("/proc/kallsyms", NULL, bench_process_symbol);
+		if (ret)
+			return ret;
+		gettimeofday(&t1, NULL);
+
+		timersub(&t1, &t0, &delta);
+		elapsed_us = delta.tv_sec * USEC_PER_SEC + delta.tv_usec;
+		update_stats(&elapsed_stats, elapsed_us);
+	}
+
+	/* samples were recorded in usec; report them in msec */
+	printf("  Average kallsyms__parse took: %.3f ms (+- %.3f ms)\n",
+	       avg_stats(&elapsed_stats) / USEC_PER_MSEC,
+	       stddev_stats(&elapsed_stats) / USEC_PER_MSEC);
+	return 0;
+}
+
+int bench_kallsyms_parse(int argc, const char **argv)
+{
+	/* parse_options() returned a non-zero argc: treat it as a usage error */
+	if (parse_options(argc, argv, options, bench_usage, 0)) {
+		usage_with_options(bench_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	return do_kallsyms_parse();
+}
diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c
new file mode 100644
index 000000000000..2908a3a796c9
--- /dev/null
+++ b/tools/perf/bench/mem-functions.c
@@ -0,0 +1,563 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * mem-functions.c
+ *
+ * Simple memcpy(), memset() and mmap() benchmarks
+ *
+ * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
+ */
+
+#include "debug.h"
+#include "../perf-sys.h"
+#include <subcmd/parse-options.h>
+#include "../util/header.h"
+#include "../util/cloexec.h"
+#include "../util/string2.h"
+#include "bench.h"
+#include "mem-memcpy-arch.h"
+#include "mem-memset-arch.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <linux/time64.h>
+#include <linux/log2.h>
+
+#define K 1024	/* unit step used by print_bps() */
+
+#define PAGE_SHIFT_4KB		12	/* log2 of the accepted -p page sizes */
+#define PAGE_SHIFT_2MB		21
+#define PAGE_SHIFT_1GB		30
+
+static const char	*size_str	= "1MB";	/* -s */
+static const char	*function_str	= "all";	/* -f */
+static const char	*page_size_str	= "4KB";	/* -p */
+static const char	*chunk_size_str	= "0";		/* -k; 0 means one chunk of the full size */
+static unsigned int	nr_loops	= 1;		/* -l */
+static bool		use_cycles;			/* -c: time with the cycles event */
+static int		cycles_fd;			/* set by init_cycles() */
+static unsigned int	seed;				/* -r of the mmap benchmark */
+
+static const struct option bench_common_options[] = {	/* parent of the two tables that follow/use it */
+	OPT_STRING('s', "size", &size_str, "1MB",
+		    "Specify the size of the memory buffers. "
+		    "Available units: B, KB, MB, GB and TB (case insensitive)"),
+
+	OPT_STRING('p', "page", &page_size_str, "4KB",
+		    "Specify page-size for mapping memory buffers. "
+		    "Available sizes: 4KB, 2MB, 1GB (case insensitive)"),
+
+	OPT_STRING('f', "function", &function_str, "all",
+		    "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"),
+
+	OPT_UINTEGER('l', "nr_loops", &nr_loops,
+		    "Specify the number of loops to run. (default: 1)"),
+
+	OPT_BOOLEAN('c', "cycles", &use_cycles,
+		    "Use a cycles event instead of gettimeofday() to measure performance"),
+
+	OPT_END()
+};
+
+static const struct option bench_mem_options[] = {	/* memcpy/memset: common options plus -k */
+	OPT_STRING('k', "chunk", &chunk_size_str, "0",
+		    "Specify the chunk-size for each invocation. "
+		    "Available units: B, KB, MB, GB and TB (case insensitive)"),
+	OPT_PARENT(bench_common_options),
+	OPT_END()
+};
+
+union bench_clock {	/* which member is live depends on use_cycles */
+	u64		cycles;	/* use_cycles set: value read from cycles_fd */
+	struct timeval	tv;	/* otherwise: gettimeofday() time */
+};
+
+struct bench_params {	/* filled in by bench_mem_common() from the options */
+	size_t		size;		/* bytes per buffer (-s) */
+	size_t		size_total;	/* size * nr_loops */
+	size_t		chunk_size;	/* bytes per invocation (-k); size when -k is 0 */
+	unsigned int	nr_loops;	/* -l */
+	unsigned int	page_shift;	/* ilog2() of the -p page size */
+	unsigned int	seed;		/* -r; do_mmap() also passes it as the 'random' flag */
+};
+
+struct bench_mem_info {	/* describes one benchmark family to bench_mem_common() */
+	const struct function *functions;	/* table ended by an entry with a NULL name */
+	int (*do_op)(const struct function *r, struct bench_params *p,
+		     void *src, void *dst, union bench_clock *rt);	/* non-zero means failure */
+	const char *const *usage;
+	const struct option *options;
+	bool alloc_src;	/* mem_alloc() also maps a source buffer */
+};
+
+typedef bool (*mem_init_t)(struct bench_mem_info *, struct bench_params *,
+			   void **, void **);	/* like mem_alloc(): true means failure */
+typedef void (*mem_fini_t)(struct bench_mem_info *, struct bench_params *,
+			   void **, void **);	/* like mem_free() */
+typedef void *(*memcpy_t)(void *, const void *, size_t);
+typedef void *(*memset_t)(void *, int, size_t);
+typedef void (*mmap_op_t)(void *, size_t, unsigned int, bool);	/* like mmap_page_touch() */
+
+struct function {	/* one selectable implementation (-f name) */
+	const char *name;	/* NULL marks the end of a table */
+	const char *desc;	/* printed next to the name */
+	struct {
+		mem_init_t init;	/* optional: called before do_op when non-NULL */
+		mem_fini_t fini;	/* optional: called after do_op when non-NULL */
+		union {			/* the routine under test; do_op picks the member */
+			memcpy_t memcpy;
+			memset_t memset;
+			mmap_op_t mmap_op;
+		};
+	} fn;
+};
+
+static struct perf_event_attr cycle_attr = {	/* event opened by init_cycles() */
+	.type		= PERF_TYPE_HARDWARE,
+	.config		= PERF_COUNT_HW_CPU_CYCLES
+};
+
+/* Open the cycles event for getpid(); returns cycles_fd, negative on failure */
+static int init_cycles(void)
+{
+	cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag());
+	if (cycles_fd >= 0 || errno != ENOSYS)
+		return cycles_fd;
+
+	/* only the ENOSYS failure gets a hint about kernel configuration */
+	pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
+	return -1;
+}
+
+/* Read the current u64 value from the cycles_fd event */
+static u64 get_cycles(void)
+{
+	u64 cycles;
+	int nread = read(cycles_fd, &cycles, sizeof(cycles));
+
+	/* anything other than a full u64 is treated as fatal */
+	BUG_ON(nread != sizeof(u64));
+	return cycles;
+}
+
+static void clock_get(union bench_clock *t)
+{
+	if (!use_cycles)	/* wall-clock mode */
+		BUG_ON(gettimeofday(&t->tv, NULL));
+	else			/* cycle-counter mode */
+		t->cycles = get_cycles();
+}
+
+/* Return e - s in whichever representation (cycles or timeval) is active */
+static union bench_clock clock_diff(union bench_clock *s, union bench_clock *e)
+{
+	union bench_clock delta;
+
+	if (!use_cycles)
+		timersub(&e->tv, &s->tv, &delta.tv);
+	else
+		delta.cycles = e->cycles - s->cycles;
+	return delta;
+}
+
+static void clock_accum(union bench_clock *a, union bench_clock *b)
+{
+	if (!use_cycles)	/* *a += *b on timevals */
+		timeradd(&a->tv, &b->tv, &a->tv);
+	else			/* *a += *b on cycle counts */
+		a->cycles += b->cycles;
+}
+
+static double timeval2double(struct timeval *ts)
+{
+	return (double)ts->tv_usec / (double)USEC_PER_SEC + (double)ts->tv_sec; /* in seconds */
+}
+
+#define print_bps(x) do {	/* x: bytes/sec as a double */		\
+		if ((x) < K)						\
+			printf(" %14lf bytes/sec\n", (x));		\
+		else if ((x) < K * K)					\
+			printf(" %14lf KB/sec\n", (x) / K);		\
+		else if ((x) < K * K * K)				\
+			printf(" %14lf MB/sec\n", (x) / K / K);		\
+		else							\
+			printf(" %14lf GB/sec\n", (x) / K / K / K);	\
+	} while (0)
+
+static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p,
+				 int r_idx)
+{
+	const struct function *r = &info->functions[r_idx];
+	double result_bps = 0.0;
+	union bench_clock rt = { 0 };	/* elapsed time or cycles reported by do_op */
+	void *src = NULL, *dst = NULL;
+
+	printf("# function '%s' (%s)\n", r->name, r->desc);
+
+	if (r->fn.init && r->fn.init(info, p, &src, &dst))
+		goto out_init_failed;	/* mem_alloc() returns true on failure */
+
+	if (bench_format == BENCH_FORMAT_DEFAULT)
+		printf("# Copying %s bytes ...\n\n", size_str);
+
+	if (info->do_op(r, p, src, dst, &rt))
+		goto out_test_failed;	/* non-zero: skip the report, still run fini */
+
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		if (use_cycles) {
+			printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)p->size_total);
+		} else {
+			result_bps = (double)p->size_total/timeval2double(&rt.tv);
+			print_bps(result_bps);
+		}
+		break;
+
+	case BENCH_FORMAT_SIMPLE:	/* bare number, no unit */
+		if (use_cycles) {
+			printf("%lf\n", (double)rt.cycles/(double)p->size_total);
+		} else {
+			result_bps = (double)p->size_total/timeval2double(&rt.tv);
+			printf("%lf\n", result_bps);
+		}
+		break;
+
+	default:
+		BUG_ON(1);
+		break;
+	}
+
+out_test_failed:
+out_free:
+	if (r->fn.fini) r->fn.fini(info, p, &src, &dst);
+	return;
+out_init_failed:	/* report, then jump back so fini releases what init did map */
+	printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
+			p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
+	goto out_free;
+}
+
+/* Parse the options into a bench_params and run one or all of info->functions */
+static int bench_mem_common(int argc, const char **argv, struct bench_mem_info *info)
+{
+	int i;
+	struct bench_params p = { 0 };
+	unsigned int page_size;
+
+	argc = parse_options(argc, argv, info->options, info->usage, 0);
+	if (use_cycles) {
+		i = init_cycles();
+		if (i < 0) {
+			fprintf(stderr, "Failed to open cycles counter\n");
+			return i;
+		}
+	}
+
+	p.nr_loops = nr_loops;
+	p.size = (size_t)perf_atoll((char *)size_str);
+
+	if ((s64)p.size <= 0) {
+		fprintf(stderr, "Invalid size:%s\n", size_str);
+		return 1;
+	}
+	p.size_total = p.size * p.nr_loops;
+
+	p.chunk_size = (size_t)perf_atoll((char *)chunk_size_str);
+	if ((s64)p.chunk_size < 0 || (s64)p.chunk_size > (s64)p.size) {
+		fprintf(stderr, "Invalid chunk_size:%s\n", chunk_size_str);
+		return 1;
+	}
+	if (!p.chunk_size)	/* 0 (the default): one chunk covering the whole buffer */
+		p.chunk_size = p.size;
+
+	page_size = (unsigned int)perf_atoll((char *)page_size_str);
+	if (page_size != (1 << PAGE_SHIFT_4KB) &&
+	    page_size != (1 << PAGE_SHIFT_2MB) &&
+	    page_size != (1 << PAGE_SHIFT_1GB)) {
+		fprintf(stderr, "Invalid page-size:%s\n", page_size_str);
+		return 1;
+	}
+	p.page_shift = ilog2(page_size);
+
+	p.seed = seed;
+	/* exact match only: a name that merely starts with "all" is looked up below */
+	if (!strcmp(function_str, "all")) {
+		for (i = 0; info->functions[i].name; i++)
+			__bench_mem_function(info, &p, i);
+		return 0;
+	}
+
+	for (i = 0; info->functions[i].name; i++) {
+		if (!strcmp(info->functions[i].name, function_str))
+			break;
+	}
+	if (!info->functions[i].name) {	/* not found: "help"/"h" just list the table */
+		if (strcmp(function_str, "help") && strcmp(function_str, "h"))
+			printf("Unknown function: %s\n", function_str);
+		printf("Available functions:\n");
+		for (i = 0; info->functions[i].name; i++) {
+			printf("\t%s ... %s\n",
+			       info->functions[i].name, info->functions[i].desc);
+		}
+		return 1;
+	}
+
+	__bench_mem_function(info, &p, i);
+
+	return 0;
+}
+
+static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst)
+{
+	/* Write the whole source first so its pages are always prefaulted: */
+	memset(src, 0, size);
+
+	/*
+	 * One untimed copy with the function under test prefaults dst,
+	 * so that the timed loop does not measure page fault overhead:
+	 */
+	fn(dst, src, size);
+}
+
+/* Time nr_loops chunked copies of src into dst; the elapsed clock goes to *rt */
+static int do_memcpy(const struct function *r, struct bench_params *p,
+		     void *src, void *dst, union bench_clock *rt)
+{
+	union bench_clock t_begin, t_end;
+	memcpy_t copy = r->fn.memcpy;
+
+	memcpy_prefault(copy, p->size, src, dst);
+	clock_get(&t_begin);
+	for (unsigned int loop = 0; loop < p->nr_loops; ++loop) {
+		for (size_t pos = 0; pos < p->size; pos += p->chunk_size)
+			copy(dst + pos, src + pos, min(p->chunk_size, p->size - pos));
+	}
+	clock_get(&t_end);
+
+	*rt = clock_diff(&t_begin, &t_end);
+	return 0;
+}
+
+/* Anonymous private mapping of @size bytes; returns NULL (not MAP_FAILED) on failure */
+static void *bench_mmap(size_t size, bool populate, unsigned int page_shift)
+{
+	void *p;
+	int extra = populate ? MAP_POPULATE : 0;
+
+	if (page_shift != PAGE_SHIFT_4KB)
+		extra |= MAP_HUGETLB | (page_shift << MAP_HUGE_SHIFT);
+
+	/* fd is -1, the value portable code passes together with MAP_ANONYMOUS */
+	p = mmap(NULL, size, PROT_READ|PROT_WRITE, extra | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	return p == MAP_FAILED ? NULL : p;
+}
+
+static void bench_munmap(void *p, size_t size)
+{
+	if (p)	/* NULL is what bench_mmap() returns for a failed mapping */
+		munmap(p, size);
+}
+
+/*
+ * fn.init hook: map dst, plus src when info->alloc_src is set.
+ * Returns true on failure.
+ */
+static bool mem_alloc(struct bench_mem_info *info, struct bench_params *p,
+		      void **src, void **dst)
+{
+	*dst = bench_mmap(p->size, true, p->page_shift);
+	if (!info->alloc_src)
+		return *dst == NULL;
+
+	*src = bench_mmap(p->size, true, p->page_shift);
+
+	return *dst == NULL || *src == NULL;
+}
+
+static void mem_free(struct bench_mem_info *info __maybe_unused,
+		     struct bench_params *p __maybe_unused,
+		     void **src, void **dst)
+{
+	bench_munmap(*dst, p->size);	/* bench_munmap() skips NULL pointers */
+	bench_munmap(*src, p->size);
+	*src = NULL;
+	*dst = NULL;
+}
+
+struct function memcpy_functions[] = {	/* NOTE(review): memset_functions is static const, this table is neither */
+	{ .name		= "default",
+	  .desc		= "Default memcpy() provided by glibc",
+	  .fn.init	= mem_alloc,
+	  .fn.fini	= mem_free,
+	  .fn.memcpy	= memcpy },
+
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+# define MEMCPY_FN(_fn, _init, _fini, _name, _desc)	\
+	{.name = _name, .desc = _desc, .fn.memcpy = _fn, .fn.init = _init, .fn.fini = _fini },
+# include "mem-memcpy-x86-64-asm-def.h"
+# undef MEMCPY_FN
+#endif
+
+	{ .name = NULL, }	/* end-of-table marker */
+};
+
+static const char * const bench_mem_memcpy_usage[] = {
+	"perf bench mem memcpy <options>",
+	NULL
+};
+
+int bench_mem_memcpy(int argc, const char **argv)	/* "perf bench mem memcpy" */
+{
+	struct bench_mem_info info = {
+		.functions		= memcpy_functions,
+		.do_op			= do_memcpy,
+		.usage			= bench_mem_memcpy_usage,
+		.options		= bench_mem_options,
+		.alloc_src              = true,	/* do_memcpy() reads from a source buffer */
+	};
+
+	return bench_mem_common(argc, argv, &info);
+}
+
+/* Time nr_loops chunked fills of dst; the elapsed clock is stored in *rt */
+static int do_memset(const struct function *r, struct bench_params *p,
+		     void *src __maybe_unused, void *dst, union bench_clock *rt)
+{
+	union bench_clock t_begin, t_end;
+	memset_t fill = r->fn.memset;
+
+	/* Untimed first pass, so that page fault overhead is not measured: */
+	fill(dst, -1, p->size);
+
+	clock_get(&t_begin);
+	for (unsigned int loop = 0; loop < p->nr_loops; ++loop) {
+		size_t pos = 0;
+
+		/* the loop index is used as the fill value */
+		for (; pos < p->size; pos += p->chunk_size)
+			fill(dst + pos, loop, min(p->chunk_size, p->size - pos));
+	}
+	clock_get(&t_end);
+	*rt = clock_diff(&t_begin, &t_end);
+	return 0;
+}
+
+static const char * const bench_mem_memset_usage[] = {
+	"perf bench mem memset <options>",
+	NULL
+};
+
+static const struct function memset_functions[] = {	/* selectable with -f */
+	{ .name		= "default",
+	  .desc		= "Default memset() provided by glibc",
+	  .fn.init	= mem_alloc,
+	  .fn.fini	= mem_free,
+	  .fn.memset	= memset },
+
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+# define MEMSET_FN(_fn, _init, _fini, _name, _desc) \
+	{.name = _name, .desc = _desc, .fn.memset = _fn, .fn.init = _init, .fn.fini = _fini },
+# include "mem-memset-x86-64-asm-def.h"
+# undef MEMSET_FN
+#endif
+
+	{ .name = NULL, }	/* end-of-table marker */
+};
+
+int bench_mem_memset(int argc, const char **argv)	/* "perf bench mem memset" */
+{
+	struct bench_mem_info info = {	/* alloc_src stays false: only dst is mapped */
+		.functions		= memset_functions,
+		.do_op			= do_memset,
+		.usage			= bench_mem_memset_usage,
+		.options		= bench_mem_options,
+	};
+
+	return bench_mem_common(argc, argv, &info);
+}
+
+/* Modify one byte per page of dst: at offset 0, or at a rand() offset if @random */
+static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, bool random)
+{
+	const unsigned long page_size = 1 << page_shift;
+	unsigned long npages = size / page_size, in_page = 0;
+	char *page = dst;
+
+	for (unsigned long i = 0; i < npages; i++, page += page_size) {
+		if (random)
+			in_page = rand() % page_size;
+		page[in_page] += i;
+	}
+}
+
+static int do_mmap(const struct function *r, struct bench_params *p,
+		  void *src __maybe_unused, void *dst __maybe_unused,
+		  union bench_clock *accum)
+{
+	union bench_clock start, end, diff;
+	mmap_op_t fn = r->fn.mmap_op;
+	bool populate = strcmp(r->name, "populate") == 0;	/* the "populate" entry maps with MAP_POPULATE */
+
+	if (p->seed)
+		srand(p->seed);
+
+	for (unsigned int i = 0; i < p->nr_loops; i++) {
+		clock_get(&start);	/* the timed span covers the mmap and the page touching */
+		dst = bench_mmap(p->size, populate, p->page_shift);
+		if (!dst)
+			goto out;
+
+		fn(dst, p->size, p->page_shift, p->seed);	/* a non-zero seed turns on random offsets */
+		clock_get(&end);
+		diff = clock_diff(&start, &end);
+		clock_accum(accum, &diff);
+
+		bench_munmap(dst, p->size);	/* unmapping happens outside the timed span */
+	}
+
+	return 0;
+out:
+	printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str,
+			p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large");
+	return -1;
+}
+
+static const char * const bench_mem_mmap_usage[] = {
+	"perf bench mem mmap <options>",
+	NULL
+};
+
+static const struct function mmap_functions[] = {	/* no init/fini hooks: do_mmap() maps and unmaps itself */
+	{ .name		= "demand",
+	  .desc		= "Demand loaded mmap()",
+	  .fn.mmap_op	= mmap_page_touch },
+
+	{ .name		= "populate",	/* do_mmap() keys MAP_POPULATE off this name */
+	  .desc		= "Eagerly populated mmap()",
+	  .fn.mmap_op	= mmap_page_touch },
+
+	{ .name = NULL, }	/* end-of-table marker */
+};
+
+int bench_mem_mmap(int argc, const char **argv)	/* "perf bench mem mmap" */
+{
+	static const struct option bench_mmap_options[] = {	/* common options plus -r */
+		OPT_UINTEGER('r', "randomize", &seed,
+			    "Seed to randomize page access offset."),
+		OPT_PARENT(bench_common_options),
+		OPT_END()
+	};
+
+	struct bench_mem_info info = {
+		.functions		= mmap_functions,
+		.do_op			= do_mmap,
+		.usage			= bench_mem_mmap_usage,
+		.options		= bench_mmap_options,
+	};
+
+	return bench_mem_common(argc, argv, &info);
+}
diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h
index a72e36cb5394..852e48cfd8fe 100644
--- a/tools/perf/bench/mem-memcpy-arch.h
+++ b/tools/perf/bench/mem-memcpy-arch.h
@@ -1,8 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 
-#ifdef ARCH_X86_64
+#ifdef HAVE_ARCH_X86_64_SUPPORT
 
-#define MEMCPY_FN(fn, name, desc)		\
-	extern void *fn(void *, const void *, size_t);
+#define MEMCPY_FN(fn, init, fini, name, desc)		\
+	void *fn(void *, const void *, size_t);
 
 #include "mem-memcpy-x86-64-asm-def.h"
 
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
index d66ab799b35f..f43038f4448b 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -1,12 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 
-MEMCPY_FN(__memcpy,
+MEMCPY_FN(memcpy_orig,
+	mem_alloc,
+	mem_free,
 	"x86-64-unrolled",
 	"unrolled memcpy() in arch/x86/lib/memcpy_64.S")
 
-MEMCPY_FN(memcpy_c,
+MEMCPY_FN(__memcpy,
+	mem_alloc,
+	mem_free,
 	"x86-64-movsq",
 	"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
-
-MEMCPY_FN(memcpy_c_e,
-	"x86-64-movsb",
-	"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
index fcd9cf00600a..1b9fef7efcdc 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -1,9 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Various wrappers to make the kernel .S file build in user-space: */
+
+// memcpy_orig is being defined as SYM_L_LOCAL but we need it
+#define SYM_FUNC_START_LOCAL(name)                      \
+        SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
 #define memcpy MEMCPY /* don't hide glibc's memcpy() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#define Lmemcpy_c globl memcpy_c; memcpy_c
-#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
-#include "../../../arch/x86/lib/memcpy_64.S"
+#define _ASM_EXTABLE_FAULT(x, y)
+#define _ASM_EXTABLE(x, y)
+
+#include "../../arch/x86/lib/memcpy_64.S"
 /*
  * We need to provide note.GNU-stack section, saying that we want
  * NOT executable stack. Otherwise the final linking will assume that
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
deleted file mode 100644
index 25fd3f1966f1..000000000000
--- a/tools/perf/bench/mem-memcpy.c
+++ /dev/null
@@ -1,303 +0,0 @@
-/*
- * mem-memcpy.c
- *
- * memcpy: Simple memory copy in various ways
- *
- * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
- */
-
-#include "../perf.h"
-#include "../util/util.h"
-#include "../util/parse-options.h"
-#include "../util/header.h"
-#include "bench.h"
-#include "mem-memcpy-arch.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <errno.h>
-
-#define K 1024
-
-static const char	*length_str	= "1MB";
-static const char	*routine	= "default";
-static int		iterations	= 1;
-static bool		use_cycle;
-static int		cycle_fd;
-static bool		only_prefault;
-static bool		no_prefault;
-
-static const struct option options[] = {
-	OPT_STRING('l', "length", &length_str, "1MB",
-		    "Specify length of memory to copy. "
-		    "Available units: B, KB, MB, GB and TB (upper and lower)"),
-	OPT_STRING('r', "routine", &routine, "default",
-		    "Specify routine to copy"),
-	OPT_INTEGER('i', "iterations", &iterations,
-		    "repeat memcpy() invocation this number of times"),
-	OPT_BOOLEAN('c', "cycle", &use_cycle,
-		    "Use cycles event instead of gettimeofday() for measuring"),
-	OPT_BOOLEAN('o', "only-prefault", &only_prefault,
-		    "Show only the result with page faults before memcpy()"),
-	OPT_BOOLEAN('n', "no-prefault", &no_prefault,
-		    "Show only the result without page faults before memcpy()"),
-	OPT_END()
-};
-
-typedef void *(*memcpy_t)(void *, const void *, size_t);
-
-struct routine {
-	const char *name;
-	const char *desc;
-	memcpy_t fn;
-};
-
-struct routine routines[] = {
-	{ "default",
-	  "Default memcpy() provided by glibc",
-	  memcpy },
-#ifdef ARCH_X86_64
-
-#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
-#include "mem-memcpy-x86-64-asm-def.h"
-#undef MEMCPY_FN
-
-#endif
-
-	{ NULL,
-	  NULL,
-	  NULL   }
-};
-
-static const char * const bench_mem_memcpy_usage[] = {
-	"perf bench mem memcpy <options>",
-	NULL
-};
-
-static struct perf_event_attr cycle_attr = {
-	.type		= PERF_TYPE_HARDWARE,
-	.config		= PERF_COUNT_HW_CPU_CYCLES
-};
-
-static void init_cycle(void)
-{
-	cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0);
-
-	if (cycle_fd < 0 && errno == ENOSYS)
-		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
-	else
-		BUG_ON(cycle_fd < 0);
-}
-
-static u64 get_cycle(void)
-{
-	int ret;
-	u64 clk;
-
-	ret = read(cycle_fd, &clk, sizeof(u64));
-	BUG_ON(ret != sizeof(u64));
-
-	return clk;
-}
-
-static double timeval2double(struct timeval *ts)
-{
-	return (double)ts->tv_sec +
-		(double)ts->tv_usec / (double)1000000;
-}
-
-static void alloc_mem(void **dst, void **src, size_t length)
-{
-	*dst = zalloc(length);
-	if (!*dst)
-		die("memory allocation failed - maybe length is too large?\n");
-
-	*src = zalloc(length);
-	if (!*src)
-		die("memory allocation failed - maybe length is too large?\n");
-}
-
-static u64 do_memcpy_cycle(memcpy_t fn, size_t len, bool prefault)
-{
-	u64 cycle_start = 0ULL, cycle_end = 0ULL;
-	void *src = NULL, *dst = NULL;
-	int i;
-
-	alloc_mem(&src, &dst, len);
-
-	if (prefault)
-		fn(dst, src, len);
-
-	cycle_start = get_cycle();
-	for (i = 0; i < iterations; ++i)
-		fn(dst, src, len);
-	cycle_end = get_cycle();
-
-	free(src);
-	free(dst);
-	return cycle_end - cycle_start;
-}
-
-static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
-{
-	struct timeval tv_start, tv_end, tv_diff;
-	void *src = NULL, *dst = NULL;
-	int i;
-
-	alloc_mem(&src, &dst, len);
-
-	if (prefault)
-		fn(dst, src, len);
-
-	BUG_ON(gettimeofday(&tv_start, NULL));
-	for (i = 0; i < iterations; ++i)
-		fn(dst, src, len);
-	BUG_ON(gettimeofday(&tv_end, NULL));
-
-	timersub(&tv_end, &tv_start, &tv_diff);
-
-	free(src);
-	free(dst);
-	return (double)((double)len / timeval2double(&tv_diff));
-}
-
-#define pf (no_prefault ? 0 : 1)
-
-#define print_bps(x) do {					\
-		if (x < K)					\
-			printf(" %14lf B/Sec", x);		\
-		else if (x < K * K)				\
-			printf(" %14lfd KB/Sec", x / K);	\
-		else if (x < K * K * K)				\
-			printf(" %14lf MB/Sec", x / K / K);	\
-		else						\
-			printf(" %14lf GB/Sec", x / K / K / K); \
-	} while (0)
-
-int bench_mem_memcpy(int argc, const char **argv,
-		     const char *prefix __maybe_unused)
-{
-	int i;
-	size_t len;
-	double result_bps[2];
-	u64 result_cycle[2];
-
-	argc = parse_options(argc, argv, options,
-			     bench_mem_memcpy_usage, 0);
-
-	if (use_cycle)
-		init_cycle();
-
-	len = (size_t)perf_atoll((char *)length_str);
-
-	result_cycle[0] = result_cycle[1] = 0ULL;
-	result_bps[0] = result_bps[1] = 0.0;
-
-	if ((s64)len <= 0) {
-		fprintf(stderr, "Invalid length:%s\n", length_str);
-		return 1;
-	}
-
-	/* same to without specifying either of prefault and no-prefault */
-	if (only_prefault && no_prefault)
-		only_prefault = no_prefault = false;
-
-	for (i = 0; routines[i].name; i++) {
-		if (!strcmp(routines[i].name, routine))
-			break;
-	}
-	if (!routines[i].name) {
-		printf("Unknown routine:%s\n", routine);
-		printf("Available routines...\n");
-		for (i = 0; routines[i].name; i++) {
-			printf("\t%s ... %s\n",
-			       routines[i].name, routines[i].desc);
-		}
-		return 1;
-	}
-
-	if (bench_format == BENCH_FORMAT_DEFAULT)
-		printf("# Copying %s Bytes ...\n\n", length_str);
-
-	if (!only_prefault && !no_prefault) {
-		/* show both of results */
-		if (use_cycle) {
-			result_cycle[0] =
-				do_memcpy_cycle(routines[i].fn, len, false);
-			result_cycle[1] =
-				do_memcpy_cycle(routines[i].fn, len, true);
-		} else {
-			result_bps[0] =
-				do_memcpy_gettimeofday(routines[i].fn,
-						len, false);
-			result_bps[1] =
-				do_memcpy_gettimeofday(routines[i].fn,
-						len, true);
-		}
-	} else {
-		if (use_cycle) {
-			result_cycle[pf] =
-				do_memcpy_cycle(routines[i].fn,
-						len, only_prefault);
-		} else {
-			result_bps[pf] =
-				do_memcpy_gettimeofday(routines[i].fn,
-						len, only_prefault);
-		}
-	}
-
-	switch (bench_format) {
-	case BENCH_FORMAT_DEFAULT:
-		if (!only_prefault && !no_prefault) {
-			if (use_cycle) {
-				printf(" %14lf Cycle/Byte\n",
-					(double)result_cycle[0]
-					/ (double)len);
-				printf(" %14lf Cycle/Byte (with prefault)\n",
-					(double)result_cycle[1]
-					/ (double)len);
-			} else {
-				print_bps(result_bps[0]);
-				printf("\n");
-				print_bps(result_bps[1]);
-				printf(" (with prefault)\n");
-			}
-		} else {
-			if (use_cycle) {
-				printf(" %14lf Cycle/Byte",
-					(double)result_cycle[pf]
-					/ (double)len);
-			} else
-				print_bps(result_bps[pf]);
-
-			printf("%s\n", only_prefault ? " (with prefault)" : "");
-		}
-		break;
-	case BENCH_FORMAT_SIMPLE:
-		if (!only_prefault && !no_prefault) {
-			if (use_cycle) {
-				printf("%lf %lf\n",
-					(double)result_cycle[0] / (double)len,
-					(double)result_cycle[1] / (double)len);
-			} else {
-				printf("%lf %lf\n",
-					result_bps[0], result_bps[1]);
-			}
-		} else {
-			if (use_cycle) {
-				printf("%lf\n", (double)result_cycle[pf]
-					/ (double)len);
-			} else
-				printf("%lf\n", result_bps[pf]);
-		}
-		break;
-	default:
-		/* reaching this means there's some disaster: */
-		die("unknown format: %d\n", bench_format);
-		break;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/bench/mem-memset-arch.h b/tools/perf/bench/mem-memset-arch.h
index a040fa77665b..278c5da12d63 100644
--- a/tools/perf/bench/mem-memset-arch.h
+++ b/tools/perf/bench/mem-memset-arch.h
@@ -1,8 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 
-#ifdef ARCH_X86_64
+#ifdef HAVE_ARCH_X86_64_SUPPORT
 
-#define MEMSET_FN(fn, name, desc)		\
-	extern void *fn(void *, int, size_t);
+#define MEMSET_FN(fn, init, fini, name, desc)	\
+	void *fn(void *, int, size_t);
 
 #include "mem-memset-x86-64-asm-def.h"
 
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
index a71dff97c1f5..80ad1b7ea770 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -1,12 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 
-MEMSET_FN(__memset,
+MEMSET_FN(memset_orig,
+	mem_alloc,
+	mem_free,
 	"x86-64-unrolled",
 	"unrolled memset() in arch/x86/lib/memset_64.S")
 
-MEMSET_FN(memset_c,
+MEMSET_FN(__memset,
+	mem_alloc,
+	mem_free,
 	"x86-64-stosq",
 	"movsq-based memset() in arch/x86/lib/memset_64.S")
-
-MEMSET_FN(memset_c_e,
-	"x86-64-stosb",
-	"movsb-based memset() in arch/x86/lib/memset_64.S")
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S
index 9e5af89ed13a..abd26c95f1aa 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm.S
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -1,9 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// memset_orig is defined with SYM_L_LOCAL, but the benchmark needs it global
+#define SYM_FUNC_START_LOCAL(name)                      \
+        SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
 #define memset MEMSET /* don't hide glibc's memset() */
 #define altinstr_replacement text
 #define globl p2align 4; .globl
-#define Lmemset_c globl memset_c; memset_c
-#define Lmemset_c_e globl memset_c_e; memset_c_e
-#include "../../../arch/x86/lib/memset_64.S"
+#include "../../arch/x86/lib/memset_64.S"
 
 /*
  * We need to provide note.GNU-stack section, saying that we want
diff --git a/tools/perf/bench/mem-memset.c b/tools/perf/bench/mem-memset.c
deleted file mode 100644
index 4a2f12081964..000000000000
--- a/tools/perf/bench/mem-memset.c
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * mem-memset.c
- *
- * memset: Simple memory set in various ways
- *
- * Trivial clone of mem-memcpy.c.
- */
-
-#include "../perf.h"
-#include "../util/util.h"
-#include "../util/parse-options.h"
-#include "../util/header.h"
-#include "bench.h"
-#include "mem-memset-arch.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/time.h>
-#include <errno.h>
-
-#define K 1024
-
-static const char	*length_str	= "1MB";
-static const char	*routine	= "default";
-static int		iterations	= 1;
-static bool		use_cycle;
-static int		cycle_fd;
-static bool		only_prefault;
-static bool		no_prefault;
-
-static const struct option options[] = {
-	OPT_STRING('l', "length", &length_str, "1MB",
-		    "Specify length of memory to set. "
-		    "Available units: B, KB, MB, GB and TB (upper and lower)"),
-	OPT_STRING('r', "routine", &routine, "default",
-		    "Specify routine to set"),
-	OPT_INTEGER('i', "iterations", &iterations,
-		    "repeat memset() invocation this number of times"),
-	OPT_BOOLEAN('c', "cycle", &use_cycle,
-		    "Use cycles event instead of gettimeofday() for measuring"),
-	OPT_BOOLEAN('o', "only-prefault", &only_prefault,
-		    "Show only the result with page faults before memset()"),
-	OPT_BOOLEAN('n', "no-prefault", &no_prefault,
-		    "Show only the result without page faults before memset()"),
-	OPT_END()
-};
-
-typedef void *(*memset_t)(void *, int, size_t);
-
-struct routine {
-	const char *name;
-	const char *desc;
-	memset_t fn;
-};
-
-static const struct routine routines[] = {
-	{ "default",
-	  "Default memset() provided by glibc",
-	  memset },
-#ifdef ARCH_X86_64
-
-#define MEMSET_FN(fn, name, desc) { name, desc, fn },
-#include "mem-memset-x86-64-asm-def.h"
-#undef MEMSET_FN
-
-#endif
-
-	{ NULL,
-	  NULL,
-	  NULL   }
-};
-
-static const char * const bench_mem_memset_usage[] = {
-	"perf bench mem memset <options>",
-	NULL
-};
-
-static struct perf_event_attr cycle_attr = {
-	.type		= PERF_TYPE_HARDWARE,
-	.config		= PERF_COUNT_HW_CPU_CYCLES
-};
-
-static void init_cycle(void)
-{
-	cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0);
-
-	if (cycle_fd < 0 && errno == ENOSYS)
-		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
-	else
-		BUG_ON(cycle_fd < 0);
-}
-
-static u64 get_cycle(void)
-{
-	int ret;
-	u64 clk;
-
-	ret = read(cycle_fd, &clk, sizeof(u64));
-	BUG_ON(ret != sizeof(u64));
-
-	return clk;
-}
-
-static double timeval2double(struct timeval *ts)
-{
-	return (double)ts->tv_sec +
-		(double)ts->tv_usec / (double)1000000;
-}
-
-static void alloc_mem(void **dst, size_t length)
-{
-	*dst = zalloc(length);
-	if (!*dst)
-		die("memory allocation failed - maybe length is too large?\n");
-}
-
-static u64 do_memset_cycle(memset_t fn, size_t len, bool prefault)
-{
-	u64 cycle_start = 0ULL, cycle_end = 0ULL;
-	void *dst = NULL;
-	int i;
-
-	alloc_mem(&dst, len);
-
-	if (prefault)
-		fn(dst, -1, len);
-
-	cycle_start = get_cycle();
-	for (i = 0; i < iterations; ++i)
-		fn(dst, i, len);
-	cycle_end = get_cycle();
-
-	free(dst);
-	return cycle_end - cycle_start;
-}
-
-static double do_memset_gettimeofday(memset_t fn, size_t len, bool prefault)
-{
-	struct timeval tv_start, tv_end, tv_diff;
-	void *dst = NULL;
-	int i;
-
-	alloc_mem(&dst, len);
-
-	if (prefault)
-		fn(dst, -1, len);
-
-	BUG_ON(gettimeofday(&tv_start, NULL));
-	for (i = 0; i < iterations; ++i)
-		fn(dst, i, len);
-	BUG_ON(gettimeofday(&tv_end, NULL));
-
-	timersub(&tv_end, &tv_start, &tv_diff);
-
-	free(dst);
-	return (double)((double)len / timeval2double(&tv_diff));
-}
-
-#define pf (no_prefault ? 0 : 1)
-
-#define print_bps(x) do {					\
-		if (x < K)					\
-			printf(" %14lf B/Sec", x);		\
-		else if (x < K * K)				\
-			printf(" %14lfd KB/Sec", x / K);	\
-		else if (x < K * K * K)				\
-			printf(" %14lf MB/Sec", x / K / K);	\
-		else						\
-			printf(" %14lf GB/Sec", x / K / K / K); \
-	} while (0)
-
-int bench_mem_memset(int argc, const char **argv,
-		     const char *prefix __maybe_unused)
-{
-	int i;
-	size_t len;
-	double result_bps[2];
-	u64 result_cycle[2];
-
-	argc = parse_options(argc, argv, options,
-			     bench_mem_memset_usage, 0);
-
-	if (use_cycle)
-		init_cycle();
-
-	len = (size_t)perf_atoll((char *)length_str);
-
-	result_cycle[0] = result_cycle[1] = 0ULL;
-	result_bps[0] = result_bps[1] = 0.0;
-
-	if ((s64)len <= 0) {
-		fprintf(stderr, "Invalid length:%s\n", length_str);
-		return 1;
-	}
-
-	/* same to without specifying either of prefault and no-prefault */
-	if (only_prefault && no_prefault)
-		only_prefault = no_prefault = false;
-
-	for (i = 0; routines[i].name; i++) {
-		if (!strcmp(routines[i].name, routine))
-			break;
-	}
-	if (!routines[i].name) {
-		printf("Unknown routine:%s\n", routine);
-		printf("Available routines...\n");
-		for (i = 0; routines[i].name; i++) {
-			printf("\t%s ... %s\n",
-			       routines[i].name, routines[i].desc);
-		}
-		return 1;
-	}
-
-	if (bench_format == BENCH_FORMAT_DEFAULT)
-		printf("# Copying %s Bytes ...\n\n", length_str);
-
-	if (!only_prefault && !no_prefault) {
-		/* show both of results */
-		if (use_cycle) {
-			result_cycle[0] =
-				do_memset_cycle(routines[i].fn, len, false);
-			result_cycle[1] =
-				do_memset_cycle(routines[i].fn, len, true);
-		} else {
-			result_bps[0] =
-				do_memset_gettimeofday(routines[i].fn,
-						len, false);
-			result_bps[1] =
-				do_memset_gettimeofday(routines[i].fn,
-						len, true);
-		}
-	} else {
-		if (use_cycle) {
-			result_cycle[pf] =
-				do_memset_cycle(routines[i].fn,
-						len, only_prefault);
-		} else {
-			result_bps[pf] =
-				do_memset_gettimeofday(routines[i].fn,
-						len, only_prefault);
-		}
-	}
-
-	switch (bench_format) {
-	case BENCH_FORMAT_DEFAULT:
-		if (!only_prefault && !no_prefault) {
-			if (use_cycle) {
-				printf(" %14lf Cycle/Byte\n",
-					(double)result_cycle[0]
-					/ (double)len);
-				printf(" %14lf Cycle/Byte (with prefault)\n ",
-					(double)result_cycle[1]
-					/ (double)len);
-			} else {
-				print_bps(result_bps[0]);
-				printf("\n");
-				print_bps(result_bps[1]);
-				printf(" (with prefault)\n");
-			}
-		} else {
-			if (use_cycle) {
-				printf(" %14lf Cycle/Byte",
-					(double)result_cycle[pf]
-					/ (double)len);
-			} else
-				print_bps(result_bps[pf]);
-
-			printf("%s\n", only_prefault ? " (with prefault)" : "");
-		}
-		break;
-	case BENCH_FORMAT_SIMPLE:
-		if (!only_prefault && !no_prefault) {
-			if (use_cycle) {
-				printf("%lf %lf\n",
-					(double)result_cycle[0] / (double)len,
-					(double)result_cycle[1] / (double)len);
-			} else {
-				printf("%lf %lf\n",
-					result_bps[0], result_bps[1]);
-			}
-		} else {
-			if (use_cycle) {
-				printf("%lf\n", (double)result_cycle[pf]
-					/ (double)len);
-			} else
-				printf("%lf\n", result_bps[pf]);
-		}
-		break;
-	default:
-		/* reaching this means there's some disaster: */
-		die("unknown format: %d\n", bench_format);
-		break;
-	}
-
-	return 0;
-}
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
index 30d1c3225b46..19be2aaf4dc0 100644
--- a/tools/perf/bench/numa.c
+++ b/tools/perf/bench/numa.c
@@ -1,13 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  * numa.c
  *
  * numa: Simulate NUMA-sensitive workload and measure their NUMA performance
  */
 
-#include "../perf.h"
-#include "../builtin.h"
-#include "../util/util.h"
-#include "../util/parse-options.h"
+#include <inttypes.h>
+
+#include <subcmd/parse-options.h>
+#include "../util/cloexec.h"
 
 #include "bench.h"
 
@@ -15,34 +16,48 @@
 #include <sched.h>
 #include <stdio.h>
 #include <assert.h>
+#include <debug.h>
 #include <malloc.h>
 #include <signal.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
-#include <pthread.h>
 #include <sys/mman.h>
 #include <sys/time.h>
+#include <sys/resource.h>
 #include <sys/wait.h>
 #include <sys/prctl.h>
+#include <sys/stat.h>
 #include <sys/types.h>
-
+#include <linux/kernel.h>
+#include <linux/time64.h>
+#include <linux/numa.h>
+#include <linux/zalloc.h>
+
+#include "../util/header.h"
+#include "../util/mutex.h"
+#include <api/fs/fs.h>
 #include <numa.h>
 #include <numaif.h>
 
+#ifndef RUSAGE_THREAD
+# define RUSAGE_THREAD 1
+#endif
+
 /*
- * Regular printout to the terminal, supressed if -q is specified:
+ * Regular printout to the terminal, suppressed if -q is specified:
  */
 #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)
 
 /*
  * Debug printf:
  */
+#undef dprintf
 #define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)
 
 struct thread_data {
 	int			curr_cpu;
-	cpu_set_t		bind_cpumask;
+	cpu_set_t		*bind_cpumask;
 	int			bind_node;
 	u8			*process_data;
 	int			process_nr;
@@ -51,7 +66,10 @@ struct thread_data {
 	unsigned int		loops_done;
 	u64			val;
 	u64			runtime_ns;
-	pthread_mutex_t		*process_lock;
+	u64			system_time_ns;
+	u64			user_time_ns;
+	double			speed_gbs;
+	struct mutex		*process_lock;
 };
 
 /* Parameters set by options: */
@@ -101,7 +119,6 @@ struct params {
 	long			bytes_thread;
 
 	int			nr_tasks;
-	bool			show_quiet;
 
 	bool			show_convergence;
 	bool			measure_convergence;
@@ -121,15 +138,16 @@ struct params {
 struct global_info {
 	u8			*data;
 
-	pthread_mutex_t		startup_mutex;
+	struct mutex		startup_mutex;
+	struct cond		startup_cond;
 	int			nr_tasks_started;
 
-	pthread_mutex_t		startup_done_mutex;
-
-	pthread_mutex_t		start_work_mutex;
+	struct mutex		start_work_mutex;
+	struct cond		start_work_cond;
 	int			nr_tasks_working;
+	bool			start_work;
 
-	pthread_mutex_t		stop_work_mutex;
+	struct mutex		stop_work_mutex;
 	u64			bytes_done;
 
 	struct thread_data	*threads;
@@ -159,11 +177,11 @@ static const struct option options[] = {
 	OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"),
 	OPT_STRING('T', "mb_thread"	, &p0.mb_thread_str,	"MB", "thread  memory (MBs)"),
 
-	OPT_UINTEGER('l', "nr_loops"	, &p0.nr_loops,		"max number of loops to run"),
-	OPT_UINTEGER('s', "nr_secs"	, &p0.nr_secs,		"max number of seconds to run"),
+	OPT_UINTEGER('l', "nr_loops"	, &p0.nr_loops,		"max number of loops to run (default: unlimited)"),
+	OPT_UINTEGER('s', "nr_secs"	, &p0.nr_secs,		"max number of seconds to run (default: 5 secs)"),
 	OPT_UINTEGER('u', "usleep"	, &p0.sleep_usecs,	"usecs to sleep per loop iteration"),
 
-	OPT_BOOLEAN('R', "data_reads"	, &p0.data_reads,	"access the data via writes (can be mixed with -W)"),
+	OPT_BOOLEAN('R', "data_reads"	, &p0.data_reads,	"access the data via reads (can be mixed with -W)"),
 	OPT_BOOLEAN('W', "data_writes"	, &p0.data_writes,	"access the data via writes (can be mixed with -R)"),
 	OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards,	"access the data backwards as well"),
 	OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
@@ -178,9 +196,11 @@ static const struct option options[] = {
 	OPT_INCR   ('d', "show_details"	, &p0.show_details,	"Show details"),
 	OPT_INCR   ('a', "all"		, &p0.run_all,		"Run all tests in the suite"),
 	OPT_INTEGER('H', "thp"		, &p0.thp,		"MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
-	OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
+	OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details, "
+		    "convergence is reached when each process (all its threads) is running on a single NUMA node."),
 	OPT_BOOLEAN('m', "measure_convergence",	&p0.measure_convergence, "measure convergence latency"),
-	OPT_BOOLEAN('q', "quiet"	, &p0.show_quiet,	"bzero the initial allocations"),
+	OPT_BOOLEAN('q', "quiet"	, &quiet,
+		    "quiet mode (do not show any warnings or messages)"),
 	OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
 
 	/* Special option string parsing callbacks: */
@@ -203,72 +223,187 @@ static const char * const numa_usage[] = {
 	NULL
 };
 
-static cpu_set_t bind_to_cpu(int target_cpu)
+/*
+ * Count the node ids below g->p.nr_nodes that are set in numa_nodes_ptr.
+ */
+static int nr_numa_nodes(void)
 {
-	cpu_set_t orig_mask, mask;
-	int ret;
+	int i, nr_nodes = 0;
 
-	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
-	BUG_ON(ret);
+	for (i = 0; i < g->p.nr_nodes; i++) {
+		if (numa_bitmask_isbitset(numa_nodes_ptr, i))
+			nr_nodes++;
+	}
+
+	return nr_nodes;
+}
+
+/*
+ * Check whether the given node id is set in numa_nodes_ptr.
+ */
+static int is_node_present(int node)
+{
+	return numa_bitmask_isbitset(numa_nodes_ptr, node);
+}
+
+/*
+ * Check whether the given NUMA node has at least one CPU.
+ */
+static bool node_has_cpus(int node)
+{
+	struct bitmask *cpumask = numa_allocate_cpumask();
+	bool ret = false; /* no CPUs unless one is found below */
+	int cpu;
+
+	BUG_ON(!cpumask);
+	if (!numa_node_to_cpus(node, cpumask)) {
+		for (cpu = 0; cpu < (int)cpumask->size; cpu++) {
+			if (numa_bitmask_isbitset(cpumask, cpu)) {
+				ret = true;
+				break;
+			}
+		}
+	}
+	numa_free_cpumask(cpumask);
+
+	return ret;
+}
+
+/*
+ * Bind the calling task to target_cpu, or to CPUs 0..g->p.nr_cpus-1 when
+ * target_cpu is -1.
+ *
+ * Returns the previous affinity mask, which the caller releases with
+ * CPU_FREE(). Every failure path runs into BUG_ON().
+ */
+static cpu_set_t *bind_to_cpu(int target_cpu)
+{
+	int nrcpus = numa_num_possible_cpus();
+	cpu_set_t *orig_mask, *mask;
+	size_t size;
+
+	orig_mask = CPU_ALLOC(nrcpus);
+	BUG_ON(!orig_mask);
+	size = CPU_ALLOC_SIZE(nrcpus);
+	CPU_ZERO_S(size, orig_mask);
+
+	if (sched_getaffinity(0, size, orig_mask))
+		goto err_out;
+
+	mask = CPU_ALLOC(nrcpus);
+	if (!mask)
+		goto err_out;
 
-	CPU_ZERO(&mask);
+	CPU_ZERO_S(size, mask);
 
 	if (target_cpu == -1) {
 		int cpu;
 
 		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &mask);
+			CPU_SET_S(cpu, size, mask);
 	} else {
-		BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
-		CPU_SET(target_cpu, &mask);
+		if (target_cpu < 0 || target_cpu >= g->p.nr_cpus)
+			goto err;
+
+		CPU_SET_S(target_cpu, size, mask);
 	}
 
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
+	if (sched_setaffinity(0, size, mask))
+		goto err;
+
+	/* The affinity is applied; mask is done with, only orig_mask is returned */
+	CPU_FREE(mask);
 
 	return orig_mask;
+
+err:
+	CPU_FREE(mask);
+err_out:
+	CPU_FREE(orig_mask);
+
+	/* A mask allocation, an affinity syscall or the CPU range check failed */
+	BUG_ON(-1);
+	return NULL;
 }
 
-static cpu_set_t bind_to_node(int target_node)
+/*
+ * Bind the calling task to the CPUs of target_node, or to CPUs
+ * 0..g->p.nr_cpus-1 when target_node is NUMA_NO_NODE.
+ *
+ * Returns the previous affinity mask, which the caller releases with
+ * CPU_FREE(). Every failure path runs into BUG_ON().
+ */
+static cpu_set_t *bind_to_node(int target_node)
 {
-	int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes;
-	cpu_set_t orig_mask, mask;
+	int nrcpus = numa_num_possible_cpus();
+	size_t size;
+	cpu_set_t *orig_mask, *mask;
 	int cpu;
-	int ret;
 
-	BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus);
-	BUG_ON(!cpus_per_node);
+	orig_mask = CPU_ALLOC(nrcpus);
+	BUG_ON(!orig_mask);
+	size = CPU_ALLOC_SIZE(nrcpus);
+	CPU_ZERO_S(size, orig_mask);
 
-	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
-	BUG_ON(ret);
+	if (sched_getaffinity(0, size, orig_mask))
+		goto err_out;
 
-	CPU_ZERO(&mask);
+	mask = CPU_ALLOC(nrcpus);
+	if (!mask)
+		goto err_out;
 
-	if (target_node == -1) {
+	CPU_ZERO_S(size, mask);
+
+	if (target_node == NUMA_NO_NODE) {
 		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &mask);
+			CPU_SET_S(cpu, size, mask);
 	} else {
-		int cpu_start = (target_node + 0) * cpus_per_node;
-		int cpu_stop  = (target_node + 1) * cpus_per_node;
+		struct bitmask *cpumask = numa_allocate_cpumask();
 
-		BUG_ON(cpu_stop > g->p.nr_cpus);
+		if (!cpumask)
+			goto err;
 
-		for (cpu = cpu_start; cpu < cpu_stop; cpu++)
-			CPU_SET(cpu, &mask);
+		if (!numa_node_to_cpus(target_node, cpumask)) {
+			for (cpu = 0; cpu < (int)cpumask->size; cpu++) {
+				if (numa_bitmask_isbitset(cpumask, cpu))
+					CPU_SET_S(cpu, size, mask);
+			}
+		}
+		numa_free_cpumask(cpumask);
 	}
 
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
+	if (sched_setaffinity(0, size, mask))
+		goto err;
+
+	/* The affinity is applied; mask is done with, only orig_mask is returned */
+	CPU_FREE(mask);
 
 	return orig_mask;
+
+err:
+	CPU_FREE(mask);
+err_out:
+	CPU_FREE(orig_mask);
+
+	/* A mask allocation or an affinity syscall failed */
+	BUG_ON(-1);
+	return NULL;
 }
 
-static void bind_to_cpumask(cpu_set_t mask)
+/*
+ * Apply mask as the affinity of the calling task. The mask is freed here
+ * only when that fails (right before BUG_ON()); otherwise the caller keeps it.
+ */
+static void bind_to_cpumask(cpu_set_t *mask)
 {
 	int ret;
+	size_t size = CPU_ALLOC_SIZE(numa_num_possible_cpus());
 
-	ret = sched_setaffinity(0, sizeof(mask), &mask);
-	BUG_ON(ret);
+	ret = sched_setaffinity(0, size, mask);
+	if (ret) {
+		CPU_FREE(mask);
+		BUG_ON(ret);
+	}
 }
 
 static void mempol_restore(void)
@@ -282,18 +393,22 @@ static void mempol_restore(void)
 
 static void bind_to_memnode(int node)
 {
-	unsigned long nodemask;
+	struct bitmask *node_mask;
 	int ret;
 
-	if (node == -1)
+	if (node == NUMA_NO_NODE)
 		return;
 
-	BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask));
-	nodemask = 1L << node;
+	node_mask = numa_allocate_nodemask();
+	BUG_ON(!node_mask);
 
-	ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
-	dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret);
+	numa_bitmask_clearall(node_mask);
+	numa_bitmask_setbit(node_mask, node);
 
+	ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + 1);
+	dprintf("binding to node %d, mask: %016lx => %d\n", node, *node_mask->maskp, ret);
+
+	numa_bitmask_free(node_mask);
 	BUG_ON(ret);
 }
 
@@ -310,7 +425,7 @@ do {							\
 static u8 *alloc_data(ssize_t bytes0, int map_flags,
 		      int init_zero, int init_cpu0, int thp, int init_random)
 {
-	cpu_set_t orig_mask;
+	cpu_set_t *orig_mask = NULL;
 	ssize_t bytes;
 	u8 *buf;
 	int ret;
@@ -320,8 +435,10 @@ static u8 *alloc_data(ssize_t bytes0, int map_flags,
 
 	/* Allocate and initialize all memory on CPU#0: */
 	if (init_cpu0) {
-		orig_mask = bind_to_node(0);
-		bind_to_memnode(0);
+		int node = numa_node_of_cpu(0);
+
+		orig_mask = bind_to_node(node);
+		bind_to_memnode(node);
 	}
 
 	bytes = bytes0 + HPSIZE;
@@ -366,6 +483,7 @@ static u8 *alloc_data(ssize_t bytes0, int map_flags,
 	/* Restore affinity: */
 	if (init_cpu0) {
 		bind_to_cpumask(orig_mask);
+		CPU_FREE(orig_mask);
 		mempol_restore();
 	}
 
@@ -408,18 +526,6 @@ static void * setup_private_data(ssize_t bytes)
 	return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0,  g->p.thp, g->p.init_random);
 }
 
-/*
- * Return a process-shared (global) mutex:
- */
-static void init_global_mutex(pthread_mutex_t *mutex)
-{
-	pthread_mutexattr_t attr;
-
-	pthread_mutexattr_init(&attr);
-	pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
-	pthread_mutex_init(mutex, &attr);
-}
-
 static int parse_cpu_list(const char *arg)
 {
 	p0.cpu_list_str = strdup(arg);
@@ -429,14 +535,65 @@ static int parse_cpu_list(const char *arg)
 	return 0;
 }
 
-static void parse_setup_cpu_list(void)
+/*
+ * Check whether a CPU is online, based on its sysfs entries.
+ *
+ * Returns:
+ *     1 -> CPU is online (or has no "online" file at all)
+ *     0 -> CPU is offline (or has no sysfs directory)
+ *    -1 -> the "online" file exists but could not be read
+ */
+static int is_cpu_online(unsigned int cpu)
+{
+	char *str;
+	size_t len;
+	char buf[256];
+	int status = -1;
+	struct stat statbuf;
+
+	snprintf(buf, sizeof(buf),
+		"/sys/devices/system/cpu/cpu%u", cpu);
+	if (stat(buf, &statbuf) != 0)
+		return 0;
+
+	/*
+	 * Check whether /sys/devices/system/cpu/cpuN/online exists.
+	 * cpu0 may lack this file, since it is generally not
+	 * expected to be turned off, and kernels built without
+	 * CONFIG_HOTPLUG_CPU do not provide it at all. In both
+	 * cases the CPU is treated as online.
+	 */
+	snprintf(buf, sizeof(buf),
+		"/sys/devices/system/cpu/cpu%u/online", cpu);
+	if (stat(buf, &statbuf) != 0)
+		return 1;
+
+	/*
+	 * Read the "online" file with sysfs__read_str(); note that
+	 * the path handed to it carries no "/sys/" prefix.
+	 * If the read fails, return -1. Otherwise return the
+	 * integer parsed from the file contents held in "str".
+	 */
+	snprintf(buf, sizeof(buf),
+		"devices/system/cpu/cpu%u/online", cpu);
+
+	if (sysfs__read_str(buf, &str, &len) < 0)
+		return status;
+
+	status = atoi(str);
+
+	free(str);
+	return status;
+}
+
+static int parse_setup_cpu_list(void)
 {
 	struct thread_data *td;
 	char *str0, *str;
 	int t;
 
 	if (!g->p.cpu_list_str)
-		return;
+		return 0;
 
 	dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
 
@@ -500,11 +657,21 @@ static void parse_setup_cpu_list(void)
 
 		dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);
 
-		BUG_ON(bind_cpu_0 < 0 || bind_cpu_0 >= g->p.nr_cpus);
-		BUG_ON(bind_cpu_1 < 0 || bind_cpu_1 >= g->p.nr_cpus);
+		if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) {
+			printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus);
+			return -1;
+		}
+
+		if (is_cpu_online(bind_cpu_0) != 1 || is_cpu_online(bind_cpu_1) != 1) {
+			printf("\nTest not applicable, bind_cpu_0 or bind_cpu_1 is offline\n");
+			return -1;
+		}
+
+		BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
 		BUG_ON(bind_cpu_0 > bind_cpu_1);
 
 		for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
+			size_t size = CPU_ALLOC_SIZE(g->p.nr_cpus);
 			int i;
 
 			for (i = 0; i < mul; i++) {
@@ -524,10 +691,15 @@ static void parse_setup_cpu_list(void)
 					tprintf("%2d", bind_cpu);
 				}
 
-				CPU_ZERO(&td->bind_cpumask);
+				td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus);
+				BUG_ON(!td->bind_cpumask);
+				CPU_ZERO_S(size, td->bind_cpumask);
 				for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
-					BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
-					CPU_SET(cpu, &td->bind_cpumask);
+					if (cpu < 0 || cpu >= g->p.nr_cpus) {
+						CPU_FREE(td->bind_cpumask);
+						BUG_ON(-1);
+					}
+					CPU_SET_S(cpu, size, td->bind_cpumask);
 				}
 				t++;
 			}
@@ -541,6 +713,7 @@ out:
 		printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
 
 	free(str0);
+	return 0;
 }
 
 static int parse_cpus_opt(const struct option *opt __maybe_unused,
@@ -561,14 +734,14 @@ static int parse_node_list(const char *arg)
 	return 0;
 }
 
-static void parse_setup_node_list(void)
+static int parse_setup_node_list(void)
 {
 	struct thread_data *td;
 	char *str0, *str;
 	int t;
 
 	if (!g->p.node_list_str)
-		return;
+		return 0;
 
 	dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
 
@@ -619,15 +792,19 @@ static void parse_setup_node_list(void)
 
 		dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);
 
-		BUG_ON(bind_node_0 < 0 || bind_node_0 >= g->p.nr_nodes);
-		BUG_ON(bind_node_1 < 0 || bind_node_1 >= g->p.nr_nodes);
+		if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) {
+			printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes);
+			return -1;
+		}
+
+		BUG_ON(bind_node_0 < 0 || bind_node_1 < 0);
 		BUG_ON(bind_node_0 > bind_node_1);
 
 		for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
 			int i;
 
 			for (i = 0; i < mul; i++) {
-				if (t >= g->p.nr_tasks) {
+				if (t >= g->p.nr_tasks || !node_has_cpus(bind_node)) {
 					printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node);
 					goto out;
 				}
@@ -651,6 +828,7 @@ out:
 		printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
 
 	free(str0);
+	return 0;
 }
 
 static int parse_nodes_opt(const struct option *opt __maybe_unused,
@@ -660,12 +838,8 @@ static int parse_nodes_opt(const struct option *opt __maybe_unused,
 		return -1;
 
 	return parse_node_list(arg);
-
-	return 0;
 }
 
-#define BIT(x) (1ul << x)
-
 static inline uint32_t lfsr_32(uint32_t lfsr)
 {
 	const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);
@@ -678,7 +852,7 @@ static inline uint32_t lfsr_32(uint32_t lfsr)
  * kernel (KSM, zero page, etc.) cannot optimize away RAM
  * accesses:
  */
-static inline u64 access_data(u64 *data __attribute__((unused)), u64 val)
+static inline u64 access_data(u64 *data, u64 val)
 {
 	if (g->p.data_reads)
 		val += *data;
@@ -726,7 +900,7 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val
 
 	if (g->p.data_rand_walk) {
 		u32 lfsr = nr + loop + val;
-		int j;
+		long j;
 
 		for (i = 0; i < words/1024; i++) {
 			long start, end;
@@ -744,12 +918,12 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val
 			}
 		}
 	} else if (!g->p.data_backwards || (nr + loop) & 1) {
+		/* Process data forwards: */
 
 		d0 = data + off;
 		d  = data + off + 1;
 		d1 = data + words;
 
-		/* Process data forwards: */
 		for (;;) {
 			if (unlikely(d >= d1))
 				d = data;
@@ -767,7 +941,6 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val
 		d  = data + off - 1;
 		d1 = data + words;
 
-		/* Process data forwards: */
 		for (;;) {
 			if (unlikely(d < data))
 				d = data + words-1;
@@ -793,8 +966,6 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
 	prctl(0, bytes_worked);
 }
 
-#define MAX_NR_NODES	64
-
 /*
  * Count the number of nodes a process's threads
  * are spread out on.
@@ -805,10 +976,15 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
  */
 static int count_process_nodes(int process_nr)
 {
-	char node_present[MAX_NR_NODES] = { 0, };
+	char *node_present;
 	int nodes;
 	int n, t;
 
+	node_present = (char *)malloc(g->p.nr_nodes * sizeof(char));
+	BUG_ON(!node_present);
+	for (nodes = 0; nodes < g->p.nr_nodes; nodes++)
+		node_present[nodes] = 0;
+
 	for (t = 0; t < g->p.nr_threads; t++) {
 		struct thread_data *td;
 		int task_nr;
@@ -818,14 +994,20 @@ static int count_process_nodes(int process_nr)
 		td = g->threads + task_nr;
 
 		node = numa_node_of_cpu(td->curr_cpu);
+		if (node < 0) /* curr_cpu was likely still -1 */ {
+			free(node_present);
+			return 0;
+		}
+
 		node_present[node] = 1;
 	}
 
 	nodes = 0;
 
-	for (n = 0; n < MAX_NR_NODES; n++)
+	for (n = 0; n < g->p.nr_nodes; n++)
 		nodes += node_present[n];
 
+	free(node_present);
 	return nodes;
 }
 
@@ -872,6 +1054,11 @@ static void calc_convergence_compression(int *strong)
 	for (p = 0; p < g->p.nr_proc; p++) {
 		unsigned int nodes = count_process_nodes(p);
 
+		if (!nodes) {
+			*strong = 0;
+			return;
+		}
+
 		nodes_min = min(nodes, nodes_min);
 		nodes_max = max(nodes, nodes_max);
 	}
@@ -889,7 +1076,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
 {
 	unsigned int loops_done_min, loops_done_max;
 	int process_groups;
-	int nodes[MAX_NR_NODES];
+	int *nodes;
 	int distance;
 	int nr_min;
 	int nr_max;
@@ -903,6 +1090,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
 	if (!g->p.show_convergence && !g->p.measure_convergence)
 		return;
 
+	nodes = (int *)malloc(g->p.nr_nodes * sizeof(int));
+	BUG_ON(!nodes);
 	for (node = 0; node < g->p.nr_nodes; node++)
 		nodes[node] = 0;
 
@@ -933,6 +1122,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
 	sum = 0;
 
 	for (node = 0; node < g->p.nr_nodes; node++) {
+		if (!is_node_present(node))
+			continue;
 		nr = nodes[node];
 		nr_min = min(nr, nr_min);
 		nr_max = max(nr, nr_max);
@@ -942,8 +1133,10 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
 
 	BUG_ON(sum > g->p.nr_tasks);
 
-	if (0 && (sum < g->p.nr_tasks))
+	if (0 && (sum < g->p.nr_tasks)) {
+		free(nodes);
 		return;
+	}
 
 	/*
 	 * Count the number of distinct process groups present
@@ -953,8 +1146,11 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
 	process_groups = 0;
 
 	for (node = 0; node < g->p.nr_nodes; node++) {
-		int processes = count_node_processes(node);
+		int processes;
 
+		if (!is_node_present(node))
+			continue;
+		processes = count_node_processes(node);
 		nr = nodes[node];
 		tprintf(" %2d/%-2d", nr, processes);
 
@@ -979,7 +1175,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
 	if (strong && process_groups == g->p.nr_proc) {
 		if (!*convergence) {
 			*convergence = runtime_ns_max;
-			tprintf(" (%6.1fs converged)\n", *convergence/1e9);
+			tprintf(" (%6.1fs converged)\n", *convergence / NSEC_PER_SEC);
 			if (g->p.measure_convergence) {
 				g->all_converged = true;
 				g->stop_work = true;
@@ -987,17 +1183,19 @@ static void calc_convergence(double runtime_ns_max, double *convergence)
 		}
 	} else {
 		if (*convergence) {
-			tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
+			tprintf(" (%6.1fs de-converged)", runtime_ns_max / NSEC_PER_SEC);
 			*convergence = 0;
 		}
 		tprintf("\n");
 	}
+
+	free(nodes);
 }
 
 static void show_summary(double runtime_ns_max, int l, double *convergence)
 {
 	tprintf("\r #  %5.1f%%  [%.1f mins]",
-		(double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
+		(double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max / NSEC_PER_SEC / 60.0);
 
 	calc_convergence(runtime_ns_max, convergence);
 
@@ -1021,9 +1219,10 @@ static void *worker_thread(void *__tdata)
 	u8 *global_data;
 	u8 *process_data;
 	u8 *thread_data;
-	u64 bytes_done;
+	u64 bytes_done, secs;
 	long work_done;
 	u32 l;
+	struct rusage rusage;
 
 	bind_to_cpumask(td->bind_cpumask);
 	bind_to_memnode(td->bind_node);
@@ -1050,19 +1249,22 @@ static void *worker_thread(void *__tdata)
 	}
 
 	if (g->p.serialize_startup) {
-		pthread_mutex_lock(&g->startup_mutex);
+		mutex_lock(&g->startup_mutex);
 		g->nr_tasks_started++;
-		pthread_mutex_unlock(&g->startup_mutex);
+		/* The last thread wakes the main process. */
+		if (g->nr_tasks_started == g->p.nr_tasks)
+			cond_signal(&g->startup_cond);
+
+		mutex_unlock(&g->startup_mutex);
 
 		/* Here we will wait for the main process to start us all at once: */
-		pthread_mutex_lock(&g->start_work_mutex);
+		mutex_lock(&g->start_work_mutex);
+		g->start_work = false;
 		g->nr_tasks_working++;
+		while (!g->start_work)
+			cond_wait(&g->start_work_cond, &g->start_work_mutex);
 
-		/* Last one wake the main process: */
-		if (g->nr_tasks_working == g->p.nr_tasks)
-			pthread_mutex_unlock(&g->startup_done_mutex);
-
-		pthread_mutex_unlock(&g->start_work_mutex);
+		mutex_unlock(&g->start_work_mutex);
 	}
 
 	gettimeofday(&start0, NULL);
@@ -1081,17 +1283,17 @@ static void *worker_thread(void *__tdata)
 		val += do_work(thread_data,  g->p.bytes_thread,  0,          1,		l, val);
 
 		if (g->p.sleep_usecs) {
-			pthread_mutex_lock(td->process_lock);
+			mutex_lock(td->process_lock);
 			usleep(g->p.sleep_usecs);
-			pthread_mutex_unlock(td->process_lock);
+			mutex_unlock(td->process_lock);
 		}
 		/*
 		 * Amount of work to be done under a process-global lock:
 		 */
 		if (g->p.bytes_process_locked) {
-			pthread_mutex_lock(td->process_lock);
+			mutex_lock(td->process_lock);
 			val += do_work(process_data, g->p.bytes_process_locked, thread_nr,  g->p.nr_threads,	l, val);
-			pthread_mutex_unlock(td->process_lock);
+			mutex_unlock(td->process_lock);
 		}
 
 		work_done = g->p.bytes_global + g->p.bytes_process +
@@ -1110,7 +1312,7 @@ static void *worker_thread(void *__tdata)
 		/* Check whether our max runtime timed out: */
 		if (g->p.nr_secs) {
 			timersub(&stop, &start0, &diff);
-			if (diff.tv_sec >= g->p.nr_secs) {
+			if ((u32)diff.tv_sec >= g->p.nr_secs) {
 				g->stop_work = true;
 				break;
 			}
@@ -1125,7 +1327,7 @@ static void *worker_thread(void *__tdata)
 		 * by migrating to CPU#0:
 		 */
 		if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
-			cpu_set_t orig_mask;
+			cpu_set_t *orig_mask;
 			int target_cpu;
 			int this_cpu;
 
@@ -1149,15 +1351,16 @@ static void *worker_thread(void *__tdata)
 				printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
 
 			bind_to_cpumask(orig_mask);
+			CPU_FREE(orig_mask);
 		}
 
 		if (details >= 3) {
 			timersub(&stop, &start, &diff);
-			runtime_ns_max = diff.tv_sec * 1000000000;
-			runtime_ns_max += diff.tv_usec * 1000;
+			runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
+			runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;
 
 			if (details >= 0) {
-				printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016lx]\n",
+				printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n",
 					process_nr, thread_nr, runtime_ns_max / bytes_done, val);
 			}
 			fflush(stdout);
@@ -1166,22 +1369,30 @@ static void *worker_thread(void *__tdata)
 			continue;
 
 		timersub(&stop, &start0, &diff);
-		runtime_ns_max = diff.tv_sec * 1000000000ULL;
-		runtime_ns_max += diff.tv_usec * 1000ULL;
+		runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
+		runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;
 
 		show_summary(runtime_ns_max, l, &convergence);
 	}
 
 	gettimeofday(&stop, NULL);
 	timersub(&stop, &start0, &diff);
-	td->runtime_ns = diff.tv_sec * 1000000000ULL;
-	td->runtime_ns += diff.tv_usec * 1000ULL;
+	td->runtime_ns = diff.tv_sec * NSEC_PER_SEC;
+	td->runtime_ns += diff.tv_usec * NSEC_PER_USEC;
+	secs = td->runtime_ns / NSEC_PER_SEC;
+	td->speed_gbs = secs ? bytes_done / secs / 1e9 : 0;
+
+	getrusage(RUSAGE_THREAD, &rusage);
+	td->system_time_ns = rusage.ru_stime.tv_sec * NSEC_PER_SEC;
+	td->system_time_ns += rusage.ru_stime.tv_usec * NSEC_PER_USEC;
+	td->user_time_ns = rusage.ru_utime.tv_sec * NSEC_PER_SEC;
+	td->user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC;
 
 	free_data(thread_data, g->p.bytes_thread);
 
-	pthread_mutex_lock(&g->stop_work_mutex);
+	mutex_lock(&g->stop_work_mutex);
 	g->bytes_done += bytes_done;
-	pthread_mutex_unlock(&g->stop_work_mutex);
+	mutex_unlock(&g->stop_work_mutex);
 
 	return NULL;
 }
@@ -1191,7 +1402,7 @@ static void *worker_thread(void *__tdata)
  */
 static void worker_process(int process_nr)
 {
-	pthread_mutex_t process_lock;
+	struct mutex process_lock;
 	struct thread_data *td;
 	pthread_t *pthreads;
 	u8 *process_data;
@@ -1199,7 +1410,7 @@ static void worker_process(int process_nr)
 	int ret;
 	int t;
 
-	pthread_mutex_init(&process_lock, NULL);
+	mutex_init(&process_lock);
 	set_taskname("process %d", process_nr);
 
 	/*
@@ -1252,7 +1463,7 @@ static void print_summary(void)
 
 	printf("\n ###\n");
 	printf(" # %d %s will execute (on %d nodes, %d CPUs):\n",
-		g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", g->p.nr_nodes, g->p.nr_cpus);
+		g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", nr_numa_nodes(), g->p.nr_cpus);
 	printf(" #      %5dx %5ldMB global  shared mem operations\n",
 			g->p.nr_loops, g->p.bytes_global/1024/1024);
 	printf(" #      %5dx %5ldMB process shared mem operations\n",
@@ -1274,21 +1485,31 @@ static void init_thread_data(void)
 
 	for (t = 0; t < g->p.nr_tasks; t++) {
 		struct thread_data *td = g->threads + t;
+		size_t cpuset_size = CPU_ALLOC_SIZE(g->p.nr_cpus);
 		int cpu;
 
 		/* Allow all nodes by default: */
-		td->bind_node = -1;
+		td->bind_node = NUMA_NO_NODE;
 
 		/* Allow all CPUs by default: */
-		CPU_ZERO(&td->bind_cpumask);
+		td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus);
+		BUG_ON(!td->bind_cpumask);
+		CPU_ZERO_S(cpuset_size, td->bind_cpumask);
 		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
-			CPU_SET(cpu, &td->bind_cpumask);
+			CPU_SET_S(cpu, cpuset_size, td->bind_cpumask);
 	}
 }
 
 static void deinit_thread_data(void)
 {
 	ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
+	int t;
+
+	/* Free the bind_cpumask allocated for thread_data */
+	for (t = 0; t < g->p.nr_tasks; t++) {
+		struct thread_data *td = g->threads + t;
+		CPU_FREE(td->bind_cpumask);
+	}
 
 	free_data(g->threads, size);
 }
@@ -1305,9 +1526,9 @@ static int init(void)
 	g->p.nr_nodes = numa_max_node() + 1;
 
 	/* char array in count_process_nodes(): */
-	BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);
+	BUG_ON(g->p.nr_nodes < 0);
 
-	if (g->p.show_quiet && !g->p.show_details)
+	if (quiet && !g->p.show_details)
 		g->p.show_details = -1;
 
 	/* Some memory should be specified: */
@@ -1348,16 +1569,17 @@ static int init(void)
 	g->data = setup_shared_data(g->p.bytes_global);
 
 	/* Startup serialization: */
-	init_global_mutex(&g->start_work_mutex);
-	init_global_mutex(&g->startup_mutex);
-	init_global_mutex(&g->startup_done_mutex);
-	init_global_mutex(&g->stop_work_mutex);
+	mutex_init_pshared(&g->start_work_mutex);
+	cond_init_pshared(&g->start_work_cond);
+	mutex_init_pshared(&g->startup_mutex);
+	cond_init_pshared(&g->startup_cond);
+	mutex_init_pshared(&g->stop_work_mutex);
 
 	init_thread_data();
 
 	tprintf("#\n");
-	parse_setup_cpu_list();
-	parse_setup_node_list();
+	if (parse_setup_cpu_list() || parse_setup_node_list())
+		return -1;
 	tprintf("#\n");
 
 	print_summary();
@@ -1385,7 +1607,7 @@ static void print_res(const char *name, double val,
 	if (!name)
 		name = "main,";
 
-	if (g->p.show_quiet)
+	if (!quiet)
 		printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
 	else
 		printf(" %14.3f %s\n", val, txt_long);
@@ -1402,7 +1624,7 @@ static int __bench_numa(const char *name)
 	double runtime_sec_min;
 	int wait_stat;
 	double bytes;
-	int i, t;
+	int i, t, p;
 
 	if (init())
 		return -1;
@@ -1410,9 +1632,6 @@ static int __bench_numa(const char *name)
 	pids = zalloc(g->p.nr_proc * sizeof(*pids));
 	pid = -1;
 
-	/* All threads try to acquire it, this way we can wait for them to start up: */
-	pthread_mutex_lock(&g->start_work_mutex);
-
 	if (g->p.serialize_startup) {
 		tprintf(" #\n");
 		tprintf(" # Startup synchronization: ..."); fflush(stdout);
@@ -1434,36 +1653,47 @@ static int __bench_numa(const char *name)
 		pids[i] = pid;
 
 	}
-	/* Wait for all the threads to start up: */
-	while (g->nr_tasks_started != g->p.nr_tasks)
-		usleep(1000);
-
-	BUG_ON(g->nr_tasks_started != g->p.nr_tasks);
 
 	if (g->p.serialize_startup) {
+		bool threads_ready = false;
 		double startup_sec;
 
-		pthread_mutex_lock(&g->startup_done_mutex);
-
-		/* This will start all threads: */
-		pthread_mutex_unlock(&g->start_work_mutex);
-
-		/* This mutex is locked - the last started thread will wake us: */
-		pthread_mutex_lock(&g->startup_done_mutex);
+		/*
+		 * Wait for all the threads to start up. The last thread will
+		 * signal this process.
+		 */
+		mutex_lock(&g->startup_mutex);
+		while (g->nr_tasks_started != g->p.nr_tasks)
+			cond_wait(&g->startup_cond, &g->startup_mutex);
+
+		mutex_unlock(&g->startup_mutex);
+
+		/* Wait for all threads to be at the start_work_cond. */
+		while (!threads_ready) {
+			mutex_lock(&g->start_work_mutex);
+			threads_ready = (g->nr_tasks_working == g->p.nr_tasks);
+			mutex_unlock(&g->start_work_mutex);
+			if (!threads_ready)
+				usleep(1);
+		}
 
 		gettimeofday(&stop, NULL);
 
 		timersub(&stop, &start, &diff);
 
-		startup_sec = diff.tv_sec * 1000000000.0;
-		startup_sec += diff.tv_usec * 1000.0;
-		startup_sec /= 1e9;
+		startup_sec = diff.tv_sec * NSEC_PER_SEC;
+		startup_sec += diff.tv_usec * NSEC_PER_USEC;
+		startup_sec /= NSEC_PER_SEC;
 
 		tprintf(" threads initialized in %.6f seconds.\n", startup_sec);
 		tprintf(" #\n");
 
 		start = stop;
-		pthread_mutex_unlock(&g->startup_done_mutex);
+		/* Start all threads running. */
+		mutex_lock(&g->start_work_mutex);
+		g->start_work = true;
+		mutex_unlock(&g->start_work_mutex);
+		cond_broadcast(&g->start_work_cond);
 	} else {
 		gettimeofday(&start, NULL);
 	}
@@ -1496,14 +1726,14 @@ static int __bench_numa(const char *name)
 	tprintf("\n ###\n");
 	tprintf("\n");
 
-	runtime_sec_max = diff.tv_sec * 1000000000.0;
-	runtime_sec_max += diff.tv_usec * 1000.0;
-	runtime_sec_max /= 1e9;
+	runtime_sec_max = diff.tv_sec * NSEC_PER_SEC;
+	runtime_sec_max += diff.tv_usec * NSEC_PER_USEC;
+	runtime_sec_max /= NSEC_PER_SEC;
 
-	runtime_sec_min = runtime_ns_min/1e9;
+	runtime_sec_min = runtime_ns_min / NSEC_PER_SEC;
 
 	bytes = g->bytes_done;
-	runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9;
+	runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / NSEC_PER_SEC;
 
 	if (g->p.measure_convergence) {
 		print_res(name, runtime_sec_max,
@@ -1529,7 +1759,7 @@ static int __bench_numa(const char *name)
 	print_res(name, bytes / 1e9,
 		"GB,", "data-total",		"GB data processed, total");
 
-	print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks),
+	print_res(name, runtime_sec_max * NSEC_PER_SEC / (bytes / g->p.nr_tasks),
 		"nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime");
 
 	print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max,
@@ -1538,6 +1768,24 @@ static int __bench_numa(const char *name)
 	print_res(name, bytes / runtime_sec_max / 1e9,
 		"GB/sec,", "total-speed",	"GB/sec total speed");
 
+	if (g->p.show_details >= 2) {
+		char tname[14 + 2 * 11 + 1];
+		struct thread_data *td;
+		for (p = 0; p < g->p.nr_proc; p++) {
+			for (t = 0; t < g->p.nr_threads; t++) {
+				memset(tname, 0, sizeof(tname));
+				td = g->threads + p*g->p.nr_threads + t;
+				snprintf(tname, sizeof(tname), "process%d:thread%d", p, t);
+				print_res(tname, td->speed_gbs,
+					"GB/sec",	"thread-speed", "GB/sec/thread speed");
+				print_res(tname, td->system_time_ns / NSEC_PER_SEC,
+					"secs",	"thread-system-time", "system CPU time/thread");
+				print_res(tname, td->user_time_ns / NSEC_PER_SEC,
+					"secs",	"thread-user-time", "user CPU time/thread");
+			}
+		}
+	}
+
 	free(pids);
 
 	deinit();
@@ -1583,6 +1831,11 @@ static void init_params(struct params *p, const char *name, int argc, const char
 	p->data_rand_walk		= true;
 	p->nr_loops			= -1;
 	p->init_random			= true;
+	p->mb_global_str		= "1";
+	p->nr_proc			= 1;
+	p->nr_threads			= 1;
+	p->nr_secs			= 5;
+	p->run_all			= argc == 1;
 }
 
 static int run_bench_numa(const char *name, const char **argv)
@@ -1600,7 +1853,6 @@ static int run_bench_numa(const char *name, const char **argv)
 	return 0;
 
 err:
-	usage_with_options(numa_usage, options);
 	return -1;
 }
 
@@ -1620,12 +1872,12 @@ err:
  */
 static const char *tests[][MAX_ARGS] = {
    /* Basic single-stream NUMA bandwidth measurements: */
-   { "RAM-bw-local,",	  "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
+   { "RAM-bw-local,",     "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
 			  "-C" ,   "0", "-M",   "0", OPT_BW_RAM },
    { "RAM-bw-local-NOTHP,",
 			  "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
 			  "-C" ,   "0", "-M",   "0", OPT_BW_RAM_NOTHP },
-   { "RAM-bw-remote,",	  "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
+   { "RAM-bw-remote,",    "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
 			  "-C" ,   "0", "-M",   "1", OPT_BW_RAM },
 
    /* 2-stream NUMA bandwidth measurements: */
@@ -1642,7 +1894,7 @@ static const char *tests[][MAX_ARGS] = {
    { " 1x3-convergence,", "mem",  "-p",  "1", "-t",  "3", "-P",  "512", OPT_CONV },
    { " 1x4-convergence,", "mem",  "-p",  "1", "-t",  "4", "-P",  "512", OPT_CONV },
    { " 1x6-convergence,", "mem",  "-p",  "1", "-t",  "6", "-P", "1020", OPT_CONV },
-   { " 2x3-convergence,", "mem",  "-p",  "3", "-t",  "3", "-P", "1020", OPT_CONV },
+   { " 2x3-convergence,", "mem",  "-p",  "2", "-t",  "3", "-P", "1020", OPT_CONV },
    { " 3x3-convergence,", "mem",  "-p",  "3", "-t",  "3", "-P", "1020", OPT_CONV },
    { " 4x4-convergence,", "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_CONV },
    { " 4x4-convergence-NOTHP,",
@@ -1667,24 +1919,24 @@ static const char *tests[][MAX_ARGS] = {
 			  "mem",  "-p",  "8", "-t",  "1", "-P", " 512", OPT_BW_NOTHP },
    { "16x1-bw-process,",  "mem",  "-p", "16", "-t",  "1", "-P",  "256", OPT_BW },
 
-   { " 4x1-bw-thread,",	  "mem",  "-p",  "1", "-t",  "4", "-T",  "256", OPT_BW },
-   { " 8x1-bw-thread,",	  "mem",  "-p",  "1", "-t",  "8", "-T",  "256", OPT_BW },
-   { "16x1-bw-thread,",   "mem",  "-p",  "1", "-t", "16", "-T",  "128", OPT_BW },
-   { "32x1-bw-thread,",   "mem",  "-p",  "1", "-t", "32", "-T",   "64", OPT_BW },
+   { " 1x4-bw-thread,",   "mem",  "-p",  "1", "-t",  "4", "-T",  "256", OPT_BW },
+   { " 1x8-bw-thread,",   "mem",  "-p",  "1", "-t",  "8", "-T",  "256", OPT_BW },
+   { "1x16-bw-thread,",   "mem",  "-p",  "1", "-t", "16", "-T",  "128", OPT_BW },
+   { "1x32-bw-thread,",   "mem",  "-p",  "1", "-t", "32", "-T",   "64", OPT_BW },
 
-   { " 2x3-bw-thread,",	  "mem",  "-p",  "2", "-t",  "3", "-P",  "512", OPT_BW },
-   { " 4x4-bw-thread,",	  "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_BW },
-   { " 4x6-bw-thread,",	  "mem",  "-p",  "4", "-t",  "6", "-P",  "512", OPT_BW },
-   { " 4x8-bw-thread,",	  "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_BW },
-   { " 4x8-bw-thread-NOTHP,",
+   { " 2x3-bw-process,",  "mem",  "-p",  "2", "-t",  "3", "-P",  "512", OPT_BW },
+   { " 4x4-bw-process,",  "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_BW },
+   { " 4x6-bw-process,",  "mem",  "-p",  "4", "-t",  "6", "-P",  "512", OPT_BW },
+   { " 4x8-bw-process,",  "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_BW },
+   { " 4x8-bw-process-NOTHP,",
 			  "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_BW_NOTHP },
-   { " 3x3-bw-thread,",	  "mem",  "-p",  "3", "-t",  "3", "-P",  "512", OPT_BW },
-   { " 5x5-bw-thread,",	  "mem",  "-p",  "5", "-t",  "5", "-P",  "512", OPT_BW },
+   { " 3x3-bw-process,",  "mem",  "-p",  "3", "-t",  "3", "-P",  "512", OPT_BW },
+   { " 5x5-bw-process,",  "mem",  "-p",  "5", "-t",  "5", "-P",  "512", OPT_BW },
 
-   { "2x16-bw-thread,",   "mem",  "-p",  "2", "-t", "16", "-P",  "512", OPT_BW },
-   { "1x32-bw-thread,",   "mem",  "-p",  "1", "-t", "32", "-P", "2048", OPT_BW },
+   { "2x16-bw-process,",  "mem",  "-p",  "2", "-t", "16", "-P",  "512", OPT_BW },
+   { "1x32-bw-process,",  "mem",  "-p",  "1", "-t", "32", "-P", "2048", OPT_BW },
 
-   { "numa02-bw,",	  "mem",  "-p",  "1", "-t", "32", "-T",   "32", OPT_BW },
+   { "numa02-bw,",        "mem",  "-p",  "1", "-t", "32", "-T",   "32", OPT_BW },
    { "numa02-bw-NOTHP,",  "mem",  "-p",  "1", "-t", "32", "-T",   "32", OPT_BW_NOTHP },
    { "numa01-bw-thread,", "mem",  "-p",  "2", "-t", "16", "-T",  "192", OPT_BW },
    { "numa01-bw-thread-NOTHP,",
@@ -1701,8 +1953,7 @@ static int bench_all(void)
 	BUG_ON(ret < 0);
 
 	for (i = 0; i < nr; i++) {
-		if (run_bench_numa(tests[i][0], tests[i] + 1))
-			return -1;
+		run_bench_numa(tests[i][0], tests[i] + 1);
 	}
 
 	printf("\n");
@@ -1710,7 +1961,7 @@ static int bench_all(void)
 	return 0;
 }
 
-int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused)
+int bench_numa(int argc, const char **argv)
 {
 	init_params(&p0, "main,", argc, argv);
 	argc = parse_options(argc, argv, options, bench_numa_usage, 0);
diff --git a/tools/perf/bench/pmu-scan.c b/tools/perf/bench/pmu-scan.c
new file mode 100644
index 000000000000..14a464ad8cea
--- /dev/null
+++ b/tools/perf/bench/pmu-scan.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Benchmark scanning sysfs files for PMU information.
+ *
+ * Copyright 2023 Google LLC.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bench.h"
+#include "util/debug.h"
+#include "util/pmu.h"
+#include "util/pmus.h"
+#include "util/stat.h"
+#include <linux/atomic.h>
+#include <linux/err.h>
+#include <linux/time64.h>
+#include <subcmd/parse-options.h>
+
+static unsigned int iterations = 100;
+
+/* Snapshot of one PMU taken by save_result() and compared by check_result(). */
+struct pmu_scan_result {
+	char *name;
+	int nr_aliases;
+	int nr_formats;
+	int nr_caps;
+	bool is_core;
+};
+
+static const struct option options[] = {
+	OPT_UINTEGER('i', "iterations", &iterations,
+		"Number of iterations used to compute average"),
+	OPT_END()
+};
+
+static const char *const bench_usage[] = {
+	"perf bench internals pmu-scan <options>",
+	NULL
+};
+
+static int nr_pmus;
+static struct pmu_scan_result *results;
+
+/*
+ * Scan every PMU once and record its name, core flag and the number of
+ * caps, event aliases and formats in the results[] array.
+ *
+ * perf_pmus__destroy() is called before returning, on success and failure.
+ * Returns 0 on success or -ENOMEM; entries recorded before a failure stay
+ * in results[] and are released by delete_result().
+ */
+static int save_result(void)
+{
+	struct perf_pmu *pmu = NULL;
+	struct list_head *list;
+	struct pmu_scan_result *r;
+	int ret = 0;
+
+	while ((pmu = perf_pmus__scan(pmu)) != NULL) {
+		r = realloc(results, (nr_pmus + 1) * sizeof(*r));
+		if (r == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		results = r;
+		r = results + nr_pmus;
+
+		r->name = strdup(pmu->name);
+		if (r->name == NULL) {
+			/* nr_pmus is not bumped, so this slot is never used. */
+			ret = -ENOMEM;
+			break;
+		}
+		r->is_core = pmu->is_core;
+		r->nr_caps = pmu->nr_caps;
+
+		r->nr_aliases = perf_pmu__num_events(pmu);
+
+		r->nr_formats = 0;
+		list_for_each(list, &pmu->format)
+			r->nr_formats++;
+
+		pr_debug("pmu[%d] name=%s, nr_caps=%d, nr_aliases=%d, nr_formats=%d\n",
+			nr_pmus, r->name, r->nr_caps, r->nr_aliases, r->nr_formats);
+		nr_pmus++;
+	}
+
+	perf_pmus__destroy();
+	return ret;
+}
+
+/*
+ * Compare the PMUs found by the latest scan against the saved results[].
+ * With core_only set, entries not flagged is_core are skipped.
+ *
+ * Returns 0 when every checked PMU matches, -1 on the first mismatch.
+ */
+static int check_result(bool core_only)
+{
+	struct pmu_scan_result *r;
+	struct perf_pmu *pmu;
+	struct list_head *list;
+	int nr;
+
+	for (int i = 0; i < nr_pmus; i++) {
+		r = &results[i];
+		if (core_only && !r->is_core)
+			continue;
+
+		pmu = perf_pmus__find(r->name);
+		if (pmu == NULL) {
+			pr_err("Cannot find PMU %s\n", r->name);
+			return -1;
+		}
+
+		if (pmu->nr_caps != (u32)r->nr_caps) {
+			pr_err("Unmatched number of event caps in %s: expect %d vs got %d\n",
+				pmu->name, r->nr_caps, pmu->nr_caps);
+			return -1;
+		}
+
+		nr = perf_pmu__num_events(pmu);
+		if (nr != r->nr_aliases) {
+			pr_err("Unmatched number of event aliases in %s: expect %d vs got %d\n",
+				pmu->name, r->nr_aliases, nr);
+			return -1;
+		}
+
+		nr = 0;
+		list_for_each(list, &pmu->format)
+			nr++;
+		if (nr != r->nr_formats) {
+			pr_err("Unmatched number of event formats in %s: expect %d vs got %d\n",
+				pmu->name, r->nr_formats, nr);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/* Free every saved PMU name and the results[] array itself. */
+static void delete_result(void)
+{
+	for (int i = 0; i < nr_pmus; i++)
+		free(results[i].name);
+	free(results);
+
+	results = NULL;
+	nr_pmus = 0;
+}
+
+/*
+ * Time 'iterations' PMU scans, first for core PMUs only and then for all
+ * PMUs, verifying each scan against the reference taken by save_result()
+ * and printing the average and standard deviation of each pass.
+ *
+ * Returns 0 on success, -1 if the reference could not be saved or a scan
+ * did not match it.
+ */
+static int run_pmu_scan(void)
+{
+	struct stats stats;
+	struct timeval start, end, diff;
+	double time_average, time_stddev;
+	u64 runtime_us;
+	int ret = 0;
+
+	pr_info("Computing performance of sysfs PMU event scan for %u times\n",
+		iterations);
+
+	if (save_result() < 0) {
+		pr_err("Failed to initialize PMU scan result\n");
+		ret = -1;
+		goto out;
+	}
+
+	for (int j = 0; j < 2; j++) {
+		bool core_only = (j == 0);
+
+		/* Each pass reports its own statistics. */
+		init_stats(&stats);
+
+		for (unsigned int i = 0; i < iterations; i++) {
+			gettimeofday(&start, NULL);
+			if (core_only)
+				perf_pmus__scan_core(NULL);
+			else
+				perf_pmus__scan(NULL);
+			gettimeofday(&end, NULL);
+			timersub(&end, &start, &diff);
+			runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+			update_stats(&stats, runtime_us);
+
+			ret = check_result(core_only);
+			perf_pmus__destroy();
+			if (ret < 0)
+				goto out;
+		}
+		time_average = avg_stats(&stats);
+		time_stddev = stddev_stats(&stats);
+		pr_info("  Average%s PMU scanning took: %.3f usec (+- %.3f usec)\n",
+			core_only ? " core" : "", time_average, time_stddev);
+	}
+out:
+	delete_result();
+	return ret;
+}
+
+/* Entry point for "perf bench internals pmu-scan". */
+int bench_pmu_scan(int argc, const char **argv)
+{
+	int err = 0;
+
+	argc = parse_options(argc, argv, options, bench_usage, 0);
+	if (argc) {
+		usage_with_options(bench_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	err = run_pmu_scan();
+
+	return err;
+}
diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
index cc1190a0849b..93dcd9dba3d0 100644
--- a/tools/perf/bench/sched-messaging.c
+++ b/tools/perf/bench/sched-messaging.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *
  * sched-messaging.c
@@ -9,10 +10,7 @@
  *
  */
 
-#include "../perf.h"
-#include "../util/util.h"
-#include "../util/parse-options.h"
-#include "../builtin.h"
+#include <subcmd/parse-options.h>
 #include "bench.h"
 
 /* Test groups of 20 processes spraying to 20 receivers */
@@ -26,35 +24,44 @@
 #include <sys/socket.h>
 #include <sys/wait.h>
 #include <sys/time.h>
-#include <sys/poll.h>
+#include <poll.h>
 #include <limits.h>
+#include <err.h>
+#include <linux/list.h>
+#include <linux/time64.h>
 
 #define DATASIZE 100
 
 static bool use_pipes = false;
-static unsigned int loops = 100;
+static unsigned int nr_loops = 100;
 static bool thread_mode = false;
 static unsigned int num_groups = 10;
+static unsigned int total_children = 0;
+static struct list_head sender_contexts = LIST_HEAD_INIT(sender_contexts);
+static struct list_head receiver_contexts = LIST_HEAD_INIT(receiver_contexts);
 
 struct sender_context {
+	struct list_head list;
 	unsigned int num_fds;
 	int ready_out;
 	int wakefd;
-	int out_fds[0];
+	int out_fds[];
 };
 
 struct receiver_context {
+	struct list_head list;
 	unsigned int num_packets;
 	int in_fds[2];
 	int ready_out;
 	int wakefd;
 };
 
-static void barf(const char *msg)
-{
-	fprintf(stderr, "%s (error: %s)\n", msg, strerror(errno));
-	exit(1);
-}
+union messaging_worker {
+	pthread_t thread;
+	pid_t pid;
+};
+
+static union messaging_worker *worker_tab;
 
 static void fdpair(int fds[2])
 {
@@ -66,42 +73,42 @@ static void fdpair(int fds[2])
 			return;
 	}
 
-	barf(use_pipes ? "pipe()" : "socketpair()");
+	err(EXIT_FAILURE, use_pipes ? "pipe()" : "socketpair()");
 }
 
 /* Block until we're ready to go */
 static void ready(int ready_out, int wakefd)
 {
-	char dummy;
 	struct pollfd pollfd = { .fd = wakefd, .events = POLLIN };
 
 	/* Tell them we're ready. */
-	if (write(ready_out, &dummy, 1) != 1)
-		barf("CLIENT: ready write");
+	if (write(ready_out, "R", 1) != 1)
+		err(EXIT_FAILURE, "CLIENT: ready write");
 
 	/* Wait for "GO" signal */
 	if (poll(&pollfd, 1, -1) != 1)
-		barf("poll");
+		err(EXIT_FAILURE, "poll");
 }
 
-/* Sender sprays loops messages down each file descriptor */
+/* Sender sprays nr_loops messages down each file descriptor */
 static void *sender(struct sender_context *ctx)
 {
 	char data[DATASIZE];
 	unsigned int i, j;
 
 	ready(ctx->ready_out, ctx->wakefd);
+	memset(data, 'S', sizeof(data));
 
 	/* Now pump to every receiver. */
-	for (i = 0; i < loops; i++) {
+	for (i = 0; i < nr_loops; i++) {
 		for (j = 0; j < ctx->num_fds; j++) {
 			int ret, done = 0;
 
 again:
 			ret = write(ctx->out_fds[j], data + done,
-				    sizeof(data)-done);
+				    sizeof(data) - done);
 			if (ret < 0)
-				barf("SENDER: write");
+				err(EXIT_FAILURE, "SENDER: write");
 			done += ret;
 			if (done < DATASIZE)
 				goto again;
@@ -131,7 +138,7 @@ static void *receiver(struct receiver_context* ctx)
 again:
 		ret = read(ctx->in_fds[0], data + done, DATASIZE - done);
 		if (ret < 0)
-			barf("SERVER: read");
+			err(EXIT_FAILURE, "SERVER: read");
 		done += ret;
 		if (done < DATASIZE)
 			goto again;
@@ -140,48 +147,51 @@ again:
 	return NULL;
 }
 
-static pthread_t create_worker(void *ctx, void *(*func)(void *))
+static void create_thread_worker(union messaging_worker *worker,
+				 void *ctx, void *(*func)(void *))
 {
 	pthread_attr_t attr;
-	pthread_t childid;
-	int err;
-
-	if (!thread_mode) {
-		/* process mode */
-		/* Fork the receiver. */
-		switch (fork()) {
-		case -1:
-			barf("fork()");
-			break;
-		case 0:
-			(*func) (ctx);
-			exit(0);
-			break;
-		default:
-			break;
-		}
-
-		return (pthread_t)0;
-	}
+	int ret;
 
 	if (pthread_attr_init(&attr) != 0)
-		barf("pthread_attr_init:");
+		err(EXIT_FAILURE, "pthread_attr_init:");
 
 #ifndef __ia64__
 	if (pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN) != 0)
-		barf("pthread_attr_setstacksize");
+		err(EXIT_FAILURE, "pthread_attr_setstacksize");
 #endif
 
-	err = pthread_create(&childid, &attr, func, ctx);
-	if (err != 0) {
-		fprintf(stderr, "pthread_create failed: %s (%d)\n",
-			strerror(err), err);
-		exit(-1);
+	ret = pthread_create(&worker->thread, &attr, func, ctx);
+	if (ret != 0) /* pthread_create() returns the error; errno is not set */
+		errx(EXIT_FAILURE, "pthread_create failed: %s", strerror(ret));
+
+	pthread_attr_destroy(&attr);
+}
+
+static void create_process_worker(union messaging_worker *worker,
+				  void *ctx, void *(*func)(void *))
+{
+	/* Fork the receiver. */
+	worker->pid = fork();
+
+	if (worker->pid == -1) {
+		err(EXIT_FAILURE, "fork()");
+	} else if (worker->pid == 0) {
+		(*func) (ctx);
+		exit(0);
 	}
-	return childid;
 }
 
-static void reap_worker(pthread_t id)
+static void create_worker(union messaging_worker *worker,
+			  void *ctx, void *(*func)(void *))
+{
+	if (!thread_mode) /* process mode forks a child, thread mode starts a pthread */
+		create_process_worker(worker, ctx, func);
+	else
+		create_thread_worker(worker, ctx, func);
+}
+
+static void reap_worker(union messaging_worker *worker)
 {
 	int proc_status;
 	void *thread_status;
@@ -192,41 +202,43 @@ static void reap_worker(pthread_t id)
 		if (!WIFEXITED(proc_status))
 			exit(1);
 	} else {
-		pthread_join(id, &thread_status);
+		pthread_join(worker->thread, &thread_status);
 	}
 }
 
 /* One group of senders and receivers */
-static unsigned int group(pthread_t *pth,
+static unsigned int group(union messaging_worker *worker,
 		unsigned int num_fds,
 		int ready_out,
 		int wakefd)
 {
 	unsigned int i;
-	struct sender_context *snd_ctx = malloc(sizeof(struct sender_context)
-			+ num_fds * sizeof(int));
+	struct sender_context *snd_ctx = malloc(sizeof(struct sender_context) +
+						num_fds * sizeof(int));
 
 	if (!snd_ctx)
-		barf("malloc()");
+		err(EXIT_FAILURE, "malloc()");
 
+	list_add(&snd_ctx->list, &sender_contexts);
 	for (i = 0; i < num_fds; i++) {
 		int fds[2];
 		struct receiver_context *ctx = malloc(sizeof(*ctx));
 
 		if (!ctx)
-			barf("malloc()");
+			err(EXIT_FAILURE, "malloc()");
 
+		list_add(&ctx->list, &receiver_contexts);
 
 		/* Create the pipe between client and server */
 		fdpair(fds);
 
-		ctx->num_packets = num_fds * loops;
+		ctx->num_packets = num_fds * nr_loops;
 		ctx->in_fds[0] = fds[0];
 		ctx->in_fds[1] = fds[1];
 		ctx->ready_out = ready_out;
 		ctx->wakefd = wakefd;
 
-		pth[i] = create_worker(ctx, (void *)receiver);
+		create_worker(worker + i, ctx, (void *)receiver);
 
 		snd_ctx->out_fds[i] = fds[1];
 		if (!thread_mode)
@@ -239,7 +251,7 @@ static unsigned int group(pthread_t *pth,
 		snd_ctx->wakefd = wakefd;
 		snd_ctx->num_fds = num_fds;
 
-		pth[num_fds+i] = create_worker(snd_ctx, (void *)sender);
+		create_worker(worker + num_fds + i, snd_ctx, (void *)sender);
 	}
 
 	/* Close the fds we have left */
@@ -251,13 +263,24 @@ static unsigned int group(pthread_t *pth,
 	return num_fds * 2;
 }
 
+static void sig_handler(int sig __maybe_unused)
+{
+	unsigned int i;
+
+	/*
+	 * On abnormal exit (a caught signal), kill all forked child processes.
+	 */
+	for (i = 0; i < total_children; i++)
+		kill(worker_tab[i].pid, SIGKILL);
+}
+
 static const struct option options[] = {
 	OPT_BOOLEAN('p', "pipe", &use_pipes,
 		    "Use pipe() instead of socketpair()"),
 	OPT_BOOLEAN('t', "thread", &thread_mode,
 		    "Be multi thread instead of multi process"),
 	OPT_UINTEGER('g', "group", &num_groups, "Specify number of groups"),
-	OPT_UINTEGER('l', "loop", &loops, "Specify number of loops"),
+	OPT_UINTEGER('l', "nr_loops", &nr_loops, "Specify the number of loops to run (default: 100)"),
 	OPT_END()
 };
 
@@ -266,45 +289,48 @@ static const char * const bench_sched_message_usage[] = {
 	NULL
 };
 
-int bench_sched_messaging(int argc, const char **argv,
-		    const char *prefix __maybe_unused)
+int bench_sched_messaging(int argc, const char **argv)
 {
-	unsigned int i, total_children;
+	unsigned int i;
 	struct timeval start, stop, diff;
 	unsigned int num_fds = 20;
 	int readyfds[2], wakefds[2];
 	char dummy;
-	pthread_t *pth_tab;
+	struct sender_context *pos, *n;
 
 	argc = parse_options(argc, argv, options,
 			     bench_sched_message_usage, 0);
 
-	pth_tab = malloc(num_fds * 2 * num_groups * sizeof(pthread_t));
-	if (!pth_tab)
-		barf("main:malloc()");
+	worker_tab = malloc(num_fds * 2 * num_groups * sizeof(union messaging_worker));
+	if (!worker_tab)
+		err(EXIT_FAILURE, "main:malloc()");
 
 	fdpair(readyfds);
 	fdpair(wakefds);
 
-	total_children = 0;
+	if (!thread_mode) {
+		signal(SIGINT, sig_handler);
+		signal(SIGTERM, sig_handler);
+	}
+
 	for (i = 0; i < num_groups; i++)
-		total_children += group(pth_tab+total_children, num_fds,
+		total_children += group(worker_tab + total_children, num_fds,
 					readyfds[1], wakefds[0]);
 
 	/* Wait for everyone to be ready */
 	for (i = 0; i < total_children; i++)
 		if (read(readyfds[0], &dummy, 1) != 1)
-			barf("Reading for readyfds");
+			err(EXIT_FAILURE, "Reading for readyfds");
 
 	gettimeofday(&start, NULL);
 
 	/* Kick them off */
 	if (write(wakefds[1], &dummy, 1) != 1)
-		barf("Writing to start them");
+		err(EXIT_FAILURE, "Writing to start them");
 
 	/* Reap them all */
 	for (i = 0; i < total_children; i++)
-		reap_worker(pth_tab[i]);
+		reap_worker(worker_tab + i);
 
 	gettimeofday(&stop, NULL);
 
@@ -318,12 +344,12 @@ int bench_sched_messaging(int argc, const char **argv,
 		       num_groups, num_groups * 2 * num_fds,
 		       thread_mode ? "threads" : "processes");
 		printf(" %14s: %lu.%03lu [sec]\n", "Total time",
-		       diff.tv_sec,
-		       (unsigned long) (diff.tv_usec/1000));
+		       (unsigned long) diff.tv_sec,
+		       (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
 		break;
 	case BENCH_FORMAT_SIMPLE:
-		printf("%lu.%03lu\n", diff.tv_sec,
-		       (unsigned long) (diff.tv_usec/1000));
+		printf("%lu.%03lu\n", (unsigned long) diff.tv_sec,
+		       (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
 		break;
 	default:
 		/* reaching here is something disaster */
@@ -332,5 +358,14 @@ int bench_sched_messaging(int argc, const char **argv,
 		break;
 	}
 
+	free(worker_tab);
+	list_for_each_entry_safe(pos, n, &sender_contexts, list) {
+		list_del_init(&pos->list);
+		free(pos);
+	}
+	list_for_each_entry_safe(pos, n, &receiver_contexts, list) {
+		list_del_init(&pos->list);
+		free(pos);
+	}
 	return 0;
 }
diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
index 69cfba8d4c6c..70139036d68f 100644
--- a/tools/perf/bench/sched-pipe.c
+++ b/tools/perf/bench/sched-pipe.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
 /*
  *
  * sched-pipe.c
@@ -7,33 +8,89 @@
  * Based on pipe-test-1m.c by Ingo Molnar <mingo@redhat.com>
  *  http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c
  * Ported to perf by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
- *
  */
-
-#include "../perf.h"
-#include "../util/util.h"
-#include "../util/parse-options.h"
-#include "../builtin.h"
+#include <subcmd/parse-options.h>
+#include <api/fs/fs.h>
 #include "bench.h"
+#include "util/cgroup.h"
 
 #include <unistd.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <signal.h>
 #include <sys/wait.h>
-#include <linux/unistd.h>
 #include <string.h>
 #include <errno.h>
+#include <fcntl.h>
 #include <assert.h>
+#include <sys/epoll.h>
 #include <sys/time.h>
 #include <sys/types.h>
+#include <sys/syscall.h>
+#include <linux/time64.h>
+
+#include <pthread.h>
+
+struct thread_data {
+	int			nr;		/* task index, passed to enter_cgroup() */
+	int			pipe_read;	/* fd this task reads from */
+	int			pipe_write;	/* fd this task writes to */
+	struct epoll_event      epoll_ev;	/* set up only when nonblocking */
+	int			epoll_fd;	/* created only when nonblocking */
+	bool			cgroup_failed;	/* set when enter_cgroup() failed */
+	pthread_t		pthread;	/* used only when threaded */
+};
 
 #define LOOPS_DEFAULT 1000000
-static int loops = LOOPS_DEFAULT;
+static	int			loops = LOOPS_DEFAULT;
+
+/* Use processes by default: */
+static bool			threaded;
+
+static bool			nonblocking;
+static char			*cgrp_names[2];
+static struct cgroup		*cgrps[2];
+
+static int parse_two_cgroups(const struct option *opt __maybe_unused,
+			     const char *str, int unset __maybe_unused)
+{
+	char *p = strdup(str);	/* writable copy of "A,B", split in place below */
+	char *q;
+	int ret = -1;	/* failure unless both names were duplicated */
+
+	if (p == NULL) {
+		fprintf(stderr, "memory allocation failure\n");
+		return -1;
+	}
+
+	q = strchr(p, ',');	/* only the first comma separates the two names */
+	if (q == NULL) {
+		fprintf(stderr, "it should have two cgroup names: %s\n", p);
+		goto out;
+	}
+	*q = '\0';	/* p is now the first name, q + 1 the second */
+
+	cgrp_names[0] = strdup(p);
+	cgrp_names[1] = strdup(q + 1);
+
+	if (cgrp_names[0] == NULL || cgrp_names[1] == NULL) {
+		fprintf(stderr, "memory allocation failure\n");
+		goto out;
+	}
+	ret = 0;
+
+out:
+	free(p);	/* the temporary copy is released on every path past strdup() */
+	return ret;
+}
 
 static const struct option options[] = {
-	OPT_INTEGER('l', "loop", &loops,
-		    "Specify number of loops"),
+	OPT_BOOLEAN('n', "nonblocking",	&nonblocking,	"Use non-blocking operations"),
+	OPT_INTEGER('l', "loop",	&loops,		"Specify number of loops"),
+	OPT_BOOLEAN('T', "threaded",	&threaded,	"Specify threads/process based task setup"),
+	OPT_CALLBACK('G', "cgroups", NULL, "SEND,RECV",
+		     "Put sender and receivers in given cgroups",
+		     parse_two_cgroups),
 	OPT_END()
 };
 
@@ -42,78 +99,223 @@ static const char * const bench_sched_pipe_usage[] = {
 	NULL
 };
 
-int bench_sched_pipe(int argc, const char **argv,
-		     const char *prefix __maybe_unused)
+static int enter_cgroup(int nr)
 {
+	char buf[32];
+	int fd, len, ret;
+	int saved_errno;
+	struct cgroup *cgrp;
+	pid_t pid;
+
+	if (cgrp_names[nr] == NULL)
+		return 0;	/* no cgroup name set for this slot: nothing to do */
+
+	if (cgrps[nr] == NULL) {	/* create the cgroup object on first use */
+		cgrps[nr] = cgroup__new(cgrp_names[nr], /*do_open=*/true);
+		if (cgrps[nr] == NULL)
+			goto err;
+	}
+	cgrp = cgrps[nr];
+
+	if (threaded)
+		pid = syscall(__NR_gettid);	/* thread id of the calling thread */
+	else
+		pid = getpid();
+
+	snprintf(buf, sizeof(buf), "%d\n", pid);	/* id as a decimal line */
+	len = strlen(buf);
+
+	/* try cgroup v2 interface first */
+	if (threaded)
+		fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY);
+	else
+		fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY);
+
+	/* try cgroup v1 if failed */
+	if (fd < 0 && errno == ENOENT)
+		fd = openat(cgrp->fd, "tasks", O_WRONLY);
+
+	if (fd < 0)
+		goto err;
+
+	ret = write(fd, buf, len);
+	close(fd);
+
+	if (ret != len) {	/* failed or short write */
+		printf("Cannot enter to cgroup: %s\n", cgrp->name);
+		return -1;
+	}
+	return 0;
+
+err:
+	saved_errno = errno;	/* keep the failure cause before printf() can change errno */
+	printf("Failed to open cgroup file in %s\n", cgrp_names[nr]);
+
+	if (saved_errno == ENOENT) {
+		char mnt[PATH_MAX];
+
+		if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0)
+			printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n",
+			       mnt, cgrp_names[nr]);
+	} else if (saved_errno == EACCES && geteuid() > 0) {
+		printf(" Hint: try to run as root\n");
+	}
+
+	return -1;
+}
+
+static void exit_cgroup(int nr)
+{
+	cgroup__put(cgrps[nr]);	/* release the object obtained via cgroup__new() */
+	free(cgrp_names[nr]);	/* name was strdup()'d by parse_two_cgroups() */
+}
+
+static inline int read_pipe(struct thread_data *td)
+{
+	int ret, m;	/* m only receives the int that is read; its value is unused */
+retry:
+	if (nonblocking) {
+		ret = epoll_wait(td->epoll_fd, &td->epoll_ev, 1, -1);	/* -1: no timeout */
+		if (ret < 0)
+			return ret;
+	}
+	ret = read(td->pipe_read, &m, sizeof(int));
+	if (nonblocking && ret < 0 && errno == EWOULDBLOCK)
+		goto retry;	/* nothing to read yet: wait again */
+	return ret;
+}
+
+static void *worker_thread(void *__tdata)
+{
+	struct thread_data *td = __tdata;
+	int i, ret, m = 0;	/* m is the int payload written each iteration */
+
+	ret = enter_cgroup(td->nr);
+	if (ret < 0) {
+		td->cgroup_failed = true;	/* checked by the caller after the run */
+		return NULL;
+	}
+
+	if (nonblocking) {
+		td->epoll_ev.events = EPOLLIN;
+		td->epoll_fd = epoll_create(1);
+		BUG_ON(td->epoll_fd < 0);
+		BUG_ON(epoll_ctl(td->epoll_fd, EPOLL_CTL_ADD, td->pipe_read, &td->epoll_ev) < 0);
+	}
+
+	for (i = 0; i < loops; i++) {	/* one write followed by one read per loop */
+		ret = write(td->pipe_write, &m, sizeof(int));
+		BUG_ON(ret != sizeof(int));
+		ret = read_pipe(td);
+		BUG_ON(ret != sizeof(int));
+	}
+
+	return NULL;
+}
+
+int bench_sched_pipe(int argc, const char **argv)
+{
+	struct thread_data threads[2] = {};
+	struct thread_data *td;
 	int pipe_1[2], pipe_2[2];
-	int m = 0, i;
 	struct timeval start, stop, diff;
 	unsigned long long result_usec = 0;
+	int nr_threads = 2;
+	int t;
 
 	/*
 	 * why does "ret" exist?
 	 * discarding returned value of read(), write()
 	 * causes error in building environment for perf
 	 */
-	int __maybe_unused ret, wait_stat;
+	int __maybe_unused ret, wait_stat, flags = 0;
 	pid_t pid, retpid __maybe_unused;
 
-	argc = parse_options(argc, argv, options,
-			     bench_sched_pipe_usage, 0);
+	argc = parse_options(argc, argv, options, bench_sched_pipe_usage, 0);
 
-	BUG_ON(pipe(pipe_1));
-	BUG_ON(pipe(pipe_2));
+	if (nonblocking)
+		flags |= O_NONBLOCK;
 
-	pid = fork();
-	assert(pid >= 0);
+	BUG_ON(pipe2(pipe_1, flags));
+	BUG_ON(pipe2(pipe_2, flags));
 
 	gettimeofday(&start, NULL);
 
-	if (!pid) {
-		for (i = 0; i < loops; i++) {
-			ret = read(pipe_1[0], &m, sizeof(int));
-			ret = write(pipe_2[1], &m, sizeof(int));
+	for (t = 0; t < nr_threads; t++) {
+		td = threads + t;
+
+		td->nr = t;
+
+		if (t == 0) {
+			td->pipe_read = pipe_1[0];
+			td->pipe_write = pipe_2[1];
+		} else {
+			td->pipe_write = pipe_1[1];
+			td->pipe_read = pipe_2[0];
+		}
+	}
+
+	if (threaded) {
+		for (t = 0; t < nr_threads; t++) {
+			td = threads + t;
+
+			ret = pthread_create(&td->pthread, NULL, worker_thread, td);
+			BUG_ON(ret);
+		}
+
+		for (t = 0; t < nr_threads; t++) {
+			td = threads + t;
+
+			ret = pthread_join(td->pthread, NULL);
+			BUG_ON(ret);
 		}
 	} else {
-		for (i = 0; i < loops; i++) {
-			ret = write(pipe_1[1], &m, sizeof(int));
-			ret = read(pipe_2[0], &m, sizeof(int));
+		pid = fork();
+		assert(pid >= 0);
+
+		if (!pid) {
+			worker_thread(threads + 0);
+			exit(0);
+		} else {
+			worker_thread(threads + 1);
 		}
+
+		retpid = waitpid(pid, &wait_stat, 0);
+		assert((retpid == pid) && WIFEXITED(wait_stat));
 	}
 
 	gettimeofday(&stop, NULL);
 	timersub(&stop, &start, &diff);
 
-	if (pid) {
-		retpid = waitpid(pid, &wait_stat, 0);
-		assert((retpid == pid) && WIFEXITED(wait_stat));
-	} else {
-		exit(0);
-	}
+	exit_cgroup(0);
+	exit_cgroup(1);
+
+	if (threads[0].cgroup_failed || threads[1].cgroup_failed)
+		return 0;
 
 	switch (bench_format) {
 	case BENCH_FORMAT_DEFAULT:
-		printf("# Executed %d pipe operations between two tasks\n\n",
-			loops);
+		printf("# Executed %d pipe operations between two %s\n\n",
+			loops, threaded ? "threads" : "processes");
 
-		result_usec = diff.tv_sec * 1000000;
+		result_usec = diff.tv_sec * USEC_PER_SEC;
 		result_usec += diff.tv_usec;
 
 		printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
-		       diff.tv_sec,
-		       (unsigned long) (diff.tv_usec/1000));
+		       (unsigned long) diff.tv_sec,
+		       (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
 
 		printf(" %14lf usecs/op\n",
 		       (double)result_usec / (double)loops);
 		printf(" %14d ops/sec\n",
 		       (int)((double)loops /
-			     ((double)result_usec / (double)1000000)));
+			     ((double)result_usec / (double)USEC_PER_SEC)));
 		break;
 
 	case BENCH_FORMAT_SIMPLE:
 		printf("%lu.%03lu\n",
-		       diff.tv_sec,
-		       (unsigned long) (diff.tv_usec / 1000));
+		       (unsigned long) diff.tv_sec,
+		       (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
 		break;
 
 	default:
diff --git a/tools/perf/bench/sched-seccomp-notify.c b/tools/perf/bench/sched-seccomp-notify.c
new file mode 100644
index 000000000000..269c1f4a6852
--- /dev/null
+++ b/tools/perf/bench/sched-seccomp-notify.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <subcmd/parse-options.h>
+#include "bench.h"
+
+#include <uapi/linux/filter.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <linux/unistd.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <linux/time64.h>
+#include <uapi/linux/seccomp.h>
+#include <sys/prctl.h>
+
+#include <unistd.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <string.h>
+#include <errno.h>
+#include <err.h>
+#include <inttypes.h>
+
+#define LOOPS_DEFAULT 1000000UL
+static uint64_t loops = LOOPS_DEFAULT;
+static bool sync_mode;
+
+static const struct option options[] = {
+	OPT_U64('l', "loop",	&loops,		"Specify number of loops"),
+	OPT_BOOLEAN('s', "sync-mode", &sync_mode,
+		    "Enable the synchronous mode for seccomp notifications"),
+	OPT_END()
+};
+
+static const char * const bench_seccomp_usage[] = {
+	"perf bench sched seccomp-notify <options>",	/* usage line passed to parse_options() */
+	NULL
+};
+
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+	return syscall(__NR_seccomp, op, flags, args);	/* invoked via syscall(), by number */
+}
+
+static int user_notif_syscall(int nr, unsigned int flags)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,	/* load the syscall number field */
+			offsetof(struct seccomp_data, nr)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),	/* equal: next insn, else skip one */
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),	/* matched syscall */
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),	/* every other syscall */
+	};
+
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog);	/* result is used as the listener fd by the caller */
+}
+
+#define USER_NOTIF_MAGIC INT_MAX
+static void user_notification_sync_loop(int listener)
+{
+	struct seccomp_notif_resp resp;
+	struct seccomp_notif req;
+	uint64_t nr;	/* iteration counter, not a syscall number */
+
+	for (nr = 0; nr < loops; nr++) {	/* one receive + one reply per loop */
+		memset(&req, 0, sizeof(req));
+		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req))
+			err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_RECV failed");
+
+		if (req.data.nr != __NR_gettid)
+			errx(EXIT_FAILURE, "unexpected syscall: %d", req.data.nr);
+
+		resp.id = req.id;	/* reply to the request just received */
+		resp.error = 0;
+		resp.val = USER_NOTIF_MAGIC;	/* value the child's loop compares against */
+		resp.flags = 0;
+		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp))
+			err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_SEND failed");
+	}
+}
+
+#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP
+#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
+#define SECCOMP_IOCTL_NOTIF_SET_FLAGS  SECCOMP_IOW(4, __u64)
+#endif
+int bench_sched_seccomp_notify(int argc, const char **argv)
+{
+	struct timeval start, stop, diff;
+	unsigned long long result_usec = 0;
+	int status, listener;
+	pid_t pid;
+	long ret;
+
+	argc = parse_options(argc, argv, options, bench_seccomp_usage, 0);
+
+	gettimeofday(&start, NULL);	/* timed region starts before filter setup and fork */
+
+	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);	/* return value is not checked */
+	listener = user_notif_syscall(__NR_gettid,
+				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	if (listener < 0)
+		err(EXIT_FAILURE, "can't create a notification descriptor");
+
+	pid = fork();
+	if (pid < 0)
+		err(EXIT_FAILURE, "fork");
+	if (pid == 0) {
+		if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0))
+			err(EXIT_FAILURE, "can't set the parent death signal");
+		while (1) {	/* child: repeat gettid while the result is the magic value */
+			ret = syscall(__NR_gettid);
+			if (ret == USER_NOTIF_MAGIC)
+				continue;
+			break;
+		}
+		_exit(1);	/* reached only after a non-magic result */
+	}
+
+	if (sync_mode) {
+		if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS,
+			     SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0))
+			err(EXIT_FAILURE,
+			    "can't set SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP");
+	}
+	user_notification_sync_loop(listener);	/* parent: answer 'loops' notifications */
+
+	kill(pid, SIGKILL);	/* stop the child; death by SIGKILL is verified below */
+	if (waitpid(pid, &status, 0) != pid)
+		err(EXIT_FAILURE, "waitpid(%d) failed", pid);
+	if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL)
+		errx(EXIT_FAILURE, "unexpected exit code: %d", status);
+
+	gettimeofday(&stop, NULL);
+	timersub(&stop, &start, &diff);
+
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		printf("# Executed %" PRIu64 " system calls\n\n",
+			loops);
+
+		result_usec = diff.tv_sec * USEC_PER_SEC;
+		result_usec += diff.tv_usec;
+
+		printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
+		       (unsigned long) diff.tv_sec,
+		       (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
+
+		printf(" %14lf usecs/op\n",
+		       (double)result_usec / (double)loops);
+		printf(" %14d ops/sec\n",
+		       (int)((double)loops /
+			     ((double)result_usec / (double)USEC_PER_SEC)));
+		break;
+
+	case BENCH_FORMAT_SIMPLE:
+		printf("%lu.%03lu\n",
+		       (unsigned long) diff.tv_sec,
+		       (unsigned long) (diff.tv_usec / USEC_PER_MSEC));
+		break;
+
+	default:
+		/* reaching here means bench_format holds an unexpected value */
+		fprintf(stderr, "Unknown format:%d\n", bench_format);
+		exit(1);
+		break;
+	}
+
+	return 0;
+}
diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c
new file mode 100644
index 000000000000..265d49a913d9
--- /dev/null
+++ b/tools/perf/bench/synthesize.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Benchmark synthesis of perf events such as at the start of a 'perf
+ * record'. Synthesis is done on the current process and the 'dummy' event
+ * handlers are invoked that support dump_trace but otherwise do nothing.
+ *
+ * Copyright 2019 Google LLC.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include "bench.h"
+#include "../util/debug.h"
+#include "../util/session.h"
+#include "../util/stat.h"
+#include "../util/synthetic-events.h"
+#include "../util/target.h"
+#include "../util/thread_map.h"
+#include "../util/tool.h"
+#include "../util/util.h"
+#include <linux/atomic.h>
+#include <linux/err.h>
+#include <linux/time64.h>
+#include <subcmd/parse-options.h>
+
+static unsigned int min_threads = 1;
+static unsigned int max_threads = UINT_MAX;
+static unsigned int single_iterations = 10000;
+static unsigned int multi_iterations = 10;
+static bool run_st;
+static bool run_mt;
+
+static const struct option options[] = {
+	OPT_BOOLEAN('s', "st", &run_st, "Run single threaded benchmark"),
+	OPT_BOOLEAN('t', "mt", &run_mt, "Run multi-threaded benchmark"),
+	OPT_UINTEGER('m', "min-threads", &min_threads,
+		"Minimum number of threads in multithreaded bench"),
+	OPT_UINTEGER('M', "max-threads", &max_threads,
+		"Maximum number of threads in multithreaded bench"),
+	OPT_UINTEGER('i', "single-iterations", &single_iterations,
+		"Number of iterations used to compute single-threaded average"),
+	OPT_UINTEGER('I', "multi-iterations", &multi_iterations,
+		"Number of iterations used to compute multi-threaded average"),
+	OPT_END()
+};
+
+static const char *const bench_usage[] = {
+	"perf bench internals synthesize <options>",
+	NULL
+};
+
+static atomic_t event_count;
+
+static int process_synthesized_event(const struct perf_tool *tool __maybe_unused,
+				     union perf_event *event __maybe_unused,
+				     struct perf_sample *sample __maybe_unused,
+				     struct machine *machine __maybe_unused)
+{
+	atomic_inc(&event_count);	/* only count the event; all arguments are ignored */
+	return 0;
+}
+
+static int do_run_single_threaded(struct perf_session *session,
+				struct perf_thread_map *threads,
+				struct target *target, bool data_mmap)
+{
+	const unsigned int nr_threads_synthesize = 1;
+	struct timeval start, end, diff;
+	u64 runtime_us;
+	unsigned int i;
+	double time_average, time_stddev, event_average, event_stddev;
+	int err;
+	struct stats time_stats, event_stats;
+
+	init_stats(&time_stats);
+	init_stats(&event_stats);
+
+	for (i = 0; i < single_iterations; i++) {	/* one timed synthesis per iteration */
+		atomic_set(&event_count, 0);
+		gettimeofday(&start, NULL);
+		err = __machine__synthesize_threads(&session->machines.host,
+						NULL,
+						target, threads,
+						process_synthesized_event,
+						true, data_mmap,
+						nr_threads_synthesize);
+		if (err)
+			return err;	/* abort without printing any statistics */
+
+		gettimeofday(&end, NULL);
+		timersub(&end, &start, &diff);
+		runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+		update_stats(&time_stats, runtime_us);
+		update_stats(&event_stats, atomic_read(&event_count));
+	}
+
+	time_average = avg_stats(&time_stats);
+	time_stddev = stddev_stats(&time_stats);
+	printf("  Average %ssynthesis took: %.3f usec (+- %.3f usec)\n",
+		data_mmap ? "data " : "", time_average, time_stddev);
+
+	event_average = avg_stats(&event_stats);
+	event_stddev = stddev_stats(&event_stats);
+	printf("  Average num. events: %.3f (+- %.3f)\n",
+		event_average, event_stddev);
+
+	printf("  Average time per event %.3f usec\n",
+		time_average / event_average);	/* floating-point division */
+	return 0;
+}
+
+static int run_single_threaded(void)
+{
+	struct perf_session *session;
+	struct target target = {
+		.pid = "self",
+	};
+	struct perf_thread_map *threads;
+	struct perf_env host_env;
+	int err;
+
+	perf_set_singlethreaded();
+	perf_env__init(&host_env);
+	session = __perf_session__new(/*data=*/NULL, /*tool=*/NULL,
+				      /*trace_event_repipe=*/false, &host_env);
+	if (IS_ERR(session)) {
+		pr_err("Session creation failed.\n");
+		perf_env__exit(&host_env);
+		return PTR_ERR(session);
+	}
+	threads = thread_map__new_by_pid(getpid());	/* thread map for this process */
+	if (!threads) {
+		pr_err("Thread map creation failed.\n");
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	puts(
+"Computing performance of single threaded perf event synthesis by\n"
+"synthesizing events on the perf process itself:");
+
+	err = do_run_single_threaded(session, threads, &target, false);	/* data_mmap off */
+	if (err)
+		goto err_out;
+
+	err = do_run_single_threaded(session, threads, &target, true);	/* data_mmap on */
+
+err_out:	/* shared cleanup for the success and failure paths */
+	if (threads)
+		perf_thread_map__put(threads);
+
+	perf_session__delete(session);
+	perf_env__exit(&host_env);
+	return err;
+}
+
+static int do_run_multi_threaded(struct target *target,
+				unsigned int nr_threads_synthesize)
+{
+	struct timeval start, end, diff;
+	u64 runtime_us;
+	unsigned int i;
+	double time_average, time_stddev, event_average, event_stddev;
+	int err = 0;
+	struct stats time_stats, event_stats;
+	struct perf_session *session;
+	struct perf_env host_env;
+
+	perf_env__init(&host_env);
+	init_stats(&time_stats);
+	init_stats(&event_stats);
+	for (i = 0; i < multi_iterations; i++) {	/* a fresh session per iteration */
+		session = __perf_session__new(/*data=*/NULL, /*tool=*/NULL,
+					      /*trace_event_repipe=*/false, &host_env);
+		if (IS_ERR(session)) {
+			err = PTR_ERR(session);
+			goto err_out;
+		}
+		atomic_set(&event_count, 0);
+		gettimeofday(&start, NULL);	/* session creation is outside the timed region */
+		err = __machine__synthesize_threads(&session->machines.host,
+						NULL,
+						target, NULL,
+						process_synthesized_event,
+						true, false,
+						nr_threads_synthesize);
+		if (err) {
+			perf_session__delete(session);
+			goto err_out;
+		}
+
+		gettimeofday(&end, NULL);
+		timersub(&end, &start, &diff);
+		runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+		update_stats(&time_stats, runtime_us);
+		update_stats(&event_stats, atomic_read(&event_count));
+		perf_session__delete(session);
+	}
+
+	time_average = avg_stats(&time_stats);
+	time_stddev = stddev_stats(&time_stats);
+	printf("    Average synthesis took: %.3f usec (+- %.3f usec)\n",
+		time_average, time_stddev);
+
+	event_average = avg_stats(&event_stats);
+	event_stddev = stddev_stats(&event_stats);
+	printf("    Average num. events: %.3f (+- %.3f)\n",
+		event_average, event_stddev);
+
+	printf("    Average time per event %.3f usec\n",
+		time_average / event_average);
+err_out:	/* also reached on success, with err == 0 */
+	perf_env__exit(&host_env);
+	return err;
+}
+
+static int run_multi_threaded(void)
+{
+	struct target target = {
+		.cpu_list = "0"
+	};
+	unsigned int nr_threads_synthesize;
+	int err = 0;	/* stays 0 when the min..max thread range is empty */
+
+	if (max_threads == UINT_MAX)	/* option left at its default: use online CPU count */
+		max_threads = sysconf(_SC_NPROCESSORS_ONLN);
+
+	puts(
+"Computing performance of multi threaded perf event synthesis by\n"
+"synthesizing events on CPU 0:");
+
+	for (nr_threads_synthesize = min_threads;
+	     nr_threads_synthesize <= max_threads;
+	     nr_threads_synthesize++) {
+		if (nr_threads_synthesize == 1)
+			perf_set_singlethreaded();
+		else
+			perf_set_multithreaded();
+
+		printf("  Number of synthesis threads: %u\n",
+			nr_threads_synthesize);
+
+		err = do_run_multi_threaded(&target, nr_threads_synthesize);
+		if (err)
+			break;	/* leave via the common exit so the mode is reset */
+	}
+	perf_set_singlethreaded();	/* restore the mode on success and on error */
+	return err;
+}
+
+int bench_synthesize(int argc, const char **argv)
+{
+	int err = 0;
+
+	argc = parse_options(argc, argv, options, bench_usage, 0);
+	if (argc) {	/* leftover arguments are rejected */
+		usage_with_options(bench_usage, options);
+		exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * If neither single-threaded nor multi-threaded is specified, default
+	 * to running just single threaded.
+	 */
+	if (!run_st && !run_mt)
+		run_st = true;
+
+	if (run_st)
+		err = run_single_threaded();
+
+	if (!err && run_mt)	/* skipped when the single-threaded run failed */
+		err = run_multi_threaded();
+
+	return err;
+}
diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c
new file mode 100644
index 000000000000..e7dc216f717f
--- /dev/null
+++ b/tools/perf/bench/syscall.c
@@ -0,0 +1,188 @@
+/*
+ *
+ * syscall.c
+ *
+ * syscall: Benchmark for system call performance
+ */
+#include "../perf.h"
+#include "../util/util.h"
+#include <subcmd/parse-options.h>
+#include "../builtin.h"
+#include "bench.h"
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#ifndef __NR_fork
+#define __NR_fork -1
+#endif
+
+static	int loops;
+
+static const struct option options[] = {
+	OPT_INTEGER('l', "loop",	&loops,		"Specify number of loops"),
+	OPT_END()
+};
+
+static const char * const bench_syscall_usage[] = {
+	"perf bench syscall <options>",
+	NULL
+};
+
+static void test_fork(void)
+{
+	pid_t child = fork();	/* one fork + reap cycle per call */
+
+	if (child < 0) {
+		fprintf(stderr, "fork failed\n");
+		exit(1);
+	}
+	if (child == 0)
+		exit(0);	/* child: terminate immediately */
+
+	if (waitpid(child, NULL, 0) < 0) {	/* parent: reap the child */
+		fprintf(stderr, "waitpid failed\n");
+		exit(1);
+	}
+}
+
+static void test_execve(void)
+{
+	const char *pathname = "/bin/true";
+	char *const argv[] = { (char *)pathname, NULL };
+	pid_t child = fork();	/* one fork + execve + reap cycle per call */
+
+	if (child < 0) {
+		fprintf(stderr, "fork failed\n");
+		exit(1);
+	}
+	if (child == 0) {	/* child: lines after execve() run only if it fails */
+		execve(pathname, argv, NULL);
+		fprintf(stderr, "execve /bin/true failed\n");
+		exit(1);
+	}
+	if (waitpid(child, NULL, 0) < 0) {	/* parent: exit status is not inspected */
+		fprintf(stderr, "waitpid failed\n");
+		exit(1);
+	}
+}
+
+static int bench_syscall_common(int argc, const char **argv, int syscall)
+{
+	struct timeval start, stop, diff;
+	unsigned long long result_usec = 0;
+	const char *name = NULL;	/* stays NULL for a syscall number not handled below */
+	int i;
+
+	switch (syscall) {
+	case __NR_fork:
+	case __NR_execve:
+		/* Limit default loop to 10000 times to save time */
+		loops = 10000;
+		break;
+	default:
+		loops = 10000000;
+		break;
+	}
+
+	/* Options -l and --loop override the default set above */
+	argc = parse_options(argc, argv, options, bench_syscall_usage, 0);
+
+	gettimeofday(&start, NULL);
+
+	for (i = 0; i < loops; i++) {
+		switch (syscall) {
+		case __NR_getppid:
+			getppid();
+			break;
+		case __NR_getpgid:
+			getpgid(0);
+			break;
+		case __NR_fork:
+			test_fork();
+			break;
+		case __NR_execve:
+			test_execve();	/* no break: falls through to the default label's break */
+		default:
+			break;
+		}
+	}
+
+	gettimeofday(&stop, NULL);
+	timersub(&stop, &start, &diff);
+
+	switch (syscall) {	/* label used in the report header */
+	case __NR_getppid:
+		name = "getppid()";
+		break;
+	case __NR_getpgid:
+		name = "getpgid()";
+		break;
+	case __NR_fork:
+		name = "fork()";
+		break;
+	case __NR_execve:
+		name = "execve()";
+		break;
+	default:
+		break;
+	}
+
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		printf("# Executed %'d %s calls\n", loops, name);
+
+		result_usec = diff.tv_sec * 1000000;	/* seconds to microseconds */
+		result_usec += diff.tv_usec;
+
+		printf(" %14s: %lu.%03lu [sec]\n\n", "Total time",
+		       (unsigned long) diff.tv_sec,
+		       (unsigned long) (diff.tv_usec/1000));
+
+		printf(" %14lf usecs/op\n",
+		       (double)result_usec / (double)loops);
+		printf(" %'14d ops/sec\n",
+		       (int)((double)loops /
+			     ((double)result_usec / (double)1000000)));
+		break;
+
+	case BENCH_FORMAT_SIMPLE:
+		printf("%lu.%03lu\n",
+		       (unsigned long) diff.tv_sec,
+		       (unsigned long) (diff.tv_usec / 1000));
+		break;
+
+	default:
+		/* reaching here means bench_format holds an unexpected value */
+		fprintf(stderr, "Unknown format:%d\n", bench_format);
+		exit(1);
+		break;
+	}
+
+	return 0;
+}
+
+int bench_syscall_basic(int argc, const char **argv)
+{
+	return bench_syscall_common(argc, argv, __NR_getppid);	/* "basic" runs the getppid() case */
+}
+
+int bench_syscall_getpgid(int argc, const char **argv)
+{
+	return bench_syscall_common(argc, argv, __NR_getpgid);	/* runs the getpgid(0) case */
+}
+
+int bench_syscall_fork(int argc, const char **argv)
+{
+	return bench_syscall_common(argc, argv, __NR_fork);	/* runs test_fork() per loop */
+}
+
+int bench_syscall_execve(int argc, const char **argv)
+{
+	return bench_syscall_common(argc, argv, __NR_execve);	/* runs test_execve() per loop */
+}
diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c
new file mode 100644
index 000000000000..0b90275862e1
--- /dev/null
+++ b/tools/perf/bench/uprobe.c
@@ -0,0 +1,213 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/*
+ * uprobe.c
+ *
+ * uprobe benchmarks
+ *
+ *  Copyright (C) 2023, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
+ */
+#include "../perf.h"
+#include "../util/util.h"
+#include <subcmd/parse-options.h>
+#include "../builtin.h"
+#include "bench.h"
+#include <linux/compiler.h>
+#include <linux/time64.h>
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#define LOOPS_DEFAULT 1000		/* initial value of 'loops' */
+static int loops = LOOPS_DEFAULT;	/* iteration count, settable with -l/--loop */
+
+enum bench_uprobe {			/* selects what, if anything, gets attached for a run */
+	BENCH_UPROBE__BASELINE,		/* no BPF skeleton is set up */
+	BENCH_UPROBE__EMPTY,		/* attach skel->progs.empty */
+	BENCH_UPROBE__TRACE_PRINTK,	/* attach skel->progs.trace_printk */
+	BENCH_UPROBE__EMPTY_RET,	/* attach skel->progs.empty_ret */
+	BENCH_UPROBE__TRACE_PRINTK_RET,	/* attach skel->progs.trace_printk_ret */
+};
+
+static const struct option options[] = {	/* option table passed to parse_options() */
+	OPT_INTEGER('l', "loop",	&loops,		"Specify number of loops"),
+	OPT_END()
+};
+
+static const char * const bench_uprobe_usage[] = {	/* usage strings passed to parse_options() */
+	"perf bench uprobe <options>",
+	NULL
+};
+
+#ifdef HAVE_BPF_SKEL
+#include "bpf_skel/bench_uprobe.skel.h"
+
+#define bench_uprobe__attach_uprobe(prog) \
+	skel->links.prog = bpf_program__attach_uprobe_opts(/*prog=*/skel->progs.prog, \
+							   /*pid=*/-1, \
+							   /*binary_path=*/"libc.so.6", \
+							   /*func_offset=*/0, \
+							   /*opts=*/&uprobe_opts); \
+	if (!skel->links.prog) { \
+		err = -errno; \
+		fprintf(stderr, "Failed to attach bench uprobe \"%s\": %s\n", #prog, strerror(errno)); \
+		goto cleanup; \
+	}
+
+struct bench_uprobe_bpf *skel;	/* skeleton in use, NULL when none; NOTE(review): not static - confirm whether external linkage is intended */
+
+/*
+ * Open and load the skeleton, then attach the program chosen by @bench to
+ * libc's usleep(). On failure: non-zero return, skeleton destroyed, skel NULL.
+ */
+static int bench_uprobe__setup_bpf_skel(enum bench_uprobe bench)
+{
+	DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts, .func_name = "usleep");
+	int err;
+
+	skel = bench_uprobe_bpf__open();
+	if (!skel) {
+		fprintf(stderr, "Failed to open uprobes bench BPF skeleton\n");
+		return -1;
+	}
+	err = bench_uprobe_bpf__load(skel);
+	if (err) {
+		fprintf(stderr, "Failed to load and verify BPF skeleton\n");
+		goto cleanup;
+	}
+	switch (bench) {
+	case BENCH_UPROBE__BASELINE:							break;
+	case BENCH_UPROBE__EMPTY:	 bench_uprobe__attach_uprobe(empty);		break;
+	case BENCH_UPROBE__TRACE_PRINTK: bench_uprobe__attach_uprobe(trace_printk);	break;
+	case BENCH_UPROBE__EMPTY_RET:	 bench_uprobe__attach_uprobe(empty_ret);	break;
+	case BENCH_UPROBE__TRACE_PRINTK_RET: bench_uprobe__attach_uprobe(trace_printk_ret); break;
+	default:
+		fprintf(stderr, "Invalid bench: %d\n", bench);
+		err = -1;	/* err still holds 0 from the successful load */
+		goto cleanup;
+	}
+	return err;
+cleanup:
+	bench_uprobe_bpf__destroy(skel);
+	skel = NULL;
+	return err;
+}
+
+static void bench_uprobe__teardown_bpf_skel(void)
+{
+	if (!skel)	/* nothing loaded, or already torn down */
+		return;
+	bench_uprobe_bpf__destroy(skel);
+	skel = NULL;	/* a repeated teardown becomes a no-op */
+}
+#else
+static int bench_uprobe__setup_bpf_skel(enum bench_uprobe bench __maybe_unused) { return 0; } /* !HAVE_BPF_SKEL: no-op */
+static void bench_uprobe__teardown_bpf_skel(void) {}	/* !HAVE_BPF_SKEL: nothing to release */
+#endif
+
+/*
+ * Print the default-format report for one run to @fp: total time and time per
+ * op, plus deltas against the first run seen (the baseline) and the previous
+ * run, both remembered in static variables across calls. Returns the sum of
+ * the fprintf() results plus one for the final newline.
+ */
+static int bench_uprobe_format__default_fprintf(const char *name, const char *unit, u64 diff, FILE *fp)
+{
+	static u64 baseline, previous;
+	s64 diff_to_baseline = diff - baseline,
+	    diff_to_previous = diff - previous;
+	int printed = fprintf(fp, "# Executed %'d %s calls\n", loops, name);
+
+	printed += fprintf(fp, " %14s: %'" PRIu64 " %ss", "Total time", diff, unit);
+	if (baseline) {
+		printed += fprintf(fp, " %s%'" PRId64 " to baseline", diff_to_baseline > 0 ? "+" : "", diff_to_baseline);
+		if (previous != baseline)
+			printed += fprintf(fp, " %s%'" PRId64 " to previous", diff_to_previous > 0 ? "+" : "", diff_to_previous);
+	}
+
+	printed += fprintf(fp, "\n\n %'.3f %ss/op", (double)diff / (double)loops, unit);
+	if (baseline) {
+		printed += fprintf(fp, " %'.3f %ss/op to baseline", (double)diff_to_baseline / (double)loops, unit);
+		if (previous != baseline)
+			printed += fprintf(fp, " %'.3f %ss/op to previous", (double)diff_to_previous / (double)loops, unit);
+	} else {
+		baseline = diff;	/* first run becomes the reference for later ones */
+	}
+
+	fputc('\n', fp);
+	previous = diff;
+	return printed + 1;	/* + 1 for the newline written by fputc() */
+}
+
+/*
+ * Time 'loops' calls to usleep(USEC_PER_MSEC), with the probe selected by
+ * @bench set up first unless it is the baseline, and report the elapsed
+ * microseconds in the current bench_format. Returns 0, also when setup fails.
+ */
+static int bench_uprobe(int argc, const char **argv, enum bench_uprobe bench)
+{
+	const char *name = "usleep(1000)", *unit = "usec";
+	struct timespec start, end;
+	u64 diff;
+	int i;
+
+	argc = parse_options(argc, argv, options, bench_uprobe_usage, 0);
+
+	if (bench != BENCH_UPROBE__BASELINE && bench_uprobe__setup_bpf_skel(bench) < 0)
+		return 0;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);	/* interval clock, not settable wall time */
+	for (i = 0; i < loops; i++)
+		usleep(USEC_PER_MSEC);
+	clock_gettime(CLOCK_MONOTONIC, &end);
+
+	/* Scale the difference in seconds, not each timestamp, to keep the product small. */
+	diff = (u64)(end.tv_sec - start.tv_sec) * NSEC_PER_SEC + end.tv_nsec - start.tv_nsec;
+	diff /= NSEC_PER_USEC;
+
+	switch (bench_format) {
+	case BENCH_FORMAT_DEFAULT:
+		bench_uprobe_format__default_fprintf(name, unit, diff, stdout);
+		break;
+	case BENCH_FORMAT_SIMPLE:
+		printf("%" PRIu64 "\n", diff);
+		break;
+	default:
+		/* reaching here is something of a disaster */
+		fprintf(stderr, "Unknown format:%d\n", bench_format);
+		exit(1);
+	}
+
+	if (bench != BENCH_UPROBE__BASELINE)
+		bench_uprobe__teardown_bpf_skel();
+	return 0;
+}
+
+int bench_uprobe_baseline(int argc, const char **argv)
+{
+	return bench_uprobe(argc, argv, BENCH_UPROBE__BASELINE);	/* run without setting up a BPF skeleton */
+}
+
+int bench_uprobe_empty(int argc, const char **argv)
+{
+	return bench_uprobe(argc, argv, BENCH_UPROBE__EMPTY);	/* run with the "empty" program requested */
+}
+
+int bench_uprobe_trace_printk(int argc, const char **argv)
+{
+	return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK);	/* run with the "trace_printk" program requested */
+}
+
+int bench_uprobe_empty_ret(int argc, const char **argv)
+{
+	return bench_uprobe(argc, argv, BENCH_UPROBE__EMPTY_RET);	/* run with the "empty_ret" program requested */
+}
+
+int bench_uprobe_trace_printk_ret(int argc, const char **argv)
+{
+	return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK_RET);	/* run with the "trace_printk_ret" program requested */
+}