diff options
Diffstat (limited to 'tools/perf/bench')
33 files changed, 6393 insertions, 950 deletions
diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build new file mode 100644 index 000000000000..b558ab98719f --- /dev/null +++ b/tools/perf/bench/Build @@ -0,0 +1,26 @@ +perf-bench-y += sched-messaging.o +perf-bench-y += sched-pipe.o +perf-bench-y += sched-seccomp-notify.o +perf-bench-y += syscall.o +perf-bench-y += mem-functions.o +perf-bench-y += futex.o +perf-bench-y += futex-hash.o +perf-bench-y += futex-wake.o +perf-bench-y += futex-wake-parallel.o +perf-bench-y += futex-requeue.o +perf-bench-y += futex-lock-pi.o +perf-bench-y += epoll-wait.o +perf-bench-y += epoll-ctl.o +perf-bench-y += synthesize.o +perf-bench-y += kallsyms-parse.o +perf-bench-y += find-bit-bench.o +perf-bench-y += inject-buildid.o +perf-bench-y += evlist-open-close.o +perf-bench-y += breakpoint.o +perf-bench-y += pmu-scan.o +perf-bench-y += uprobe.o + +perf-bench-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o +perf-bench-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o + +perf-bench-$(CONFIG_NUMA) += numa.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 0fdc85269c4d..8519eb5a42fa 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -1,36 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef BENCH_H #define BENCH_H +#include <sys/time.h> + +extern struct timeval bench__start, bench__end, bench__runtime; + /* * The madvise transparent hugepage constants were added in glibc * 2.13. For compatibility with older versions of glibc, define these * tokens if they are not already defined. - * - * PA-RISC uses different madvise values from other architectures and - * needs to be special-cased. */ -#ifdef __hppa__ -# ifndef MADV_HUGEPAGE -# define MADV_HUGEPAGE 67 -# endif -# ifndef MADV_NOHUGEPAGE -# define MADV_NOHUGEPAGE 68 -# endif -#else # ifndef MADV_HUGEPAGE # define MADV_HUGEPAGE 14 # endif # ifndef MADV_NOHUGEPAGE # define MADV_NOHUGEPAGE 15 # endif -#endif -extern int bench_numa(int argc, const char **argv, const char *prefix); -extern int bench_sched_messaging(int argc, const char **argv, const char *prefix); -extern int bench_sched_pipe(int argc, const char **argv, const char *prefix); -extern int bench_mem_memcpy(int argc, const char **argv, - const char *prefix __maybe_unused); -extern int bench_mem_memset(int argc, const char **argv, const char *prefix); +int bench_numa(int argc, const char **argv); +int bench_sched_messaging(int argc, const char **argv); +int bench_sched_pipe(int argc, const char **argv); +int bench_sched_seccomp_notify(int argc, const char **argv); +int bench_syscall_basic(int argc, const char **argv); +int bench_syscall_getpgid(int argc, const char **argv); +int bench_syscall_fork(int argc, const char **argv); +int bench_syscall_execve(int argc, const char **argv); +int bench_mem_memcpy(int argc, const char **argv); +int bench_mem_memset(int argc, const char **argv); +int bench_mem_mmap(int argc, const char **argv); +int bench_mem_find_bit(int argc, const char **argv); +int bench_futex_hash(int argc, const char **argv); +int bench_futex_wake(int argc, const char **argv); +int bench_futex_wake_parallel(int argc, const char **argv); +int bench_futex_requeue(int argc, const char **argv); +/* pi futexes */ +int bench_futex_lock_pi(int argc, const char **argv); +int bench_epoll_wait(int argc, const char **argv); +int bench_epoll_ctl(int argc, const char **argv); +int bench_synthesize(int argc, const char **argv); +int bench_kallsyms_parse(int argc, const char **argv); +int bench_inject_build_id(int argc, const char **argv); +int bench_evlist_open_close(int argc, const char **argv); +int bench_breakpoint_thread(int argc, const char **argv); +int bench_breakpoint_enable(int argc, const char **argv); +int bench_uprobe_baseline(int argc, const char **argv); +int bench_uprobe_empty(int argc, const char **argv); +int bench_uprobe_trace_printk(int argc, const char **argv); +int bench_uprobe_empty_ret(int argc, const char **argv); +int bench_uprobe_trace_printk_ret(int argc, const char **argv); +int bench_pmu_scan(int argc, const char **argv); #define BENCH_FORMAT_DEFAULT_STR "default" #define BENCH_FORMAT_DEFAULT 0 @@ -40,5 +59,17 @@ extern int bench_mem_memset(int argc, const char **argv, const char *prefix); #define BENCH_FORMAT_UNKNOWN -1 extern int bench_format; +extern unsigned int bench_repeat; + +#ifndef HAVE_PTHREAD_ATTR_SETAFFINITY_NP +#include <pthread.h> +#include <linux/compiler.h> +static inline int pthread_attr_setaffinity_np(pthread_attr_t *attr __maybe_unused, + size_t cpusetsize __maybe_unused, + cpu_set_t *cpuset __maybe_unused) +{ + return 0; +} +#endif #endif diff --git a/tools/perf/bench/breakpoint.c b/tools/perf/bench/breakpoint.c new file mode 100644 index 000000000000..dfd18f5db97d --- /dev/null +++ b/tools/perf/bench/breakpoint.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <subcmd/parse-options.h> +#include <linux/hw_breakpoint.h> +#include <linux/perf_event.h> +#include <linux/time64.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <sys/time.h> +#include <pthread.h> +#include <stddef.h> +#include <stdlib.h> +#include <unistd.h> +#include <stdio.h> +#include <errno.h> +#include "bench.h" +#include "futex.h" + +struct { + unsigned int nbreakpoints; + unsigned int nparallel; + unsigned int nthreads; +} thread_params = { + .nbreakpoints = 1, + .nparallel = 1, + .nthreads = 1, +}; + +static const struct option thread_options[] = { + OPT_UINTEGER('b', "breakpoints", &thread_params.nbreakpoints, + "Specify amount of breakpoints"), + OPT_UINTEGER('p', "parallelism", &thread_params.nparallel, "Specify amount of parallelism"), + OPT_UINTEGER('t', "threads", &thread_params.nthreads, "Specify amount of threads"), + OPT_END() +}; + +static const char * const thread_usage[] = { + "perf bench breakpoint thread <options>", + NULL +}; + +struct breakpoint { + int fd; + char watched; +}; + +static int breakpoint_setup(void *addr) +{ + struct perf_event_attr attr = { .size = 0, }; + int fd; + + attr.type = PERF_TYPE_BREAKPOINT; + attr.size = sizeof(attr); + attr.inherit = 1; + attr.exclude_kernel = 1; + attr.exclude_hv = 1; + attr.bp_addr = (unsigned long)addr; + attr.bp_type = HW_BREAKPOINT_RW; + attr.bp_len = HW_BREAKPOINT_LEN_1; + fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0); + + if (fd < 0) + fd = -errno; + + return fd; +} + +static void *passive_thread(void *arg) +{ + unsigned int *done = (unsigned int *)arg; + + while (!__atomic_load_n(done, __ATOMIC_RELAXED)) + futex_wait(done, 0, NULL, 0); + return NULL; +} + +static void *active_thread(void *arg) +{ + unsigned int *done = (unsigned int *)arg; + + while (!__atomic_load_n(done, __ATOMIC_RELAXED)); + return NULL; +} + +static void *breakpoint_thread(void *arg) +{ + unsigned int i, done; + int *repeat = (int *)arg; + pthread_t *threads; + + threads = calloc(thread_params.nthreads, sizeof(threads[0])); + if (!threads) + exit((perror("calloc"), EXIT_FAILURE)); + + while (__atomic_fetch_sub(repeat, 1, __ATOMIC_RELAXED) > 0) { + done = 0; + for (i = 0; i < thread_params.nthreads; i++) { + if (pthread_create(&threads[i], NULL, passive_thread, &done)) + exit((perror("pthread_create"), EXIT_FAILURE)); + } + __atomic_store_n(&done, 1, __ATOMIC_RELAXED); + futex_wake(&done, thread_params.nthreads, 0); + for (i = 0; i < thread_params.nthreads; i++) + pthread_join(threads[i], NULL); + } + free(threads); + return NULL; +} + +// The benchmark creates nbreakpoints inheritable breakpoints, +// then starts nparallel threads which create and join bench_repeat batches of nthreads threads. +int bench_breakpoint_thread(int argc, const char **argv) +{ + unsigned int i, result_usec; + int repeat = bench_repeat; + struct breakpoint *breakpoints; + pthread_t *parallel; + struct timeval start, stop, diff; + + if (parse_options(argc, argv, thread_options, thread_usage, 0)) { + usage_with_options(thread_usage, thread_options); + exit(EXIT_FAILURE); + } + breakpoints = calloc(thread_params.nbreakpoints, sizeof(breakpoints[0])); + parallel = calloc(thread_params.nparallel, sizeof(parallel[0])); + if (!breakpoints || !parallel) + exit((perror("calloc"), EXIT_FAILURE)); + + for (i = 0; i < thread_params.nbreakpoints; i++) { + breakpoints[i].fd = breakpoint_setup(&breakpoints[i].watched); + + if (breakpoints[i].fd < 0) { + if (breakpoints[i].fd == -ENODEV) { + printf("Skipping perf bench breakpoint thread: No hardware support\n"); + return 0; + } + exit((perror("perf_event_open"), EXIT_FAILURE)); + } + } + gettimeofday(&start, NULL); + for (i = 0; i < thread_params.nparallel; i++) { + if (pthread_create(¶llel[i], NULL, breakpoint_thread, &repeat)) + exit((perror("pthread_create"), EXIT_FAILURE)); + } + for (i = 0; i < thread_params.nparallel; i++) + pthread_join(parallel[i], NULL); + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + for (i = 0; i < thread_params.nbreakpoints; i++) + close(breakpoints[i].fd); + free(parallel); + free(breakpoints); + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Created/joined %d threads with %d breakpoints and %d parallelism\n", + bench_repeat, thread_params.nbreakpoints, thread_params.nparallel); + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); + result_usec = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + printf(" %14lf usecs/op\n", + (double)result_usec / bench_repeat / thread_params.nthreads); + printf(" %14lf usecs/op/cpu\n", + (double)result_usec / bench_repeat / + thread_params.nthreads * thread_params.nparallel); + break; + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); + break; + default: + fprintf(stderr, "Unknown format: %d\n", bench_format); + exit(EXIT_FAILURE); + } + return 0; +} + +struct { + unsigned int npassive; + unsigned int nactive; +} enable_params = { + .nactive = 0, + .npassive = 0, +}; + +static const struct option enable_options[] = { + OPT_UINTEGER('p', "passive", &enable_params.npassive, "Specify amount of passive threads"), + OPT_UINTEGER('a', "active", &enable_params.nactive, "Specify amount of active threads"), + OPT_END() +}; + +static const char * const enable_usage[] = { + "perf bench breakpoint enable <options>", + NULL +}; + +// The benchmark creates an inheritable breakpoint, +// then starts npassive threads that block and nactive threads that actively spin +// and then disables and enables the breakpoint bench_repeat times. +int bench_breakpoint_enable(int argc, const char **argv) +{ + unsigned int i, nthreads, result_usec, done = 0; + char watched; + int fd; + pthread_t *threads; + struct timeval start, stop, diff; + + if (parse_options(argc, argv, enable_options, enable_usage, 0)) { + usage_with_options(enable_usage, enable_options); + exit(EXIT_FAILURE); + } + fd = breakpoint_setup(&watched); + + if (fd < 0) { + if (fd == -ENODEV) { + printf("Skipping perf bench breakpoint enable: No hardware support\n"); + return 0; + } + exit((perror("perf_event_open"), EXIT_FAILURE)); + } + nthreads = enable_params.npassive + enable_params.nactive; + threads = calloc(nthreads, sizeof(threads[0])); + if (!threads) + exit((perror("calloc"), EXIT_FAILURE)); + + for (i = 0; i < nthreads; i++) { + if (pthread_create(&threads[i], NULL, + i < enable_params.npassive ? passive_thread : active_thread, &done)) + exit((perror("pthread_create"), EXIT_FAILURE)); + } + usleep(10000); // let the threads block + gettimeofday(&start, NULL); + for (i = 0; i < bench_repeat; i++) { + if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0)) + exit((perror("ioctl(PERF_EVENT_IOC_DISABLE)"), EXIT_FAILURE)); + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) + exit((perror("ioctl(PERF_EVENT_IOC_ENABLE)"), EXIT_FAILURE)); + } + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + __atomic_store_n(&done, 1, __ATOMIC_RELAXED); + futex_wake(&done, enable_params.npassive, 0); + for (i = 0; i < nthreads; i++) + pthread_join(threads[i], NULL); + free(threads); + close(fd); + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Enabled/disabled breakpoint %d time with %d passive and %d active threads\n", + bench_repeat, enable_params.npassive, enable_params.nactive); + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); + result_usec = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + printf(" %14lf usecs/op\n", (double)result_usec / bench_repeat); + break; + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", (long)diff.tv_sec, (long)(diff.tv_usec / USEC_PER_MSEC)); + break; + default: + fprintf(stderr, "Unknown format: %d\n", bench_format); + exit(EXIT_FAILURE); + } + return 0; +} diff --git a/tools/perf/bench/epoll-ctl.c b/tools/perf/bench/epoll-ctl.c new file mode 100644 index 000000000000..d66d852b90e4 --- /dev/null +++ b/tools/perf/bench/epoll-ctl.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2018 Davidlohr Bueso. + * + * Benchmark the various operations allowed for epoll_ctl(2). + * The idea is to concurrently stress a single epoll instance + */ +#ifdef HAVE_EVENTFD_SUPPORT +/* For the CLR_() macros */ +#include <string.h> +#include <pthread.h> + +#include <errno.h> +#include <inttypes.h> +#include <signal.h> +#include <stdlib.h> +#include <unistd.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <sys/epoll.h> +#include <sys/eventfd.h> +#include <perf/cpumap.h> + +#include "../util/mutex.h" +#include "../util/stat.h" +#include <subcmd/parse-options.h> +#include "bench.h" + +#include <err.h> + +#define printinfo(fmt, arg...) \ + do { if (__verbose) printf(fmt, ## arg); } while (0) + +static unsigned int nthreads = 0; +static unsigned int nsecs = 8; +static bool done, __verbose, randomize; + +/* + * epoll related shared variables. + */ + +/* Maximum number of nesting allowed inside epoll sets */ +#define EPOLL_MAXNESTS 4 + +enum { + OP_EPOLL_ADD, + OP_EPOLL_MOD, + OP_EPOLL_DEL, + EPOLL_NR_OPS, +}; + +static int epollfd; +static int *epollfdp; +static bool noaffinity; +static unsigned int nested = 0; + +/* amount of fds to monitor, per thread */ +static unsigned int nfds = 64; + +static struct mutex thread_lock; +static unsigned int threads_starting; +static struct stats all_stats[EPOLL_NR_OPS]; +static struct cond thread_parent, thread_worker; + +struct worker { + int tid; + pthread_t thread; + unsigned long ops[EPOLL_NR_OPS]; + int *fdmap; +}; + +static const struct option options[] = { + OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), + OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"), + OPT_UINTEGER('f', "nfds", &nfds, "Specify amount of file descriptors to monitor for each thread"), + OPT_BOOLEAN( 'n', "noaffinity", &noaffinity, "Disables CPU affinity"), + OPT_UINTEGER( 'N', "nested", &nested, "Nesting level epoll hierarchy (default is 0, no nesting)"), + OPT_BOOLEAN( 'R', "randomize", &randomize, "Perform random operations on random fds"), + OPT_BOOLEAN( 'v', "verbose", &__verbose, "Verbose mode"), + OPT_END() +}; + +static const char * const bench_epoll_ctl_usage[] = { + "perf bench epoll ctl <options>", + NULL +}; + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + /* inform all threads that we're done for the day */ + done = true; + gettimeofday(&bench__end, NULL); + timersub(&bench__end, &bench__start, &bench__runtime); +} + +static void nest_epollfd(void) +{ + unsigned int i; + struct epoll_event ev; + + if (nested > EPOLL_MAXNESTS) + nested = EPOLL_MAXNESTS; + printinfo("Nesting level(s): %d\n", nested); + + epollfdp = calloc(nested, sizeof(int)); + if (!epollfdp) + err(EXIT_FAILURE, "calloc"); + + for (i = 0; i < nested; i++) { + epollfdp[i] = epoll_create(1); + if (epollfd < 0) + err(EXIT_FAILURE, "epoll_create"); + } + + ev.events = EPOLLHUP; /* anything */ + ev.data.u64 = i; /* any number */ + + for (i = nested - 1; i; i--) { + if (epoll_ctl(epollfdp[i - 1], EPOLL_CTL_ADD, + epollfdp[i], &ev) < 0) + err(EXIT_FAILURE, "epoll_ctl"); + } + + if (epoll_ctl(epollfd, EPOLL_CTL_ADD, *epollfdp, &ev) < 0) + err(EXIT_FAILURE, "epoll_ctl"); +} + +static inline void do_epoll_op(struct worker *w, int op, int fd) +{ + int error; + struct epoll_event ev; + + ev.events = EPOLLIN; + ev.data.u64 = fd; + + switch (op) { + case OP_EPOLL_ADD: + error = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev); + break; + case OP_EPOLL_MOD: + ev.events = EPOLLOUT; + error = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &ev); + break; + case OP_EPOLL_DEL: + error = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, NULL); + break; + default: + error = 1; + break; + } + + if (!error) + w->ops[op]++; +} + +static inline void do_random_epoll_op(struct worker *w) +{ + unsigned long rnd1 = random(), rnd2 = random(); + int op, fd; + + fd = w->fdmap[rnd1 % nfds]; + op = rnd2 % EPOLL_NR_OPS; + + do_epoll_op(w, op, fd); +} + +static void *workerfn(void *arg) +{ + unsigned int i; + struct worker *w = (struct worker *) arg; + struct timespec ts = { .tv_sec = 0, + .tv_nsec = 250 }; + + mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); + + /* Let 'em loose */ + do { + /* random */ + if (randomize) { + do_random_epoll_op(w); + } else { + for (i = 0; i < nfds; i++) { + do_epoll_op(w, OP_EPOLL_ADD, w->fdmap[i]); + do_epoll_op(w, OP_EPOLL_MOD, w->fdmap[i]); + do_epoll_op(w, OP_EPOLL_DEL, w->fdmap[i]); + } + } + + nanosleep(&ts, NULL); + } while (!done); + + return NULL; +} + +static void init_fdmaps(struct worker *w, int pct) +{ + unsigned int i; + int inc; + struct epoll_event ev; + + if (!pct) + return; + + inc = 100/pct; + for (i = 0; i < nfds; i+=inc) { + ev.data.fd = w->fdmap[i]; + ev.events = EPOLLIN; + + if (epoll_ctl(epollfd, EPOLL_CTL_ADD, w->fdmap[i], &ev) < 0) + err(EXIT_FAILURE, "epoll_ct"); + } +} + +static int do_threads(struct worker *worker, struct perf_cpu_map *cpu) +{ + pthread_attr_t thread_attr, *attrp = NULL; + cpu_set_t *cpuset; + unsigned int i, j; + int ret = 0; + int nrcpus; + size_t size; + + if (!noaffinity) + pthread_attr_init(&thread_attr); + + nrcpus = cpu__max_cpu().cpu; + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + + for (i = 0; i < nthreads; i++) { + struct worker *w = &worker[i]; + + w->tid = i; + w->fdmap = calloc(nfds, sizeof(int)); + if (!w->fdmap) + return 1; + + for (j = 0; j < nfds; j++) { + w->fdmap[j] = eventfd(0, EFD_NONBLOCK); + if (w->fdmap[j] < 0) + err(EXIT_FAILURE, "eventfd"); + } + + /* + * Lets add 50% of the fdmap to the epoll instance, and + * do it before any threads are started; otherwise there is + * an initial bias of the call failing (mod and del ops). + */ + if (randomize) + init_fdmaps(w, 50); + + if (!noaffinity) { + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, + size, cpuset); + + ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset); + if (ret) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } + + attrp = &thread_attr; + } + + ret = pthread_create(&w->thread, attrp, workerfn, + (void *)(struct worker *) w); + if (ret) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_create"); + } + } + + CPU_FREE(cpuset); + if (!noaffinity) + pthread_attr_destroy(&thread_attr); + + return ret; +} + +static void print_summary(void) +{ + int i; + unsigned long avg[EPOLL_NR_OPS]; + double stddev[EPOLL_NR_OPS]; + + for (i = 0; i < EPOLL_NR_OPS; i++) { + avg[i] = avg_stats(&all_stats[i]); + stddev[i] = stddev_stats(&all_stats[i]); + } + + printf("\nAveraged %ld ADD operations (+- %.2f%%)\n", + avg[OP_EPOLL_ADD], rel_stddev_stats(stddev[OP_EPOLL_ADD], + avg[OP_EPOLL_ADD])); + printf("Averaged %ld MOD operations (+- %.2f%%)\n", + avg[OP_EPOLL_MOD], rel_stddev_stats(stddev[OP_EPOLL_MOD], + avg[OP_EPOLL_MOD])); + printf("Averaged %ld DEL operations (+- %.2f%%)\n", + avg[OP_EPOLL_DEL], rel_stddev_stats(stddev[OP_EPOLL_DEL], + avg[OP_EPOLL_DEL])); +} + +int bench_epoll_ctl(int argc, const char **argv) +{ + int j, ret = 0; + struct sigaction act; + struct worker *worker = NULL; + struct perf_cpu_map *cpu; + struct rlimit rl, prevrl; + unsigned int i; + + argc = parse_options(argc, argv, options, bench_epoll_ctl_usage, 0); + if (argc) { + usage_with_options(bench_epoll_ctl_usage, options); + exit(EXIT_FAILURE); + } + + memset(&act, 0, sizeof(act)); + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + cpu = perf_cpu_map__new_online_cpus(); + if (!cpu) + goto errmem; + + /* a single, main epoll instance */ + epollfd = epoll_create(1); + if (epollfd < 0) + err(EXIT_FAILURE, "epoll_create"); + + /* + * Deal with nested epolls, if any. + */ + if (nested) + nest_epollfd(); + + /* default to the number of CPUs */ + if (!nthreads) + nthreads = perf_cpu_map__nr(cpu); + + worker = calloc(nthreads, sizeof(*worker)); + if (!worker) + goto errmem; + + if (getrlimit(RLIMIT_NOFILE, &prevrl)) + err(EXIT_FAILURE, "getrlimit"); + rl.rlim_cur = rl.rlim_max = nfds * nthreads * 2 + 50; + printinfo("Setting RLIMIT_NOFILE rlimit from %" PRIu64 " to: %" PRIu64 "\n", + (uint64_t)prevrl.rlim_max, (uint64_t)rl.rlim_max); + if (setrlimit(RLIMIT_NOFILE, &rl) < 0) + err(EXIT_FAILURE, "setrlimit"); + + printf("Run summary [PID %d]: %d threads doing epoll_ctl ops " + "%d file-descriptors for %d secs.\n\n", + getpid(), nthreads, nfds, nsecs); + + for (i = 0; i < EPOLL_NR_OPS; i++) + init_stats(&all_stats[i]); + + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); + + threads_starting = nthreads; + + gettimeofday(&bench__start, NULL); + + do_threads(worker, cpu); + + mutex_lock(&thread_lock); + while (threads_starting) + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); + + sleep(nsecs); + toggle_done(0, NULL, NULL); + printinfo("main thread: toggling done\n"); + + for (i = 0; i < nthreads; i++) { + ret = pthread_join(worker[i].thread, NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + + /* cleanup & report results */ + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); + + for (i = 0; i < nthreads; i++) { + unsigned long t[EPOLL_NR_OPS]; + + for (j = 0; j < EPOLL_NR_OPS; j++) { + t[j] = worker[i].ops[j]; + update_stats(&all_stats[j], t[j]); + } + + if (nfds == 1) + printf("[thread %2d] fdmap: %p [ add: %04ld; mod: %04ld; del: %04lds ops ]\n", + worker[i].tid, &worker[i].fdmap[0], + t[OP_EPOLL_ADD], t[OP_EPOLL_MOD], t[OP_EPOLL_DEL]); + else + printf("[thread %2d] fdmap: %p ... %p [ add: %04ld ops; mod: %04ld ops; del: %04ld ops ]\n", + worker[i].tid, &worker[i].fdmap[0], + &worker[i].fdmap[nfds-1], + t[OP_EPOLL_ADD], t[OP_EPOLL_MOD], t[OP_EPOLL_DEL]); + } + + print_summary(); + + close(epollfd); + perf_cpu_map__put(cpu); + for (i = 0; i < nthreads; i++) + free(worker[i].fdmap); + + free(worker); + return ret; +errmem: + err(EXIT_FAILURE, "calloc"); +} +#endif // HAVE_EVENTFD_SUPPORT diff --git a/tools/perf/bench/epoll-wait.c b/tools/perf/bench/epoll-wait.c new file mode 100644 index 000000000000..20fe4f72b4af --- /dev/null +++ b/tools/perf/bench/epoll-wait.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef HAVE_EVENTFD_SUPPORT +/* + * Copyright (C) 2018 Davidlohr Bueso. + * + * This program benchmarks concurrent epoll_wait(2) monitoring multiple + * file descriptors under one or two load balancing models. The first, + * and default, is the single/combined queueing (which refers to a single + * epoll instance for N worker threads): + * + * |---> [worker A] + * |---> [worker B] + * [combined queue] .---> [worker C] + * |---> [worker D] + * |---> [worker E] + * + * While the second model, enabled via --multiq option, uses multiple + * queueing (which refers to one epoll instance per worker). For example, + * short lived tcp connections in a high throughput httpd server will + * distribute the accept()'ing connections across CPUs. In this case each + * worker does a limited amount of processing. + * + * [queue A] ---> [worker] + * [queue B] ---> [worker] + * [queue C] ---> [worker] + * [queue D] ---> [worker] + * [queue E] ---> [worker] + * + * Naturally, the single queue will enforce more concurrency on the epoll + * instance, and can therefore scale poorly compared to multiple queues. + * However, this is a benchmark raw data and must be taken with a grain of + * salt when choosing how to make use of sys_epoll. + + * Each thread has a number of private, nonblocking file descriptors, + * referred to as fdmap. A writer thread will constantly be writing to + * the fdmaps of all threads, minimizing each threads's chances of + * epoll_wait not finding any ready read events and blocking as this + * is not what we want to stress. The size of the fdmap can be adjusted + * by the user; enlarging the value will increase the chances of + * epoll_wait(2) blocking as the lineal writer thread will take "longer", + * at least at a high level. + * + * Note that because fds are private to each thread, this workload does + * not stress scenarios where multiple tasks are awoken per ready IO; ie: + * EPOLLEXCLUSIVE semantics. + * + * The end result/metric is throughput: number of ops/second where an + * operation consists of: + * + * epoll_wait(2) + [others] + * + * ... where [others] is the cost of re-adding the fd (EPOLLET), + * or rearming it (EPOLLONESHOT). + * + * + * The purpose of this is program is that it be useful for measuring + * kernel related changes to the sys_epoll, and not comparing different + * IO polling methods, for example. Hence everything is very adhoc and + * outputs raw microbenchmark numbers. Also this uses eventfd, similar + * tools tend to use pipes or sockets, but the result is the same. + */ + +/* For the CLR_() macros */ +#include <string.h> +#include <pthread.h> +#include <unistd.h> + +#include <errno.h> +#include <inttypes.h> +#include <signal.h> +#include <stdlib.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <sys/time.h> +#include <sys/resource.h> +#include <sys/epoll.h> +#include <sys/eventfd.h> +#include <sys/types.h> +#include <perf/cpumap.h> + +#include "../util/stat.h" +#include "../util/mutex.h" +#include <subcmd/parse-options.h> +#include "bench.h" + +#include <err.h> + +#define printinfo(fmt, arg...) \ + do { if (__verbose) { printf(fmt, ## arg); fflush(stdout); } } while (0) + +static unsigned int nthreads = 0; +static unsigned int nsecs = 8; +static bool wdone, done, __verbose, randomize, nonblocking; + +/* + * epoll related shared variables. + */ + +/* Maximum number of nesting allowed inside epoll sets */ +#define EPOLL_MAXNESTS 4 + +static int epollfd; +static int *epollfdp; +static bool noaffinity; +static unsigned int nested = 0; +static bool et; /* edge-trigger */ +static bool oneshot; +static bool multiq; /* use an epoll instance per thread */ + +/* amount of fds to monitor, per thread */ +static unsigned int nfds = 64; + +static struct mutex thread_lock; +static unsigned int threads_starting; +static struct stats throughput_stats; +static struct cond thread_parent, thread_worker; + +struct worker { + int tid; + int epollfd; /* for --multiq */ + pthread_t thread; + unsigned long ops; + int *fdmap; +}; + +static const struct option options[] = { + /* general benchmark options */ + OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), + OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"), + OPT_UINTEGER('f', "nfds", &nfds, "Specify amount of file descriptors to monitor for each thread"), + OPT_BOOLEAN( 'n', "noaffinity", &noaffinity, "Disables CPU affinity"), + OPT_BOOLEAN('R', "randomize", &randomize, "Enable random write behaviour (default is lineal)"), + OPT_BOOLEAN( 'v', "verbose", &__verbose, "Verbose mode"), + + /* epoll specific options */ + OPT_BOOLEAN( 'm', "multiq", &multiq, "Use multiple epoll instances (one per thread)"), + OPT_BOOLEAN( 'B', "nonblocking", &nonblocking, "Nonblocking epoll_wait(2) behaviour"), + OPT_UINTEGER( 'N', "nested", &nested, "Nesting level epoll hierarchy (default is 0, no nesting)"), + OPT_BOOLEAN( 'S', "oneshot", &oneshot, "Use EPOLLONESHOT semantics"), + OPT_BOOLEAN( 'E', "edge", &et, "Use Edge-triggered interface (default is LT)"), + + OPT_END() +}; + +static const char * const bench_epoll_wait_usage[] = { + "perf bench epoll wait <options>", + NULL +}; + + +/* + * Arrange the N elements of ARRAY in random order. + * Only effective if N is much smaller than RAND_MAX; + * if this may not be the case, use a better random + * number generator. -- Ben Pfaff. + */ +static void shuffle(void *array, size_t n, size_t size) +{ + char *carray = array; + void *aux; + size_t i; + + if (n <= 1) + return; + + aux = calloc(1, size); + if (!aux) + err(EXIT_FAILURE, "calloc"); + + for (i = 1; i < n; ++i) { + size_t j = i + rand() / (RAND_MAX / (n - i) + 1); + j *= size; + + memcpy(aux, &carray[j], size); + memcpy(&carray[j], &carray[i*size], size); + memcpy(&carray[i*size], aux, size); + } + + free(aux); +} + + +static void *workerfn(void *arg) +{ + int fd, ret, r; + struct worker *w = (struct worker *) arg; + unsigned long ops = w->ops; + struct epoll_event ev; + uint64_t val; + int to = nonblocking? 0 : -1; + int efd = multiq ? w->epollfd : epollfd; + + mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); + + do { + /* + * Block indefinitely waiting for the IN event. + * In order to stress the epoll_wait(2) syscall, + * call it event per event, instead of a larger + * batch (max)limit. + */ + do { + ret = epoll_wait(efd, &ev, 1, to); + } while (ret < 0 && errno == EINTR); + if (ret < 0) + err(EXIT_FAILURE, "epoll_wait"); + + fd = ev.data.fd; + + do { + r = read(fd, &val, sizeof(val)); + } while (!done && (r < 0 && errno == EAGAIN)); + + if (et) { + ev.events = EPOLLIN | EPOLLET; + ret = epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev); + } + + if (oneshot) { + /* rearm the file descriptor with a new event mask */ + ev.events |= EPOLLIN | EPOLLONESHOT; + ret = epoll_ctl(efd, EPOLL_CTL_MOD, fd, &ev); + } + + ops++; + } while (!done); + + if (multiq) + close(w->epollfd); + + w->ops = ops; + return NULL; +} + +static void nest_epollfd(struct worker *w) +{ + unsigned int i; + struct epoll_event ev; + int efd = multiq ? w->epollfd : epollfd; + + if (nested > EPOLL_MAXNESTS) + nested = EPOLL_MAXNESTS; + + epollfdp = calloc(nested, sizeof(*epollfdp)); + if (!epollfdp) + err(EXIT_FAILURE, "calloc"); + + for (i = 0; i < nested; i++) { + epollfdp[i] = epoll_create(1); + if (epollfdp[i] < 0) + err(EXIT_FAILURE, "epoll_create"); + } + + ev.events = EPOLLHUP; /* anything */ + ev.data.u64 = i; /* any number */ + + for (i = nested - 1; i; i--) { + if (epoll_ctl(epollfdp[i - 1], EPOLL_CTL_ADD, + epollfdp[i], &ev) < 0) + err(EXIT_FAILURE, "epoll_ctl"); + } + + if (epoll_ctl(efd, EPOLL_CTL_ADD, *epollfdp, &ev) < 0) + err(EXIT_FAILURE, "epoll_ctl"); +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + /* inform all threads that we're done for the day */ + done = true; + gettimeofday(&bench__end, NULL); + timersub(&bench__end, &bench__start, &bench__runtime); +} + +static void print_summary(void) +{ + unsigned long avg = avg_stats(&throughput_stats); + double stddev = stddev_stats(&throughput_stats); + + printf("\nAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", + avg, rel_stddev_stats(stddev, avg), + (int)bench__runtime.tv_sec); +} + +static int do_threads(struct worker *worker, struct perf_cpu_map *cpu) +{ + pthread_attr_t thread_attr, *attrp = NULL; + cpu_set_t *cpuset; + unsigned int i, j; + int ret = 0, events = EPOLLIN; + int nrcpus; + size_t size; + + if (oneshot) + events |= EPOLLONESHOT; + if (et) + events |= EPOLLET; + + printinfo("starting worker/consumer %sthreads%s\n", + noaffinity ? "":"CPU affinity ", + nonblocking ? " (nonblocking)":""); + if (!noaffinity) + pthread_attr_init(&thread_attr); + + nrcpus = cpu__max_cpu().cpu; + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + + for (i = 0; i < nthreads; i++) { + struct worker *w = &worker[i]; + + if (multiq) { + w->epollfd = epoll_create(1); + if (w->epollfd < 0) + err(EXIT_FAILURE, "epoll_create"); + + if (nested) + nest_epollfd(w); + } + + w->tid = i; + w->fdmap = calloc(nfds, sizeof(int)); + if (!w->fdmap) + return 1; + + for (j = 0; j < nfds; j++) { + int efd = multiq ? w->epollfd : epollfd; + struct epoll_event ev; + + w->fdmap[j] = eventfd(0, EFD_NONBLOCK); + if (w->fdmap[j] < 0) + err(EXIT_FAILURE, "eventfd"); + + ev.data.fd = w->fdmap[j]; + ev.events = events; + + ret = epoll_ctl(efd, EPOLL_CTL_ADD, + w->fdmap[j], &ev); + if (ret < 0) + err(EXIT_FAILURE, "epoll_ctl"); + } + + if (!noaffinity) { + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, + size, cpuset); + + ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset); + if (ret) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } + + attrp = &thread_attr; + } + + ret = pthread_create(&w->thread, attrp, workerfn, + (void *)(struct worker *) w); + if (ret) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_create"); + } + } + + CPU_FREE(cpuset); + if (!noaffinity) + pthread_attr_destroy(&thread_attr); + + return ret; +} + +static void *writerfn(void *p) +{ + struct worker *worker = p; + size_t i, j, iter; + const uint64_t val = 1; + ssize_t sz; + struct timespec ts = { .tv_sec = 0, + .tv_nsec = 500 }; + + printinfo("starting writer-thread: doing %s writes ...\n", + randomize? "random":"lineal"); + + for (iter = 0; !wdone; iter++) { + if (randomize) { + shuffle((void *)worker, nthreads, sizeof(*worker)); + } + + for (i = 0; i < nthreads; i++) { + struct worker *w = &worker[i]; + + if (randomize) { + shuffle((void *)w->fdmap, nfds, sizeof(int)); + } + + for (j = 0; j < nfds; j++) { + do { + sz = write(w->fdmap[j], &val, sizeof(val)); + } while (!wdone && (sz < 0 && errno == EAGAIN)); + } + } + + nanosleep(&ts, NULL); + } + + printinfo("exiting writer-thread (total full-loops: %zd)\n", iter); + return NULL; +} + +static int cmpworker(const void *p1, const void *p2) +{ + + struct worker *w1 = (struct worker *) p1; + struct worker *w2 = (struct worker *) p2; + + if (w1->tid > w2->tid) + return 1; + if (w1->tid < w2->tid) + return -1; + return 0; +} + +int bench_epoll_wait(int argc, const char **argv) +{ + int ret = 0; + struct sigaction act; + unsigned int i; + struct worker *worker = NULL; + struct perf_cpu_map *cpu; + pthread_t wthread; + struct rlimit rl, prevrl; + + argc = parse_options(argc, argv, options, bench_epoll_wait_usage, 0); + if (argc) { + usage_with_options(bench_epoll_wait_usage, options); + exit(EXIT_FAILURE); + } + + memset(&act, 0, sizeof(act)); + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + cpu = perf_cpu_map__new_online_cpus(); + if (!cpu) + goto errmem; + + /* a single, main epoll instance */ + if (!multiq) { + epollfd = epoll_create(1); + if (epollfd < 0) + err(EXIT_FAILURE, "epoll_create"); + + /* + * Deal with nested epolls, if any. + */ + if (nested) + nest_epollfd(NULL); + } + + printinfo("Using %s queue model\n", multiq ? "multi" : "single"); + printinfo("Nesting level(s): %d\n", nested); + + /* default to the number of CPUs and leave one for the writer pthread */ + if (!nthreads) + nthreads = perf_cpu_map__nr(cpu) - 1; + + worker = calloc(nthreads, sizeof(*worker)); + if (!worker) { + goto errmem; + } + + if (getrlimit(RLIMIT_NOFILE, &prevrl)) + err(EXIT_FAILURE, "getrlimit"); + rl.rlim_cur = rl.rlim_max = nfds * nthreads * 2 + 50; + printinfo("Setting RLIMIT_NOFILE rlimit from %" PRIu64 " to: %" PRIu64 "\n", + (uint64_t)prevrl.rlim_max, (uint64_t)rl.rlim_max); + if (setrlimit(RLIMIT_NOFILE, &rl) < 0) + err(EXIT_FAILURE, "setrlimit"); + + printf("Run summary [PID %d]: %d threads monitoring%s on " + "%d file-descriptors for %d secs.\n\n", + getpid(), nthreads, oneshot ? " (EPOLLONESHOT semantics)": "", nfds, nsecs); + + init_stats(&throughput_stats); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); + + threads_starting = nthreads; + + gettimeofday(&bench__start, NULL); + + do_threads(worker, cpu); + + mutex_lock(&thread_lock); + while (threads_starting) + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); + + /* + * At this point the workers should be blocked waiting for read events + * to become ready. Launch the writer which will constantly be writing + * to each thread's fdmap. + */ + ret = pthread_create(&wthread, NULL, writerfn, + (void *)(struct worker *) worker); + if (ret) + err(EXIT_FAILURE, "pthread_create"); + + sleep(nsecs); + toggle_done(0, NULL, NULL); + printinfo("main thread: toggling done\n"); + + sleep(1); /* meh */ + wdone = true; + ret = pthread_join(wthread, NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + + /* cleanup & report results */ + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); + + /* sort the array back before reporting */ + if (randomize) + qsort(worker, nthreads, sizeof(struct worker), cmpworker); + + for (i = 0; i < nthreads; i++) { + unsigned long t = bench__runtime.tv_sec > 0 ? + worker[i].ops / bench__runtime.tv_sec : 0; + + update_stats(&throughput_stats, t); + + if (nfds == 1) + printf("[thread %2d] fdmap: %p [ %04ld ops/sec ]\n", + worker[i].tid, &worker[i].fdmap[0], t); + else + printf("[thread %2d] fdmap: %p ... %p [ %04ld ops/sec ]\n", + worker[i].tid, &worker[i].fdmap[0], + &worker[i].fdmap[nfds-1], t); + } + + print_summary(); + + close(epollfd); + perf_cpu_map__put(cpu); + for (i = 0; i < nthreads; i++) + free(worker[i].fdmap); + + free(worker); + return ret; +errmem: + err(EXIT_FAILURE, "calloc"); +} +#endif // HAVE_EVENTFD_SUPPORT diff --git a/tools/perf/bench/evlist-open-close.c b/tools/perf/bench/evlist-open-close.c new file mode 100644 index 000000000000..faf9c34b4a5d --- /dev/null +++ b/tools/perf/bench/evlist-open-close.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <errno.h> +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include "bench.h" +#include "../util/debug.h" +#include "../util/stat.h" +#include "../util/evlist.h" +#include "../util/evsel.h" +#include "../util/strbuf.h" +#include "../util/record.h" +#include "../util/parse-events.h" +#include "internal/threadmap.h" +#include "internal/cpumap.h" +#include <linux/perf_event.h> +#include <linux/kernel.h> +#include <linux/time64.h> +#include <linux/string.h> +#include <subcmd/parse-options.h> + +#define MMAP_FLUSH_DEFAULT 1 + +static int iterations = 100; +static int nr_events = 1; +static const char *event_string = "dummy"; + +static inline u64 timeval2usec(struct timeval *tv) +{ + return tv->tv_sec * USEC_PER_SEC + tv->tv_usec; +} + +static struct record_opts opts = { + .sample_time = true, + .mmap_pages = UINT_MAX, + .user_freq = UINT_MAX, + .user_interval = ULLONG_MAX, + .freq = 4000, + .target = { + .uses_mmap = true, + .default_per_cpu = true, + }, + .mmap_flush = MMAP_FLUSH_DEFAULT, + .nr_threads_synthesize = 1, + .ctl_fd = -1, + .ctl_fd_ack = -1, +}; + +static int evlist__count_evsel_fds(struct evlist *evlist) +{ + struct evsel *evsel; + int cnt = 0; + + evlist__for_each_entry(evlist, evsel) + cnt += evsel->core.threads->nr * perf_cpu_map__nr(evsel->core.cpus); + + return cnt; +} + +static struct evlist *bench__create_evlist(char *evstr, const char *uid_str) +{ + struct parse_events_error err; + struct evlist *evlist = evlist__new(); + int ret; + + if (!evlist) { + pr_err("Not enough memory to create evlist\n"); + return NULL; + } + + parse_events_error__init(&err); + ret = parse_events(evlist, evstr, &err); + if (ret) { + parse_events_error__print(&err, evstr); + parse_events_error__exit(&err); + pr_err("Run 'perf list' for a list of valid events\n"); + ret = 1; + goto out_delete_evlist; + } + parse_events_error__exit(&err); + if (uid_str) { + uid_t uid = parse_uid(uid_str); + + if (uid == UINT_MAX) { + pr_err("Invalid User: %s", uid_str); + ret = -EINVAL; + goto out_delete_evlist; + } + ret = parse_uid_filter(evlist, uid); + if (ret) + goto out_delete_evlist; + } + ret = evlist__create_maps(evlist, &opts.target); + if (ret < 0) { + pr_err("Not enough memory to create thread/cpu maps\n"); + goto out_delete_evlist; + } + + evlist__config(evlist, &opts, NULL); + + return evlist; + +out_delete_evlist: + evlist__delete(evlist); + return NULL; +} + +static int bench__do_evlist_open_close(struct evlist *evlist) +{ + char sbuf[STRERR_BUFSIZE]; + int err = evlist__open(evlist); + + if (err < 0) { + pr_err("evlist__open: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); + return err; + } + + err = evlist__mmap(evlist, opts.mmap_pages); + if (err < 0) { + pr_err("evlist__mmap: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); + return err; + } + + evlist__enable(evlist); + evlist__disable(evlist); + evlist__munmap(evlist); + evlist__close(evlist); + + return 0; +} + +static int bench_evlist_open_close__run(char *evstr, const char *uid_str) +{ + // used to print statistics only + struct evlist *evlist = bench__create_evlist(evstr, uid_str); + double time_average, time_stddev; + struct timeval start, end, diff; + struct stats time_stats; + u64 runtime_us; + int i, err; + + if (!evlist) + return -ENOMEM; + + init_stats(&time_stats); + + printf(" Number of cpus:\t%d\n", perf_cpu_map__nr(evlist->core.user_requested_cpus)); + printf(" Number of threads:\t%d\n", evlist->core.threads->nr); + printf(" Number of events:\t%d (%d fds)\n", + evlist->core.nr_entries, evlist__count_evsel_fds(evlist)); + printf(" Number of iterations:\t%d\n", iterations); + + evlist__delete(evlist); + + for (i = 0; i < iterations; i++) { + pr_debug("Started iteration %d\n", i); + evlist = bench__create_evlist(evstr, uid_str); + if (!evlist) + return -ENOMEM; + + gettimeofday(&start, NULL); + err = bench__do_evlist_open_close(evlist); + if (err) { + evlist__delete(evlist); + return err; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = timeval2usec(&diff); + update_stats(&time_stats, runtime_us); + + evlist__delete(evlist); + pr_debug("Iteration %d took:\t%" PRIu64 "us\n", i, runtime_us); + } + + time_average = avg_stats(&time_stats); + time_stddev = stddev_stats(&time_stats); + printf(" Average open-close took: %.3f usec (+- %.3f usec)\n", time_average, time_stddev); + + return 0; +} + +static char *bench__repeat_event_string(const char *evstr, int n) +{ + char sbuf[STRERR_BUFSIZE]; + struct strbuf buf; + int i, str_size = strlen(evstr), + final_size = str_size * n + n, + err = strbuf_init(&buf, final_size); + + if (err) { + pr_err("strbuf_init: %s\n", str_error_r(err, sbuf, sizeof(sbuf))); + goto out_error; + } + + for (i = 0; i < n; i++) { + err = strbuf_add(&buf, evstr, str_size); + if (err) { + pr_err("strbuf_add: %s\n", str_error_r(err, sbuf, sizeof(sbuf))); + goto out_error; + } + + err = strbuf_addch(&buf, i == n-1 ? '\0' : ','); + if (err) { + pr_err("strbuf_addch: %s\n", str_error_r(err, sbuf, sizeof(sbuf))); + goto out_error; + } + } + + return strbuf_detach(&buf, NULL); + +out_error: + strbuf_release(&buf); + return NULL; +} + + +int bench_evlist_open_close(int argc, const char **argv) +{ + const char *uid_str = NULL; + const struct option options[] = { + OPT_STRING('e', "event", &event_string, "event", + "event selector. use 'perf list' to list available events"), + OPT_INTEGER('n', "nr-events", &nr_events, + "number of dummy events to create (default 1). If used with -e, it clones those events n times (1 = no change)"), + OPT_INTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average (default=100)"), + OPT_BOOLEAN('a', "all-cpus", &opts.target.system_wide, + "system-wide collection from all CPUs"), + OPT_STRING('C', "cpu", &opts.target.cpu_list, "cpu", + "list of cpus where to open events"), + OPT_STRING('p', "pid", &opts.target.pid, "pid", + "record events on existing process id"), + OPT_STRING('t', "tid", &opts.target.tid, "tid", + "record events on existing thread id"), + OPT_STRING('u', "uid", &uid_str, "user", "user to profile"), + OPT_BOOLEAN(0, "per-thread", &opts.target.per_thread, "use per-thread mmaps"), + OPT_END() + }; + const char *const bench_usage[] = { + "perf bench internals evlist-open-close <options>", + NULL + }; + char *evstr, errbuf[BUFSIZ]; + int err; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + err = target__validate(&opts.target); + if (err) { + target__strerror(&opts.target, err, errbuf, sizeof(errbuf)); + pr_err("%s\n", errbuf); + goto out; + } + + /* Enable ignoring missing threads when -p option is defined. */ + opts.ignore_missing_thread = opts.target.pid; + + evstr = bench__repeat_event_string(event_string, nr_events); + if (!evstr) { + err = -ENOMEM; + goto out; + } + + err = bench_evlist_open_close__run(evstr, uid_str); + + free(evstr); +out: + return err; +} diff --git a/tools/perf/bench/find-bit-bench.c b/tools/perf/bench/find-bit-bench.c new file mode 100644 index 000000000000..e697c20951bc --- /dev/null +++ b/tools/perf/bench/find-bit-bench.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark find_next_bit and related bit operations. + * + * Copyright 2020 Google LLC. + */ +#include <stdlib.h> +#include "bench.h" +#include "../util/stat.h" +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/time64.h> +#include <subcmd/parse-options.h> + +static unsigned int outer_iterations = 5; +static unsigned int inner_iterations = 100000; + +static const struct option options[] = { + OPT_UINTEGER('i', "outer-iterations", &outer_iterations, + "Number of outer iterations used"), + OPT_UINTEGER('j', "inner-iterations", &inner_iterations, + "Number of inner iterations used"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench mem find_bit <options>", + NULL +}; + +static unsigned int accumulator; +static unsigned int use_of_val; + +static noinline void workload(int val) +{ + use_of_val += val; + accumulator++; +} + +#if defined(__i386__) || defined(__x86_64__) +static bool asm_test_bit(long nr, const unsigned long *addr) +{ + bool oldbit; + + asm volatile("bt %2,%1" + : "=@ccc" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); + + return oldbit; +} +#else +#define asm_test_bit test_bit +#endif + +static int do_for_each_set_bit(unsigned int num_bits) +{ + unsigned long *to_test = bitmap_zalloc(num_bits); + struct timeval start, end, diff; + u64 runtime_us; + struct stats fb_time_stats, tb_time_stats; + double time_average, time_stddev; + unsigned int bit, i, j; + unsigned int set_bits, skip; + + init_stats(&fb_time_stats); + init_stats(&tb_time_stats); + + for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) { + bitmap_zero(to_test, num_bits); + skip = num_bits / set_bits; + for (i = 0; i < num_bits; i += skip) + __set_bit(i, to_test); + + for (i = 0; i < outer_iterations; i++) { +#ifndef NDEBUG + unsigned int old = accumulator; +#endif + + gettimeofday(&start, NULL); + for (j = 0; j < inner_iterations; j++) { + for_each_set_bit(bit, to_test, num_bits) + workload(bit); + } + gettimeofday(&end, NULL); + assert(old + (inner_iterations * set_bits) == accumulator); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&fb_time_stats, runtime_us); + +#ifndef NDEBUG + old = accumulator; +#endif + gettimeofday(&start, NULL); + for (j = 0; j < inner_iterations; j++) { + for (bit = 0; bit < num_bits; bit++) { + if (asm_test_bit(bit, to_test)) + workload(bit); + } + } + gettimeofday(&end, NULL); + assert(old + (inner_iterations * set_bits) == accumulator); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&tb_time_stats, runtime_us); + } + + printf("%d operations %d bits set of %d bits\n", + inner_iterations, set_bits, num_bits); + time_average = avg_stats(&fb_time_stats); + time_stddev = stddev_stats(&fb_time_stats); + printf(" Average for_each_set_bit took: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + time_average = avg_stats(&tb_time_stats); + time_stddev = stddev_stats(&tb_time_stats); + printf(" Average test_bit loop took: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + + if (use_of_val == accumulator) /* Try to avoid compiler tricks. */ + printf("\n"); + } + bitmap_free(to_test); + return 0; +} + +int bench_mem_find_bit(int argc, const char **argv) +{ + int err = 0, i; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + for (i = 1; i <= 2048; i <<= 1) + do_for_each_set_bit(i); + + return err; +} diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c new file mode 100644 index 000000000000..7e29f04da744 --- /dev/null +++ b/tools/perf/bench/futex-hash.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Davidlohr Bueso <davidlohr@hp.com> + * + * futex-hash: Stress the hell out of the Linux kernel futex uaddr hashing. + * + * This program is particularly useful for measuring the kernel's futex hash + * table/function implementation. In order for it to make sense, use with as + * many threads and futexes as possible. + */ + +/* For the CLR_() macros */ +#include <string.h> +#include <pthread.h> + +#include <errno.h> +#include <signal.h> +#include <stdlib.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/zalloc.h> +#include <sys/time.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <perf/cpumap.h> + +#include "../util/mutex.h" +#include "../util/stat.h" +#include <subcmd/parse-options.h> +#include "bench.h" +#include "futex.h" + +#include <err.h> + +static bool done = false; +static int futex_flag = 0; + +struct timeval bench__start, bench__end, bench__runtime; +static struct mutex thread_lock; +static unsigned int threads_starting; +static struct stats throughput_stats; +static struct cond thread_parent, thread_worker; + +struct worker { + int tid; + u_int32_t *futex; + pthread_t thread; + unsigned long ops; +}; + +static struct bench_futex_parameters params = { + .nfutexes = 1024, + .runtime = 10, + .nbuckets = -1, +}; + +static const struct option options[] = { + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('r', "runtime", ¶ms.runtime, "Specify runtime (in seconds)"), + OPT_UINTEGER('f', "futexes", ¶ms.nfutexes, "Specify amount of futexes per threads"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), + OPT_END() +}; + +static const char * const bench_futex_hash_usage[] = { + "perf bench futex hash <options>", + NULL +}; + +static void *workerfn(void *arg) +{ + int ret; + struct worker *w = (struct worker *) arg; + unsigned int i; + unsigned long ops = w->ops; /* avoid cacheline bouncing */ + + mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); + + do { + for (i = 0; i < params.nfutexes; i++, ops++) { + /* + * We want the futex calls to fail in order to stress + * the hashing of uaddr and not measure other steps, + * such as internal waitqueue handling, thus enlarging + * the critical region protected by hb->lock. + */ + ret = futex_wait(&w->futex[i], 1234, NULL, futex_flag); + if (!params.silent && + (!ret || errno != EAGAIN || errno != EWOULDBLOCK)) + warn("Non-expected futex return call"); + } + } while (!done); + + w->ops = ops; + return NULL; +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + /* inform all threads that we're done for the day */ + done = true; + gettimeofday(&bench__end, NULL); + timersub(&bench__end, &bench__start, &bench__runtime); +} + +static void print_summary(void) +{ + unsigned long avg = avg_stats(&throughput_stats); + double stddev = stddev_stats(&throughput_stats); + + printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", + !params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), + (int)bench__runtime.tv_sec); + futex_print_nbuckets(¶ms); +} + +int bench_futex_hash(int argc, const char **argv) +{ + int ret = 0; + cpu_set_t *cpuset; + struct sigaction act; + unsigned int i; + pthread_attr_t thread_attr; + struct worker *worker = NULL; + struct perf_cpu_map *cpu; + int nrcpus; + size_t size; + + argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0); + if (argc) { + usage_with_options(bench_futex_hash_usage, options); + exit(EXIT_FAILURE); + } + + cpu = perf_cpu_map__new_online_cpus(); + if (!cpu) + goto errmem; + + memset(&act, 0, sizeof(act)); + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + if (!params.nthreads) /* default to the number of CPUs */ + params.nthreads = perf_cpu_map__nr(cpu); + + worker = calloc(params.nthreads, sizeof(*worker)); + if (!worker) + goto errmem; + + if (!params.fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + futex_set_nbuckets_param(¶ms); + + printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n", + getpid(), params.nthreads, params.nfutexes, params.fshared ? "shared":"private", params.runtime); + + init_stats(&throughput_stats); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); + + threads_starting = params.nthreads; + pthread_attr_init(&thread_attr); + gettimeofday(&bench__start, NULL); + + nrcpus = cpu__max_cpu().cpu; + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + + for (i = 0; i < params.nthreads; i++) { + worker[i].tid = i; + worker[i].futex = calloc(params.nfutexes, sizeof(*worker[i].futex)); + if (!worker[i].futex) + goto errmem; + + CPU_ZERO_S(size, cpuset); + + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); + ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset); + if (ret) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } + ret = pthread_create(&worker[i].thread, &thread_attr, workerfn, + (void *)(struct worker *) &worker[i]); + if (ret) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_create"); + } + + } + CPU_FREE(cpuset); + pthread_attr_destroy(&thread_attr); + + mutex_lock(&thread_lock); + while (threads_starting) + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); + + sleep(params.runtime); + toggle_done(0, NULL, NULL); + + for (i = 0; i < params.nthreads; i++) { + ret = pthread_join(worker[i].thread, NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + + /* cleanup & report results */ + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); + + for (i = 0; i < params.nthreads; i++) { + unsigned long t = bench__runtime.tv_sec > 0 ? + worker[i].ops / bench__runtime.tv_sec : 0; + update_stats(&throughput_stats, t); + if (!params.silent) { + if (params.nfutexes == 1) + printf("[thread %2d] futex: %p [ %ld ops/sec ]\n", + worker[i].tid, &worker[i].futex[0], t); + else + printf("[thread %2d] futexes: %p ... %p [ %ld ops/sec ]\n", + worker[i].tid, &worker[i].futex[0], + &worker[i].futex[params.nfutexes-1], t); + } + + zfree(&worker[i].futex); + } + + print_summary(); + + free(worker); + free(cpu); + return ret; +errmem: + err(EXIT_FAILURE, "calloc"); +} diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c new file mode 100644 index 000000000000..40640b674427 --- /dev/null +++ b/tools/perf/bench/futex-lock-pi.c @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2015 Davidlohr Bueso. + */ + +/* For the CLR_() macros */ +#include <string.h> +#include <pthread.h> + +#include <signal.h> +#include "../util/mutex.h" +#include "../util/stat.h" +#include <subcmd/parse-options.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/zalloc.h> +#include <errno.h> +#include <perf/cpumap.h> +#include "bench.h" +#include "futex.h" + +#include <err.h> +#include <stdlib.h> +#include <sys/time.h> +#include <sys/mman.h> + +struct worker { + int tid; + u_int32_t *futex; + pthread_t thread; + unsigned long ops; +}; + +static u_int32_t global_futex = 0; +static struct worker *worker; +static bool done = false; +static int futex_flag = 0; +static struct mutex thread_lock; +static unsigned int threads_starting; +static struct stats throughput_stats; +static struct cond thread_parent, thread_worker; + +static struct bench_futex_parameters params = { + .nbuckets = -1, + .runtime = 10, +}; + +static const struct option options[] = { + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('r', "runtime", ¶ms.runtime, "Specify runtime (in seconds)"), + OPT_BOOLEAN( 'M', "multi", ¶ms.multi, "Use multiple futexes"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), + OPT_END() +}; + +static const char * const bench_futex_lock_pi_usage[] = { + "perf bench futex lock-pi <options>", + NULL +}; + +static void print_summary(void) +{ + unsigned long avg = avg_stats(&throughput_stats); + double stddev = stddev_stats(&throughput_stats); + + printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", + !params.silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg), + (int)bench__runtime.tv_sec); + futex_print_nbuckets(¶ms); +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + /* inform all threads that we're done for the day */ + done = true; + gettimeofday(&bench__end, NULL); + timersub(&bench__end, &bench__start, &bench__runtime); +} + +static void *workerfn(void *arg) +{ + struct worker *w = (struct worker *) arg; + unsigned long ops = w->ops; + + mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); + + do { + int ret; + again: + ret = futex_lock_pi(w->futex, NULL, futex_flag); + + if (ret) { /* handle lock acquisition */ + if (!params.silent) + warn("thread %d: Could not lock pi-lock for %p (%d)", + w->tid, w->futex, ret); + if (done) + break; + + goto again; + } + + usleep(1); + ret = futex_unlock_pi(w->futex, futex_flag); + if (ret && !params.silent) + warn("thread %d: Could not unlock pi-lock for %p (%d)", + w->tid, w->futex, ret); + ops++; /* account for thread's share of work */ + } while (!done); + + w->ops = ops; + return NULL; +} + +static void create_threads(struct worker *w, struct perf_cpu_map *cpu) +{ + cpu_set_t *cpuset; + unsigned int i; + int nrcpus = cpu__max_cpu().cpu; + size_t size; + + threads_starting = params.nthreads; + + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + + for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); + worker[i].tid = i; + + if (params.multi) { + worker[i].futex = calloc(1, sizeof(u_int32_t)); + if (!worker[i].futex) + err(EXIT_FAILURE, "calloc"); + } else + worker[i].futex = &global_futex; + + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); + + if (pthread_attr_setaffinity_np(&thread_attr, size, cpuset)) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } + + if (pthread_create(&w[i].thread, &thread_attr, workerfn, &worker[i])) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_create"); + } + pthread_attr_destroy(&thread_attr); + } + CPU_FREE(cpuset); +} + +int bench_futex_lock_pi(int argc, const char **argv) +{ + int ret = 0; + unsigned int i; + struct sigaction act; + struct perf_cpu_map *cpu; + + argc = parse_options(argc, argv, options, bench_futex_lock_pi_usage, 0); + if (argc) + goto err; + + cpu = perf_cpu_map__new_online_cpus(); + if (!cpu) + err(EXIT_FAILURE, "calloc"); + + memset(&act, 0, sizeof(act)); + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + if (!params.nthreads) + params.nthreads = perf_cpu_map__nr(cpu); + + worker = calloc(params.nthreads, sizeof(*worker)); + if (!worker) + err(EXIT_FAILURE, "calloc"); + + if (!params.fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + printf("Run summary [PID %d]: %d threads doing pi lock/unlock pairing for %d secs.\n\n", + getpid(), params.nthreads, params.runtime); + + init_stats(&throughput_stats); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); + futex_set_nbuckets_param(¶ms); + + threads_starting = params.nthreads; + gettimeofday(&bench__start, NULL); + + create_threads(worker, cpu); + + mutex_lock(&thread_lock); + while (threads_starting) + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); + + sleep(params.runtime); + toggle_done(0, NULL, NULL); + + for (i = 0; i < params.nthreads; i++) { + ret = pthread_join(worker[i].thread, NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + + /* cleanup & report results */ + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); + + for (i = 0; i < params.nthreads; i++) { + unsigned long t = bench__runtime.tv_sec > 0 ? + worker[i].ops / bench__runtime.tv_sec : 0; + + update_stats(&throughput_stats, t); + if (!params.silent) + printf("[thread %3d] futex: %p [ %ld ops/sec ]\n", + worker[i].tid, worker[i].futex, t); + + if (params.multi) + zfree(&worker[i].futex); + } + + print_summary(); + + free(worker); + perf_cpu_map__put(cpu); + return ret; +err: + usage_with_options(bench_futex_lock_pi_usage, options); + exit(EXIT_FAILURE); +} diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c new file mode 100644 index 000000000000..0748b0fd689e --- /dev/null +++ b/tools/perf/bench/futex-requeue.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Davidlohr Bueso <davidlohr@hp.com> + * + * futex-requeue: Block a bunch of threads on futex1 and requeue them + * on futex2, N at a time. + * + * This program is particularly useful to measure the latency of nthread + * requeues without waking up any tasks (in the non-pi case) -- thus + * mimicking a regular futex_wait. + */ + +/* For the CLR_() macros */ +#include <string.h> +#include <pthread.h> + +#include <signal.h> +#include "../util/mutex.h" +#include "../util/stat.h" +#include <subcmd/parse-options.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/time64.h> +#include <errno.h> +#include <perf/cpumap.h> +#include "bench.h" +#include "futex.h" + +#include <err.h> +#include <stdlib.h> +#include <sys/time.h> +#include <sys/mman.h> + +static u_int32_t futex1 = 0, futex2 = 0; + +static pthread_t *worker; +static bool done = false; +static struct mutex thread_lock; +static struct cond thread_parent, thread_worker; +static struct stats requeuetime_stats, requeued_stats; +static unsigned int threads_starting; +static int futex_flag = 0; + +static struct bench_futex_parameters params = { + .nbuckets = -1, + /* + * How many tasks to requeue at a time. + * Default to 1 in order to make the kernel work more. + */ + .nrequeue = 1, +}; + +static const struct option options[] = { + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('q', "nrequeue", ¶ms.nrequeue, "Specify amount of threads to requeue at once"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), + OPT_BOOLEAN( 'B', "broadcast", ¶ms.broadcast, "Requeue all threads at once"), + OPT_BOOLEAN( 'p', "pi", ¶ms.pi, "Use PI-aware variants of FUTEX_CMP_REQUEUE"), + + OPT_END() +}; + +static const char * const bench_futex_requeue_usage[] = { + "perf bench futex requeue <options>", + NULL +}; + +static void print_summary(void) +{ + double requeuetime_avg = avg_stats(&requeuetime_stats); + double requeuetime_stddev = stddev_stats(&requeuetime_stats); + unsigned int requeued_avg = avg_stats(&requeued_stats); + + printf("Requeued %d of %d threads in %.4f ms (+-%.2f%%)\n", + requeued_avg, + params.nthreads, + requeuetime_avg / USEC_PER_MSEC, + rel_stddev_stats(requeuetime_stddev, requeuetime_avg)); + futex_print_nbuckets(¶ms); +} + +static void *workerfn(void *arg __maybe_unused) +{ + int ret; + + mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); + + while (1) { + if (!params.pi) { + ret = futex_wait(&futex1, 0, NULL, futex_flag); + if (!ret) + break; + + if (ret && errno != EAGAIN) { + if (!params.silent) + warnx("futex_wait"); + break; + } + } else { + ret = futex_wait_requeue_pi(&futex1, 0, &futex2, + NULL, futex_flag); + if (!ret) { + /* got the lock at futex2 */ + futex_unlock_pi(&futex2, futex_flag); + break; + } + + if (ret && errno != EAGAIN) { + if (!params.silent) + warnx("futex_wait_requeue_pi"); + break; + } + } + } + + return NULL; +} + +static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) +{ + cpu_set_t *cpuset; + unsigned int i; + int nrcpus = cpu__max_cpu().cpu; + size_t size; + + threads_starting = params.nthreads; + + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + + /* create and block all threads */ + for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); + + if (pthread_attr_setaffinity_np(&thread_attr, size, cpuset)) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } + + if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_create"); + } + pthread_attr_destroy(&thread_attr); + } + CPU_FREE(cpuset); +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + done = true; +} + +int bench_futex_requeue(int argc, const char **argv) +{ + int ret = 0; + unsigned int i, j; + struct sigaction act; + struct perf_cpu_map *cpu; + + argc = parse_options(argc, argv, options, bench_futex_requeue_usage, 0); + if (argc) + goto err; + + cpu = perf_cpu_map__new_online_cpus(); + if (!cpu) + err(EXIT_FAILURE, "cpu_map__new"); + + memset(&act, 0, sizeof(act)); + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + if (!params.nthreads) + params.nthreads = perf_cpu_map__nr(cpu); + + worker = calloc(params.nthreads, sizeof(*worker)); + if (!worker) + err(EXIT_FAILURE, "calloc"); + + if (!params.fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + if (params.nrequeue > params.nthreads) + params.nrequeue = params.nthreads; + + if (params.broadcast) + params.nrequeue = params.nthreads; + + futex_set_nbuckets_param(¶ms); + + printf("Run summary [PID %d]: Requeuing %d threads (from [%s] %p to %s%p), " + "%d at a time.\n\n", getpid(), params.nthreads, + params.fshared ? "shared":"private", &futex1, + params.pi ? "PI ": "", &futex2, params.nrequeue); + + init_stats(&requeued_stats); + init_stats(&requeuetime_stats); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); + + for (j = 0; j < bench_repeat && !done; j++) { + unsigned int nrequeued = 0, wakeups = 0; + struct timeval start, end, runtime; + + /* create, launch & block all threads */ + block_threads(worker, cpu); + + /* make sure all threads are already blocked */ + mutex_lock(&thread_lock); + while (threads_starting) + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); + + usleep(100000); + + /* Ok, all threads are patiently blocked, start requeueing */ + gettimeofday(&start, NULL); + while (nrequeued < params.nthreads) { + int r; + + /* + * For the regular non-pi case, do not wakeup any tasks + * blocked on futex1, allowing us to really measure + * futex_wait functionality. For the PI case the first + * waiter is always awoken. + */ + if (!params.pi) { + r = futex_cmp_requeue(&futex1, 0, &futex2, 0, + params.nrequeue, + futex_flag); + } else { + r = futex_cmp_requeue_pi(&futex1, 0, &futex2, + params.nrequeue, + futex_flag); + wakeups++; /* assume no error */ + } + + if (r < 0) + err(EXIT_FAILURE, "couldn't requeue from %p to %p", + &futex1, &futex2); + + nrequeued += r; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &runtime); + + update_stats(&requeued_stats, nrequeued); + update_stats(&requeuetime_stats, runtime.tv_usec); + + if (!params.silent) { + if (!params.pi) + printf("[Run %d]: Requeued %d of %d threads in " + "%.4f ms\n", j + 1, nrequeued, + params.nthreads, + runtime.tv_usec / (double)USEC_PER_MSEC); + else { + nrequeued -= wakeups; + printf("[Run %d]: Awoke and Requeued (%d+%d) of " + "%d threads in %.4f ms\n", + j + 1, wakeups, nrequeued, + params.nthreads, + runtime.tv_usec / (double)USEC_PER_MSEC); + } + + } + + if (!params.pi) { + /* everybody should be blocked on futex2, wake'em up */ + nrequeued = futex_wake(&futex2, nrequeued, futex_flag); + if (params.nthreads != nrequeued) + warnx("couldn't wakeup all tasks (%d/%d)", + nrequeued, params.nthreads); + } + + for (i = 0; i < params.nthreads; i++) { + ret = pthread_join(worker[i], NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + } + + /* cleanup & report results */ + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); + + print_summary(); + + free(worker); + perf_cpu_map__put(cpu); + return ret; +err: + usage_with_options(bench_futex_requeue_usage, options); + exit(EXIT_FAILURE); +} diff --git a/tools/perf/bench/futex-wake-parallel.c b/tools/perf/bench/futex-wake-parallel.c new file mode 100644 index 000000000000..6aede7c46b33 --- /dev/null +++ b/tools/perf/bench/futex-wake-parallel.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2015 Davidlohr Bueso. + * + * Block a bunch of threads and let parallel waker threads wakeup an + * equal amount of them. The program output reflects the avg latency + * for each individual thread to service its share of work. Ultimately + * it can be used to measure futex_wake() changes. + */ +#include "bench.h" +#include <linux/compiler.h> +#include "../util/debug.h" +#include "../util/mutex.h" + +#ifndef HAVE_PTHREAD_BARRIER +int bench_futex_wake_parallel(int argc __maybe_unused, const char **argv __maybe_unused) +{ + pr_err("%s: pthread_barrier_t unavailable, disabling this test...\n", __func__); + return 0; +} +#else /* HAVE_PTHREAD_BARRIER */ +/* For the CLR_() macros */ +#include <string.h> +#include <pthread.h> + +#include <signal.h> +#include "../util/stat.h" +#include <subcmd/parse-options.h> +#include <linux/kernel.h> +#include <linux/time64.h> +#include <errno.h> +#include "futex.h" +#include <perf/cpumap.h> + +#include <err.h> +#include <stdlib.h> +#include <sys/time.h> +#include <sys/mman.h> + +struct thread_data { + pthread_t worker; + unsigned int nwoken; + struct timeval runtime; +}; + +static unsigned int nwakes = 1; + +/* all threads will block on the same futex -- hash bucket chaos ;) */ +static u_int32_t futex = 0; + +static pthread_t *blocked_worker; +static bool done = false; +static struct mutex thread_lock; +static struct cond thread_parent, thread_worker; +static pthread_barrier_t barrier; +static struct stats waketime_stats, wakeup_stats; +static unsigned int threads_starting; +static int futex_flag = 0; + +static struct bench_futex_parameters params = { + .nbuckets = -1, +}; + +static const struct option options[] = { + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('w', "nwakers", ¶ms.nwakes, "Specify amount of waking threads"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), + + OPT_END() +}; + +static const char * const bench_futex_wake_parallel_usage[] = { + "perf bench futex wake-parallel <options>", + NULL +}; + +static void *waking_workerfn(void *arg) +{ + struct thread_data *waker = (struct thread_data *) arg; + struct timeval start, end; + + pthread_barrier_wait(&barrier); + + gettimeofday(&start, NULL); + + waker->nwoken = futex_wake(&futex, nwakes, futex_flag); + if (waker->nwoken != nwakes) + warnx("couldn't wakeup all tasks (%d/%d)", + waker->nwoken, nwakes); + + gettimeofday(&end, NULL); + timersub(&end, &start, &waker->runtime); + + pthread_exit(NULL); + return NULL; +} + +static void wakeup_threads(struct thread_data *td) +{ + unsigned int i; + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); + pthread_attr_setdetachstate(&thread_attr, PTHREAD_CREATE_JOINABLE); + + pthread_barrier_init(&barrier, NULL, params.nwakes + 1); + + /* create and block all threads */ + for (i = 0; i < params.nwakes; i++) { + /* + * Thread creation order will impact per-thread latency + * as it will affect the order to acquire the hb spinlock. + * For now let the scheduler decide. + */ + if (pthread_create(&td[i].worker, &thread_attr, + waking_workerfn, (void *)&td[i])) + err(EXIT_FAILURE, "pthread_create"); + } + + pthread_barrier_wait(&barrier); + + for (i = 0; i < params.nwakes; i++) + if (pthread_join(td[i].worker, NULL)) + err(EXIT_FAILURE, "pthread_join"); + + pthread_barrier_destroy(&barrier); + pthread_attr_destroy(&thread_attr); +} + +static void *blocked_workerfn(void *arg __maybe_unused) +{ + mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); + + while (1) { /* handle spurious wakeups */ + if (futex_wait(&futex, 0, NULL, futex_flag) != EINTR) + break; + } + + pthread_exit(NULL); + return NULL; +} + +static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) +{ + cpu_set_t *cpuset; + unsigned int i; + int nrcpus = cpu__max_cpu().cpu; + size_t size; + + threads_starting = params.nthreads; + + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + + /* create and block all threads */ + for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); + + if (pthread_attr_setaffinity_np(&thread_attr, size, cpuset)) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } + + if (pthread_create(&w[i], &thread_attr, blocked_workerfn, NULL)) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_create"); + } + pthread_attr_destroy(&thread_attr); + } + CPU_FREE(cpuset); +} + +static void print_run(struct thread_data *waking_worker, unsigned int run_num) +{ + unsigned int i, wakeup_avg; + double waketime_avg, waketime_stddev; + struct stats __waketime_stats, __wakeup_stats; + + init_stats(&__wakeup_stats); + init_stats(&__waketime_stats); + + for (i = 0; i < params.nwakes; i++) { + update_stats(&__waketime_stats, waking_worker[i].runtime.tv_usec); + update_stats(&__wakeup_stats, waking_worker[i].nwoken); + } + + waketime_avg = avg_stats(&__waketime_stats); + waketime_stddev = stddev_stats(&__waketime_stats); + wakeup_avg = avg_stats(&__wakeup_stats); + + printf("[Run %d]: Avg per-thread latency (waking %d/%d threads) " + "in %.4f ms (+-%.2f%%)\n", run_num + 1, wakeup_avg, + params.nthreads, waketime_avg / USEC_PER_MSEC, + rel_stddev_stats(waketime_stddev, waketime_avg)); +} + +static void print_summary(void) +{ + unsigned int wakeup_avg; + double waketime_avg, waketime_stddev; + + waketime_avg = avg_stats(&waketime_stats); + waketime_stddev = stddev_stats(&waketime_stats); + wakeup_avg = avg_stats(&wakeup_stats); + + printf("Avg per-thread latency (waking %d/%d threads) in %.4f ms (+-%.2f%%)\n", + wakeup_avg, + params.nthreads, + waketime_avg / USEC_PER_MSEC, + rel_stddev_stats(waketime_stddev, waketime_avg)); + futex_print_nbuckets(¶ms); +} + + +static void do_run_stats(struct thread_data *waking_worker) +{ + unsigned int i; + + for (i = 0; i < params.nwakes; i++) { + update_stats(&waketime_stats, waking_worker[i].runtime.tv_usec); + update_stats(&wakeup_stats, waking_worker[i].nwoken); + } + +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + done = true; +} + +int bench_futex_wake_parallel(int argc, const char **argv) +{ + int ret = 0; + unsigned int i, j; + struct sigaction act; + struct thread_data *waking_worker; + struct perf_cpu_map *cpu; + + argc = parse_options(argc, argv, options, + bench_futex_wake_parallel_usage, 0); + if (argc) { + usage_with_options(bench_futex_wake_parallel_usage, options); + exit(EXIT_FAILURE); + } + + memset(&act, 0, sizeof(act)); + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + cpu = perf_cpu_map__new_online_cpus(); + if (!cpu) + err(EXIT_FAILURE, "calloc"); + + if (!params.nthreads) + params.nthreads = perf_cpu_map__nr(cpu); + + /* some sanity checks */ + if (params.nwakes > params.nthreads || + !params.nwakes) + params.nwakes = params.nthreads; + + if (params.nthreads % params.nwakes) + errx(EXIT_FAILURE, "Must be perfectly divisible"); + /* + * Each thread will wakeup nwakes tasks in + * a single futex_wait call. + */ + nwakes = params.nthreads/params.nwakes; + + blocked_worker = calloc(params.nthreads, sizeof(*blocked_worker)); + if (!blocked_worker) + err(EXIT_FAILURE, "calloc"); + + if (!params.fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + futex_set_nbuckets_param(¶ms); + + printf("Run summary [PID %d]: blocking on %d threads (at [%s] " + "futex %p), %d threads waking up %d at a time.\n\n", + getpid(), params.nthreads, params.fshared ? "shared":"private", + &futex, params.nwakes, nwakes); + + init_stats(&wakeup_stats); + init_stats(&waketime_stats); + + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); + + for (j = 0; j < bench_repeat && !done; j++) { + waking_worker = calloc(params.nwakes, sizeof(*waking_worker)); + if (!waking_worker) + err(EXIT_FAILURE, "calloc"); + + /* create, launch & block all threads */ + block_threads(blocked_worker, cpu); + + /* make sure all threads are already blocked */ + mutex_lock(&thread_lock); + while (threads_starting) + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); + + usleep(200000); + + /* Ok, all threads are patiently blocked, start waking folks up */ + wakeup_threads(waking_worker); + + for (i = 0; i < params.nthreads; i++) { + ret = pthread_join(blocked_worker[i], NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + + do_run_stats(waking_worker); + if (!params.silent) + print_run(waking_worker, j); + + free(waking_worker); + } + + /* cleanup & report results */ + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); + + print_summary(); + + free(blocked_worker); + perf_cpu_map__put(cpu); + return ret; +} +#endif /* HAVE_PTHREAD_BARRIER */ diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c new file mode 100644 index 000000000000..a31fc1563862 --- /dev/null +++ b/tools/perf/bench/futex-wake.c @@ -0,0 +1,241 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 Davidlohr Bueso <davidlohr@hp.com> + * + * futex-wake: Block a bunch of threads on a futex and wake'em up, N at a time. + * + * This program is particularly useful to measure the latency of nthread wakeups + * in non-error situations: all waiters are queued and all wake calls wakeup + * one or more tasks, and thus the waitqueue is never empty. + */ + +/* For the CLR_() macros */ +#include <string.h> +#include <pthread.h> + +#include <signal.h> +#include "../util/mutex.h" +#include "../util/stat.h" +#include <subcmd/parse-options.h> +#include <linux/compiler.h> +#include <linux/kernel.h> +#include <linux/time64.h> +#include <errno.h> +#include <perf/cpumap.h> +#include "bench.h" +#include "futex.h" + +#include <err.h> +#include <stdlib.h> +#include <sys/time.h> +#include <sys/mman.h> + +/* all threads will block on the same futex */ +static u_int32_t futex1 = 0; + +static pthread_t *worker; +static bool done = false; +static struct mutex thread_lock; +static struct cond thread_parent, thread_worker; +static struct stats waketime_stats, wakeup_stats; +static unsigned int threads_starting; +static int futex_flag = 0; + +static struct bench_futex_parameters params = { + .nbuckets = -1, + /* + * How many wakeups to do at a time. + * Default to 1 in order to make the kernel work more. + */ + .nwakes = 1, +}; + +static const struct option options[] = { + OPT_INTEGER( 'b', "buckets", ¶ms.nbuckets, "Specify amount of hash buckets"), + OPT_UINTEGER('t', "threads", ¶ms.nthreads, "Specify amount of threads"), + OPT_UINTEGER('w', "nwakes", ¶ms.nwakes, "Specify amount of threads to wake at once"), + OPT_BOOLEAN( 's', "silent", ¶ms.silent, "Silent mode: do not display data/details"), + OPT_BOOLEAN( 'S', "shared", ¶ms.fshared, "Use shared futexes instead of private ones"), + OPT_BOOLEAN( 'm', "mlockall", ¶ms.mlockall, "Lock all current and future memory"), + + OPT_END() +}; + +static const char * const bench_futex_wake_usage[] = { + "perf bench futex wake <options>", + NULL +}; + +static void *workerfn(void *arg __maybe_unused) +{ + mutex_lock(&thread_lock); + threads_starting--; + if (!threads_starting) + cond_signal(&thread_parent); + cond_wait(&thread_worker, &thread_lock); + mutex_unlock(&thread_lock); + + while (1) { + if (futex_wait(&futex1, 0, NULL, futex_flag) != EINTR) + break; + } + + pthread_exit(NULL); + return NULL; +} + +static void print_summary(void) +{ + double waketime_avg = avg_stats(&waketime_stats); + double waketime_stddev = stddev_stats(&waketime_stats); + unsigned int wakeup_avg = avg_stats(&wakeup_stats); + + printf("Wokeup %d of %d threads in %.4f ms (+-%.2f%%)\n", + wakeup_avg, + params.nthreads, + waketime_avg / USEC_PER_MSEC, + rel_stddev_stats(waketime_stddev, waketime_avg)); + futex_print_nbuckets(¶ms); +} + +static void block_threads(pthread_t *w, struct perf_cpu_map *cpu) +{ + cpu_set_t *cpuset; + unsigned int i; + size_t size; + int nrcpus = cpu__max_cpu().cpu; + threads_starting = params.nthreads; + + cpuset = CPU_ALLOC(nrcpus); + BUG_ON(!cpuset); + size = CPU_ALLOC_SIZE(nrcpus); + + /* create and block all threads */ + for (i = 0; i < params.nthreads; i++) { + pthread_attr_t thread_attr; + + pthread_attr_init(&thread_attr); + CPU_ZERO_S(size, cpuset); + CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu, size, cpuset); + + if (pthread_attr_setaffinity_np(&thread_attr, size, cpuset)) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + } + + if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) { + CPU_FREE(cpuset); + err(EXIT_FAILURE, "pthread_create"); + } + pthread_attr_destroy(&thread_attr); + } + CPU_FREE(cpuset); +} + +static void toggle_done(int sig __maybe_unused, + siginfo_t *info __maybe_unused, + void *uc __maybe_unused) +{ + done = true; +} + +int bench_futex_wake(int argc, const char **argv) +{ + int ret = 0; + unsigned int i, j; + struct sigaction act; + struct perf_cpu_map *cpu; + + argc = parse_options(argc, argv, options, bench_futex_wake_usage, 0); + if (argc) { + usage_with_options(bench_futex_wake_usage, options); + exit(EXIT_FAILURE); + } + + cpu = perf_cpu_map__new_online_cpus(); + if (!cpu) + err(EXIT_FAILURE, "calloc"); + + memset(&act, 0, sizeof(act)); + sigfillset(&act.sa_mask); + act.sa_sigaction = toggle_done; + sigaction(SIGINT, &act, NULL); + + if (params.mlockall) { + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + err(EXIT_FAILURE, "mlockall"); + } + + if (!params.nthreads) + params.nthreads = perf_cpu_map__nr(cpu); + + worker = calloc(params.nthreads, sizeof(*worker)); + if (!worker) + err(EXIT_FAILURE, "calloc"); + + if (!params.fshared) + futex_flag = FUTEX_PRIVATE_FLAG; + + printf("Run summary [PID %d]: blocking on %d threads (at [%s] futex %p), " + "waking up %d at a time.\n\n", + getpid(), params.nthreads, params.fshared ? "shared":"private", + &futex1, params.nwakes); + + init_stats(&wakeup_stats); + init_stats(&waketime_stats); + mutex_init(&thread_lock); + cond_init(&thread_parent); + cond_init(&thread_worker); + + for (j = 0; j < bench_repeat && !done; j++) { + unsigned int nwoken = 0; + struct timeval start, end, runtime; + + /* create, launch & block all threads */ + block_threads(worker, cpu); + + /* make sure all threads are already blocked */ + mutex_lock(&thread_lock); + while (threads_starting) + cond_wait(&thread_parent, &thread_lock); + cond_broadcast(&thread_worker); + mutex_unlock(&thread_lock); + + usleep(100000); + + /* Ok, all threads are patiently blocked, start waking folks up */ + gettimeofday(&start, NULL); + while (nwoken != params.nthreads) + nwoken += futex_wake(&futex1, + params.nwakes, futex_flag); + gettimeofday(&end, NULL); + timersub(&end, &start, &runtime); + + update_stats(&wakeup_stats, nwoken); + update_stats(&waketime_stats, runtime.tv_usec); + + if (!params.silent) { + printf("[Run %d]: Wokeup %d of %d threads in %.4f ms\n", + j + 1, nwoken, params.nthreads, + runtime.tv_usec / (double)USEC_PER_MSEC); + } + + for (i = 0; i < params.nthreads; i++) { + ret = pthread_join(worker[i], NULL); + if (ret) + err(EXIT_FAILURE, "pthread_join"); + } + + } + + /* cleanup & report results */ + cond_destroy(&thread_parent); + cond_destroy(&thread_worker); + mutex_destroy(&thread_lock); + + print_summary(); + + free(worker); + perf_cpu_map__put(cpu); + return ret; +} diff --git a/tools/perf/bench/futex.c b/tools/perf/bench/futex.c new file mode 100644 index 000000000000..1968c9d00b5b --- /dev/null +++ b/tools/perf/bench/futex.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <err.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/prctl.h> + +#include "futex.h" + +#ifndef PR_FUTEX_HASH +#define PR_FUTEX_HASH 78 +# define PR_FUTEX_HASH_SET_SLOTS 1 +# define PR_FUTEX_HASH_GET_SLOTS 2 +#endif // PR_FUTEX_HASH + +void futex_set_nbuckets_param(struct bench_futex_parameters *params) +{ + int ret; + + if (params->nbuckets < 0) + return; + + ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_SET_SLOTS, params->nbuckets, 0); + if (ret) { + printf("Requesting %d hash buckets failed: %d/%m\n", + params->nbuckets, ret); + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); + } +} + +void futex_print_nbuckets(struct bench_futex_parameters *params) +{ + char *futex_hash_mode; + int ret; + + ret = prctl(PR_FUTEX_HASH, PR_FUTEX_HASH_GET_SLOTS); + if (params->nbuckets >= 0) { + if (ret != params->nbuckets) { + if (ret < 0) { + printf("Can't query number of buckets: %m\n"); + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); + } + printf("Requested number of hash buckets does not currently used.\n"); + printf("Requested: %d in usage: %d\n", params->nbuckets, ret); + err(EXIT_FAILURE, "prctl(PR_FUTEX_HASH)"); + } + if (params->nbuckets == 0) + ret = asprintf(&futex_hash_mode, "Futex hashing: global hash"); + else + ret = asprintf(&futex_hash_mode, "Futex hashing: %d hash buckets", + params->nbuckets); + } else { + if (ret <= 0) { + ret = asprintf(&futex_hash_mode, "Futex hashing: global hash"); + } else { + ret = asprintf(&futex_hash_mode, "Futex hashing: auto resized to %d buckets", + ret); + } + } + if (ret < 0) + err(EXIT_FAILURE, "ENOMEM, futex_hash_mode"); + printf("%s\n", futex_hash_mode); + free(futex_hash_mode); +} diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h new file mode 100644 index 000000000000..fcb72d682cf8 --- /dev/null +++ b/tools/perf/bench/futex.h @@ -0,0 +1,151 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Glibc independent futex library for testing kernel functionality. + * Shamelessly stolen from Darren Hart <dvhltc@us.ibm.com> + * http://git.kernel.org/cgit/linux/kernel/git/dvhart/futextest.git/ + */ + +#ifndef _FUTEX_H +#define _FUTEX_H + +#include <stdbool.h> +#include <unistd.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <linux/futex.h> + +struct bench_futex_parameters { + bool silent; + bool fshared; + bool mlockall; + bool multi; /* lock-pi */ + bool pi; /* requeue-pi */ + bool broadcast; /* requeue */ + unsigned int runtime; /* seconds*/ + unsigned int nthreads; + unsigned int nfutexes; + unsigned int nwakes; + unsigned int nrequeue; + int nbuckets; +}; + +/** + * futex_syscall() - SYS_futex syscall wrapper + * @uaddr: address of first futex + * @op: futex op code + * @val: typically expected value of uaddr, but varies by op + * @timeout: typically an absolute struct timespec (except where noted + * otherwise). Overloaded by some ops + * @uaddr2: address of second futex for some ops + * @val3: varies by op + * @opflags: flags to be bitwise OR'd with op, such as FUTEX_PRIVATE_FLAG + * + * futex_syscall() is used by all the following futex op wrappers. It can also be + * used for misuse and abuse testing. Generally, the specific op wrappers + * should be used instead. + * + * These argument descriptions are the defaults for all + * like-named arguments in the following wrappers except where noted below. + */ +static inline int +futex_syscall(volatile u_int32_t *uaddr, int op, u_int32_t val, struct timespec *timeout, + volatile u_int32_t *uaddr2, int val3, int opflags) +{ + return syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3); +} + +static inline int +futex_syscall_nr_requeue(volatile u_int32_t *uaddr, int op, u_int32_t val, int nr_requeue, + volatile u_int32_t *uaddr2, int val3, int opflags) +{ + return syscall(SYS_futex, uaddr, op | opflags, val, nr_requeue, uaddr2, val3); +} + +/** + * futex_wait() - block on uaddr with optional timeout + * @timeout: relative timeout + */ +static inline int +futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags) +{ + return futex_syscall(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); +} + +/** + * futex_wake() - wake one or more tasks blocked on uaddr + * @nr_wake: wake up to this many tasks + */ +static inline int +futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) +{ + return futex_syscall(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags); +} + +/** + * futex_lock_pi() - block on uaddr as a PI mutex + */ +static inline int +futex_lock_pi(u_int32_t *uaddr, struct timespec *timeout, int opflags) +{ + return futex_syscall(uaddr, FUTEX_LOCK_PI, 0, timeout, NULL, 0, opflags); +} + +/** + * futex_unlock_pi() - release uaddr as a PI mutex, waking the top waiter + */ +static inline int +futex_unlock_pi(u_int32_t *uaddr, int opflags) +{ + return futex_syscall(uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0, opflags); +} + +/** +* futex_cmp_requeue() - requeue tasks from uaddr to uaddr2 +* @nr_wake: wake up to this many tasks +* @nr_requeue: requeue up to this many tasks +*/ +static inline int +futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wake, + int nr_requeue, int opflags) +{ + return futex_syscall_nr_requeue(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2, + val, opflags); +} + +/** + * futex_wait_requeue_pi() - block on uaddr and prepare to requeue to uaddr2 + * @uaddr: non-PI futex source + * @uaddr2: PI futex target + * + * This is the first half of the requeue_pi mechanism. It shall always be + * paired with futex_cmp_requeue_pi(). + */ +static inline int +futex_wait_requeue_pi(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, + struct timespec *timeout, int opflags) +{ + return futex_syscall(uaddr, FUTEX_WAIT_REQUEUE_PI, val, timeout, uaddr2, 0, + opflags); +} + +/** + * futex_cmp_requeue_pi() - requeue tasks from uaddr to uaddr2 + * @uaddr: non-PI futex source + * @uaddr2: PI futex target + * @nr_requeue: requeue up to this many tasks + * + * This is the second half of the requeue_pi mechanism. It shall always be + * paired with futex_wait_requeue_pi(). The first waker is always awoken. + */ +static inline int +futex_cmp_requeue_pi(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, + int nr_requeue, int opflags) +{ + return futex_syscall_nr_requeue(uaddr, FUTEX_CMP_REQUEUE_PI, 1, nr_requeue, uaddr2, + val, opflags); +} + +void futex_set_nbuckets_param(struct bench_futex_parameters *params); +void futex_print_nbuckets(struct bench_futex_parameters *params); + +#endif /* _FUTEX_H */ diff --git a/tools/perf/bench/inject-buildid.c b/tools/perf/bench/inject-buildid.c new file mode 100644 index 000000000000..aad572a78d7f --- /dev/null +++ b/tools/perf/bench/inject-buildid.c @@ -0,0 +1,486 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdlib.h> +#include <stddef.h> +#include <ftw.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <pthread.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <linux/kernel.h> +#include <linux/time64.h> +#include <linux/list.h> +#include <linux/err.h> +#include <linux/zalloc.h> +#include <internal/lib.h> +#include <subcmd/parse-options.h> + +#include "bench.h" +#include "util/data.h" +#include "util/stat.h" +#include "util/debug.h" +#include "util/symbol.h" +#include "util/session.h" +#include "util/build-id.h" +#include "util/sample.h" +#include "util/synthetic-events.h" + +#define MMAP_DEV_MAJOR 8 +#define DSO_MMAP_RATIO 4 + +static unsigned int iterations = 100; +static unsigned int nr_mmaps = 100; +static unsigned int nr_samples = 100; /* samples per mmap */ + +static u64 bench_sample_type; +static u16 bench_id_hdr_size; + +struct bench_data { + int pid; + int input_pipe[2]; + int output_pipe[2]; + pthread_t th; +}; + +struct bench_dso { + struct list_head list; + char *name; + int ino; +}; + +static int nr_dsos; +static struct bench_dso *dsos; + +extern int main(int argc, const char **argv); + +static const struct option options[] = { + OPT_UINTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average (default: 100)"), + OPT_UINTEGER('m', "nr-mmaps", &nr_mmaps, + "Number of mmap events for each iteration (default: 100)"), + OPT_UINTEGER('n', "nr-samples", &nr_samples, + "Number of sample events per mmap event (default: 100)"), + OPT_INCR('v', "verbose", &verbose, + "be more verbose (show iteration count, DSO name, etc)"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench internals inject-build-id <options>", + NULL +}; + +/* + * Helper for collect_dso that adds the given file as a dso to dso_list + * if it contains a build-id. Stops after collecting 4 times more than + * we need (for MMAP2 events). + */ +static int add_dso(const char *fpath, const struct stat *sb __maybe_unused, + int typeflag, struct FTW *ftwbuf __maybe_unused) +{ + struct bench_dso *dso = &dsos[nr_dsos]; + struct build_id bid = { .size = 0, }; + + if (typeflag == FTW_D || typeflag == FTW_SL) + return 0; + + if (filename__read_build_id(fpath, &bid) < 0) + return 0; + + dso->name = realpath(fpath, NULL); + if (dso->name == NULL) + return -1; + + dso->ino = nr_dsos++; + pr_debug2(" Adding DSO: %s\n", fpath); + + /* stop if we collected enough DSOs */ + if ((unsigned int)nr_dsos == DSO_MMAP_RATIO * nr_mmaps) + return 1; + + return 0; +} + +static void collect_dso(void) +{ + dsos = calloc(nr_mmaps * DSO_MMAP_RATIO, sizeof(*dsos)); + if (dsos == NULL) { + printf(" Memory allocation failed\n"); + exit(1); + } + + if (nftw("/usr/lib/", add_dso, 10, FTW_PHYS) < 0) + return; + + pr_debug(" Collected %d DSOs\n", nr_dsos); +} + +static void release_dso(void) +{ + int i; + + for (i = 0; i < nr_dsos; i++) { + struct bench_dso *dso = &dsos[i]; + + zfree(&dso->name); + } + free(dsos); +} + +/* Fake address used by mmap and sample events */ +static u64 dso_map_addr(struct bench_dso *dso) +{ + return 0x400000ULL + dso->ino * 8192ULL; +} + +static ssize_t synthesize_attr(struct bench_data *data) +{ + union perf_event event; + + memset(&event, 0, sizeof(event.attr) + sizeof(u64)); + + event.header.type = PERF_RECORD_HEADER_ATTR; + event.header.size = sizeof(event.attr) + sizeof(u64); + + event.attr.attr.type = PERF_TYPE_SOFTWARE; + event.attr.attr.config = PERF_COUNT_SW_TASK_CLOCK; + event.attr.attr.exclude_kernel = 1; + event.attr.attr.sample_id_all = 1; + event.attr.attr.sample_type = bench_sample_type; + + return writen(data->input_pipe[1], &event, event.header.size); +} + +static ssize_t synthesize_fork(struct bench_data *data) +{ + union perf_event event; + + memset(&event, 0, sizeof(event.fork) + bench_id_hdr_size); + + event.header.type = PERF_RECORD_FORK; + event.header.misc = PERF_RECORD_MISC_FORK_EXEC; + event.header.size = sizeof(event.fork) + bench_id_hdr_size; + + event.fork.ppid = 1; + event.fork.ptid = 1; + event.fork.pid = data->pid; + event.fork.tid = data->pid; + + return writen(data->input_pipe[1], &event, event.header.size); +} + +static ssize_t synthesize_mmap(struct bench_data *data, struct bench_dso *dso, u64 timestamp) +{ + union perf_event event; + size_t len = offsetof(struct perf_record_mmap2, filename); + u64 *id_hdr_ptr = (void *)&event; + int ts_idx; + + len += roundup(strlen(dso->name) + 1, 8) + bench_id_hdr_size; + + memset(&event, 0, min(len, sizeof(event.mmap2))); + + event.header.type = PERF_RECORD_MMAP2; + event.header.misc = PERF_RECORD_MISC_USER; + event.header.size = len; + + event.mmap2.pid = data->pid; + event.mmap2.tid = data->pid; + event.mmap2.maj = MMAP_DEV_MAJOR; + event.mmap2.ino = dso->ino; + + strcpy(event.mmap2.filename, dso->name); + + event.mmap2.start = dso_map_addr(dso); + event.mmap2.len = 4096; + event.mmap2.prot = PROT_EXEC; + + if (len > sizeof(event.mmap2)) { + /* write mmap2 event first */ + if (writen(data->input_pipe[1], &event, len - bench_id_hdr_size) < 0) + return -1; + /* zero-fill sample id header */ + memset(id_hdr_ptr, 0, bench_id_hdr_size); + /* put timestamp in the right position */ + ts_idx = (bench_id_hdr_size / sizeof(u64)) - 2; + id_hdr_ptr[ts_idx] = timestamp; + if (writen(data->input_pipe[1], id_hdr_ptr, bench_id_hdr_size) < 0) + return -1; + + return len; + } + + ts_idx = (len / sizeof(u64)) - 2; + id_hdr_ptr[ts_idx] = timestamp; + return writen(data->input_pipe[1], &event, len); +} + +static ssize_t synthesize_sample(struct bench_data *data, struct bench_dso *dso, u64 timestamp) +{ + union perf_event event; + struct perf_sample sample = { + .tid = data->pid, + .pid = data->pid, + .ip = dso_map_addr(dso), + .time = timestamp, + }; + + event.header.type = PERF_RECORD_SAMPLE; + event.header.misc = PERF_RECORD_MISC_USER; + event.header.size = perf_event__sample_event_size(&sample, bench_sample_type, 0); + + perf_event__synthesize_sample(&event, bench_sample_type, 0, &sample); + + return writen(data->input_pipe[1], &event, event.header.size); +} + +static ssize_t synthesize_flush(struct bench_data *data) +{ + struct perf_event_header header = { + .size = sizeof(header), + .type = PERF_RECORD_FINISHED_ROUND, + }; + + return writen(data->input_pipe[1], &header, header.size); +} + +static void *data_reader(void *arg) +{ + struct bench_data *data = arg; + char buf[8192]; + int flag; + int n; + + flag = fcntl(data->output_pipe[0], F_GETFL); + fcntl(data->output_pipe[0], F_SETFL, flag | O_NONBLOCK); + + /* read out data from child */ + while (true) { + n = read(data->output_pipe[0], buf, sizeof(buf)); + if (n > 0) + continue; + if (n == 0) + break; + + if (errno != EINTR && errno != EAGAIN) + break; + + usleep(100); + } + + close(data->output_pipe[0]); + return NULL; +} + +static int setup_injection(struct bench_data *data, bool build_id_all) +{ + int ready_pipe[2]; + int dev_null_fd; + char buf; + + if (pipe(ready_pipe) < 0) + return -1; + + if (pipe(data->input_pipe) < 0) + return -1; + + if (pipe(data->output_pipe) < 0) + return -1; + + data->pid = fork(); + if (data->pid < 0) + return -1; + + if (data->pid == 0) { + const char **inject_argv; + int inject_argc = 3; + + close(data->input_pipe[1]); + close(data->output_pipe[0]); + close(ready_pipe[0]); + + dup2(data->input_pipe[0], STDIN_FILENO); + close(data->input_pipe[0]); + dup2(data->output_pipe[1], STDOUT_FILENO); + close(data->output_pipe[1]); + + dev_null_fd = open("/dev/null", O_WRONLY); + if (dev_null_fd < 0) + exit(1); + + dup2(dev_null_fd, STDERR_FILENO); + + if (build_id_all) + inject_argc++; + + inject_argv = calloc(inject_argc + 1, sizeof(*inject_argv)); + if (inject_argv == NULL) + exit(1); + + inject_argv[0] = strdup("perf"); + inject_argv[1] = strdup("inject"); + inject_argv[2] = strdup("-b"); + if (build_id_all) + inject_argv[3] = strdup("--buildid-all"); + + /* signal that we're ready to go */ + close(ready_pipe[1]); + + main(inject_argc, inject_argv); + + exit(0); + } + + pthread_create(&data->th, NULL, data_reader, data); + + close(ready_pipe[1]); + close(data->input_pipe[0]); + close(data->output_pipe[1]); + + /* wait for child ready */ + if (read(ready_pipe[0], &buf, 1) < 0) + return -1; + close(ready_pipe[0]); + + return 0; +} + +static int inject_build_id(struct bench_data *data, u64 *max_rss) +{ + int status; + unsigned int i, k; + struct rusage rusage; + + /* this makes the child to run */ + if (perf_header__write_pipe(data->input_pipe[1]) < 0) + return -1; + + if (synthesize_attr(data) < 0) + return -1; + + if (synthesize_fork(data) < 0) + return -1; + + for (i = 0; i < nr_mmaps; i++) { + int idx = rand() % nr_dsos; + struct bench_dso *dso = &dsos[idx]; + u64 timestamp = rand() % 1000000; + + pr_debug2(" [%d] injecting: %s\n", i+1, dso->name); + if (synthesize_mmap(data, dso, timestamp) < 0) + return -1; + + for (k = 0; k < nr_samples; k++) { + if (synthesize_sample(data, dso, timestamp + k * 1000) < 0) + return -1; + } + + if ((i + 1) % 10 == 0) { + if (synthesize_flush(data) < 0) + return -1; + } + } + + /* this makes the child to finish */ + close(data->input_pipe[1]); + + wait4(data->pid, &status, 0, &rusage); + *max_rss = rusage.ru_maxrss; + + pr_debug(" Child %d exited with %d\n", data->pid, status); + + return 0; +} + +static void do_inject_loop(struct bench_data *data, bool build_id_all) +{ + unsigned int i; + struct stats time_stats, mem_stats; + double time_average, time_stddev; + double mem_average, mem_stddev; + + init_stats(&time_stats); + init_stats(&mem_stats); + + pr_debug(" Build-id%s injection benchmark\n", build_id_all ? "-all" : ""); + + for (i = 0; i < iterations; i++) { + struct timeval start, end, diff; + u64 runtime_us, max_rss; + + pr_debug(" Iteration #%d\n", i+1); + + if (setup_injection(data, build_id_all) < 0) { + printf(" Build-id injection setup failed\n"); + break; + } + + gettimeofday(&start, NULL); + if (inject_build_id(data, &max_rss) < 0) { + printf(" Build-id injection failed\n"); + break; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + update_stats(&mem_stats, max_rss); + + pthread_join(data->th, NULL); + } + + time_average = avg_stats(&time_stats) / USEC_PER_MSEC; + time_stddev = stddev_stats(&time_stats) / USEC_PER_MSEC; + printf(" Average build-id%s injection took: %.3f msec (+- %.3f msec)\n", + build_id_all ? "-all" : "", time_average, time_stddev); + + /* each iteration, it processes MMAP2 + BUILD_ID + nr_samples * SAMPLE */ + time_average = avg_stats(&time_stats) / (nr_mmaps * (nr_samples + 2)); + time_stddev = stddev_stats(&time_stats) / (nr_mmaps * (nr_samples + 2)); + printf(" Average time per event: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + + mem_average = avg_stats(&mem_stats); + mem_stddev = stddev_stats(&mem_stats); + printf(" Average memory usage: %.0f KB (+- %.0f KB)\n", + mem_average, mem_stddev); +} + +static int do_inject_loops(struct bench_data *data) +{ + + srand(time(NULL)); + symbol__init(NULL); + + bench_sample_type = PERF_SAMPLE_IDENTIFIER | PERF_SAMPLE_IP; + bench_sample_type |= PERF_SAMPLE_TID | PERF_SAMPLE_TIME; + bench_id_hdr_size = 32; + + collect_dso(); + if (nr_dsos == 0) { + printf(" Cannot collect DSOs for injection\n"); + return -1; + } + + do_inject_loop(data, false); + do_inject_loop(data, true); + + release_dso(); + return 0; +} + +int bench_inject_build_id(int argc, const char **argv) +{ + struct bench_data data; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + return do_inject_loops(&data); +} + diff --git a/tools/perf/bench/kallsyms-parse.c b/tools/perf/bench/kallsyms-parse.c new file mode 100644 index 000000000000..2b0d0f980ae9 --- /dev/null +++ b/tools/perf/bench/kallsyms-parse.c @@ -0,0 +1,75 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark of /proc/kallsyms parsing. + * + * Copyright 2020 Google LLC. + */ +#include <stdlib.h> +#include "bench.h" +#include "../util/stat.h" +#include <linux/time64.h> +#include <subcmd/parse-options.h> +#include <symbol/kallsyms.h> + +static unsigned int iterations = 100; + +static const struct option options[] = { + OPT_UINTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench internals kallsyms-parse <options>", + NULL +}; + +static int bench_process_symbol(void *arg __maybe_unused, + const char *name __maybe_unused, + char type __maybe_unused, + u64 start __maybe_unused) +{ + return 0; +} + +static int do_kallsyms_parse(void) +{ + struct timeval start, end, diff; + u64 runtime_us; + unsigned int i; + double time_average, time_stddev; + int err; + struct stats time_stats; + + init_stats(&time_stats); + + for (i = 0; i < iterations; i++) { + gettimeofday(&start, NULL); + err = kallsyms__parse("/proc/kallsyms", NULL, + bench_process_symbol); + if (err) + return err; + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + } + + time_average = avg_stats(&time_stats) / USEC_PER_MSEC; + time_stddev = stddev_stats(&time_stats) / USEC_PER_MSEC; + printf(" Average kallsyms__parse took: %.3f ms (+- %.3f ms)\n", + time_average, time_stddev); + return 0; +} + +int bench_kallsyms_parse(int argc, const char **argv) +{ + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + return do_kallsyms_parse(); +} diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c new file mode 100644 index 000000000000..2908a3a796c9 --- /dev/null +++ b/tools/perf/bench/mem-functions.c @@ -0,0 +1,563 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mem-memcpy.c + * + * Simple memcpy() and memset() benchmarks + * + * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> + */ + +#include "debug.h" +#include "../perf-sys.h" +#include <subcmd/parse-options.h> +#include "../util/header.h" +#include "../util/cloexec.h" +#include "../util/string2.h" +#include "bench.h" +#include "mem-memcpy-arch.h" +#include "mem-memset-arch.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/time.h> +#include <sys/mman.h> +#include <errno.h> +#include <linux/time64.h> +#include <linux/log2.h> + +#define K 1024 + +#define PAGE_SHIFT_4KB 12 +#define PAGE_SHIFT_2MB 21 +#define PAGE_SHIFT_1GB 30 + +static const char *size_str = "1MB"; +static const char *function_str = "all"; +static const char *page_size_str = "4KB"; +static const char *chunk_size_str = "0"; +static unsigned int nr_loops = 1; +static bool use_cycles; +static int cycles_fd; +static unsigned int seed; + +static const struct option bench_common_options[] = { + OPT_STRING('s', "size", &size_str, "1MB", + "Specify the size of the memory buffers. " + "Available units: B, KB, MB, GB and TB (case insensitive)"), + + OPT_STRING('p', "page", &page_size_str, "4KB", + "Specify page-size for mapping memory buffers. " + "Available sizes: 4KB, 2MB, 1GB (case insensitive)"), + + OPT_STRING('f', "function", &function_str, "all", + "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"), + + OPT_UINTEGER('l', "nr_loops", &nr_loops, + "Specify the number of loops to run. (default: 1)"), + + OPT_BOOLEAN('c', "cycles", &use_cycles, + "Use a cycles event instead of gettimeofday() to measure performance"), + + OPT_END() +}; + +static const struct option bench_mem_options[] = { + OPT_STRING('k', "chunk", &chunk_size_str, "0", + "Specify the chunk-size for each invocation. " + "Available units: B, KB, MB, GB and TB (case insensitive)"), + OPT_PARENT(bench_common_options), + OPT_END() +}; + +union bench_clock { + u64 cycles; + struct timeval tv; +}; + +struct bench_params { + size_t size; + size_t size_total; + size_t chunk_size; + unsigned int nr_loops; + unsigned int page_shift; + unsigned int seed; +}; + +struct bench_mem_info { + const struct function *functions; + int (*do_op)(const struct function *r, struct bench_params *p, + void *src, void *dst, union bench_clock *rt); + const char *const *usage; + const struct option *options; + bool alloc_src; +}; + +typedef bool (*mem_init_t)(struct bench_mem_info *, struct bench_params *, + void **, void **); +typedef void (*mem_fini_t)(struct bench_mem_info *, struct bench_params *, + void **, void **); +typedef void *(*memcpy_t)(void *, const void *, size_t); +typedef void *(*memset_t)(void *, int, size_t); +typedef void (*mmap_op_t)(void *, size_t, unsigned int, bool); + +struct function { + const char *name; + const char *desc; + struct { + mem_init_t init; + mem_fini_t fini; + union { + memcpy_t memcpy; + memset_t memset; + mmap_op_t mmap_op; + }; + } fn; +}; + +static struct perf_event_attr cycle_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES +}; + +static int init_cycles(void) +{ + cycles_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, perf_event_open_cloexec_flag()); + + if (cycles_fd < 0 && errno == ENOSYS) { + pr_debug("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); + return -1; + } + + return cycles_fd; +} + +static u64 get_cycles(void) +{ + int ret; + u64 clk; + + ret = read(cycles_fd, &clk, sizeof(u64)); + BUG_ON(ret != sizeof(u64)); + + return clk; +} + +static void clock_get(union bench_clock *t) +{ + if (use_cycles) + t->cycles = get_cycles(); + else + BUG_ON(gettimeofday(&t->tv, NULL)); +} + +static union bench_clock clock_diff(union bench_clock *s, union bench_clock *e) +{ + union bench_clock t; + + if (use_cycles) + t.cycles = e->cycles - s->cycles; + else + timersub(&e->tv, &s->tv, &t.tv); + + return t; +} + +static void clock_accum(union bench_clock *a, union bench_clock *b) +{ + if (use_cycles) + a->cycles += b->cycles; + else + timeradd(&a->tv, &b->tv, &a->tv); +} + +static double timeval2double(struct timeval *ts) +{ + return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC; +} + +#define print_bps(x) do { \ + if (x < K) \ + printf(" %14lf bytes/sec\n", x); \ + else if (x < K * K) \ + printf(" %14lfd KB/sec\n", x / K); \ + else if (x < K * K * K) \ + printf(" %14lf MB/sec\n", x / K / K); \ + else \ + printf(" %14lf GB/sec\n", x / K / K / K); \ + } while (0) + +static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p, + int r_idx) +{ + const struct function *r = &info->functions[r_idx]; + double result_bps = 0.0; + union bench_clock rt = { 0 }; + void *src = NULL, *dst = NULL; + + printf("# function '%s' (%s)\n", r->name, r->desc); + + if (r->fn.init && r->fn.init(info, p, &src, &dst)) + goto out_init_failed; + + if (bench_format == BENCH_FORMAT_DEFAULT) + printf("# Copying %s bytes ...\n\n", size_str); + + if (info->do_op(r, p, src, dst, &rt)) + goto out_test_failed; + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + if (use_cycles) { + printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)p->size_total); + } else { + result_bps = (double)p->size_total/timeval2double(&rt.tv); + print_bps(result_bps); + } + break; + + case BENCH_FORMAT_SIMPLE: + if (use_cycles) { + printf("%lf\n", (double)rt.cycles/(double)p->size_total); + } else { + result_bps = (double)p->size_total/timeval2double(&rt.tv); + printf("%lf\n", result_bps); + } + break; + + default: + BUG_ON(1); + break; + } + +out_test_failed: +out_free: + if (r->fn.fini) r->fn.fini(info, p, &src, &dst); + return; +out_init_failed: + printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str, + p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large"); + goto out_free; +} + +static int bench_mem_common(int argc, const char **argv, struct bench_mem_info *info) +{ + int i; + struct bench_params p = { 0 }; + unsigned int page_size; + + argc = parse_options(argc, argv, info->options, info->usage, 0); + + if (use_cycles) { + i = init_cycles(); + if (i < 0) { + fprintf(stderr, "Failed to open cycles counter\n"); + return i; + } + } + + p.nr_loops = nr_loops; + p.size = (size_t)perf_atoll((char *)size_str); + + if ((s64)p.size <= 0) { + fprintf(stderr, "Invalid size:%s\n", size_str); + return 1; + } + p.size_total = p.size * p.nr_loops; + + p.chunk_size = (size_t)perf_atoll((char *)chunk_size_str); + if ((s64)p.chunk_size < 0 || (s64)p.chunk_size > (s64)p.size) { + fprintf(stderr, "Invalid chunk_size:%s\n", chunk_size_str); + return 1; + } + if (!p.chunk_size) + p.chunk_size = p.size; + + page_size = (unsigned int)perf_atoll((char *)page_size_str); + if (page_size != (1 << PAGE_SHIFT_4KB) && + page_size != (1 << PAGE_SHIFT_2MB) && + page_size != (1 << PAGE_SHIFT_1GB)) { + fprintf(stderr, "Invalid page-size:%s\n", page_size_str); + return 1; + } + p.page_shift = ilog2(page_size); + + p.seed = seed; + + if (!strncmp(function_str, "all", 3)) { + for (i = 0; info->functions[i].name; i++) + __bench_mem_function(info, &p, i); + return 0; + } + + for (i = 0; info->functions[i].name; i++) { + if (!strcmp(info->functions[i].name, function_str)) + break; + } + if (!info->functions[i].name) { + if (strcmp(function_str, "help") && strcmp(function_str, "h")) + printf("Unknown function: %s\n", function_str); + printf("Available functions:\n"); + for (i = 0; info->functions[i].name; i++) { + printf("\t%s ... %s\n", + info->functions[i].name, info->functions[i].desc); + } + return 1; + } + + __bench_mem_function(info, &p, i); + + return 0; +} + +static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst) +{ + /* Make sure to always prefault zero pages even if MMAP_THRESH is crossed: */ + memset(src, 0, size); + + /* + * We prefault the freshly allocated memory range here, + * to not measure page fault overhead: + */ + fn(dst, src, size); +} + +static int do_memcpy(const struct function *r, struct bench_params *p, + void *src, void *dst, union bench_clock *rt) +{ + union bench_clock start, end; + memcpy_t fn = r->fn.memcpy; + + memcpy_prefault(fn, p->size, src, dst); + + clock_get(&start); + for (unsigned int i = 0; i < p->nr_loops; ++i) + for (size_t off = 0; off < p->size; off += p->chunk_size) + fn(dst + off, src + off, min(p->chunk_size, p->size - off)); + clock_get(&end); + + *rt = clock_diff(&start, &end); + + return 0; +} + +static void *bench_mmap(size_t size, bool populate, unsigned int page_shift) +{ + void *p; + int extra = populate ? MAP_POPULATE : 0; + + if (page_shift != PAGE_SHIFT_4KB) + extra |= MAP_HUGETLB | (page_shift << MAP_HUGE_SHIFT); + + p = mmap(NULL, size, PROT_READ|PROT_WRITE, + extra | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + + return p == MAP_FAILED ? NULL : p; +} + +static void bench_munmap(void *p, size_t size) +{ + if (p) + munmap(p, size); +} + +static bool mem_alloc(struct bench_mem_info *info, struct bench_params *p, + void **src, void **dst) +{ + bool failed; + + *dst = bench_mmap(p->size, true, p->page_shift); + failed = *dst == NULL; + + if (info->alloc_src) { + *src = bench_mmap(p->size, true, p->page_shift); + failed = failed || *src == NULL; + } + + return failed; +} + +static void mem_free(struct bench_mem_info *info __maybe_unused, + struct bench_params *p __maybe_unused, + void **src, void **dst) +{ + bench_munmap(*dst, p->size); + bench_munmap(*src, p->size); + + *dst = *src = NULL; +} + +struct function memcpy_functions[] = { + { .name = "default", + .desc = "Default memcpy() provided by glibc", + .fn.init = mem_alloc, + .fn.fini = mem_free, + .fn.memcpy = memcpy }, + +#ifdef HAVE_ARCH_X86_64_SUPPORT +# define MEMCPY_FN(_fn, _init, _fini, _name, _desc) \ + {.name = _name, .desc = _desc, .fn.memcpy = _fn, .fn.init = _init, .fn.fini = _fini }, +# include "mem-memcpy-x86-64-asm-def.h" +# undef MEMCPY_FN +#endif + + { .name = NULL, } +}; + +static const char * const bench_mem_memcpy_usage[] = { + "perf bench mem memcpy <options>", + NULL +}; + +int bench_mem_memcpy(int argc, const char **argv) +{ + struct bench_mem_info info = { + .functions = memcpy_functions, + .do_op = do_memcpy, + .usage = bench_mem_memcpy_usage, + .options = bench_mem_options, + .alloc_src = true, + }; + + return bench_mem_common(argc, argv, &info); +} + +static int do_memset(const struct function *r, struct bench_params *p, + void *src __maybe_unused, void *dst, union bench_clock *rt) +{ + union bench_clock start, end; + memset_t fn = r->fn.memset; + + /* + * We prefault the freshly allocated memory range here, + * to not measure page fault overhead: + */ + fn(dst, -1, p->size); + + clock_get(&start); + for (unsigned int i = 0; i < p->nr_loops; ++i) + for (size_t off = 0; off < p->size; off += p->chunk_size) + fn(dst + off, i, min(p->chunk_size, p->size - off)); + clock_get(&end); + + *rt = clock_diff(&start, &end); + + return 0; +} + +static const char * const bench_mem_memset_usage[] = { + "perf bench mem memset <options>", + NULL +}; + +static const struct function memset_functions[] = { + { .name = "default", + .desc = "Default memset() provided by glibc", + .fn.init = mem_alloc, + .fn.fini = mem_free, + .fn.memset = memset }, + +#ifdef HAVE_ARCH_X86_64_SUPPORT +# define MEMSET_FN(_fn, _init, _fini, _name, _desc) \ + {.name = _name, .desc = _desc, .fn.memset = _fn, .fn.init = _init, .fn.fini = _fini }, +# include "mem-memset-x86-64-asm-def.h" +# undef MEMSET_FN +#endif + + { .name = NULL, } +}; + +int bench_mem_memset(int argc, const char **argv) +{ + struct bench_mem_info info = { + .functions = memset_functions, + .do_op = do_memset, + .usage = bench_mem_memset_usage, + .options = bench_mem_options, + }; + + return bench_mem_common(argc, argv, &info); +} + +static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, bool random) +{ + unsigned long npages = size / (1 << page_shift); + unsigned long offset = 0, r = 0; + + for (unsigned long i = 0; i < npages; i++) { + if (random) + r = rand() % (1 << page_shift); + + *((char *)dst + offset + r) = *(char *)(dst + offset + r) + i; + offset += 1 << page_shift; + } +} + +static int do_mmap(const struct function *r, struct bench_params *p, + void *src __maybe_unused, void *dst __maybe_unused, + union bench_clock *accum) +{ + union bench_clock start, end, diff; + mmap_op_t fn = r->fn.mmap_op; + bool populate = strcmp(r->name, "populate") == 0; + + if (p->seed) + srand(p->seed); + + for (unsigned int i = 0; i < p->nr_loops; i++) { + clock_get(&start); + dst = bench_mmap(p->size, populate, p->page_shift); + if (!dst) + goto out; + + fn(dst, p->size, p->page_shift, p->seed); + clock_get(&end); + diff = clock_diff(&start, &end); + clock_accum(accum, &diff); + + bench_munmap(dst, p->size); + } + + return 0; +out: + printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str, + p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large"); + return -1; +} + +static const char * const bench_mem_mmap_usage[] = { + "perf bench mem mmap <options>", + NULL +}; + +static const struct function mmap_functions[] = { + { .name = "demand", + .desc = "Demand loaded mmap()", + .fn.mmap_op = mmap_page_touch }, + + { .name = "populate", + .desc = "Eagerly populated mmap()", + .fn.mmap_op = mmap_page_touch }, + + { .name = NULL, } +}; + +int bench_mem_mmap(int argc, const char **argv) +{ + static const struct option bench_mmap_options[] = { + OPT_UINTEGER('r', "randomize", &seed, + "Seed to randomize page access offset."), + OPT_PARENT(bench_common_options), + OPT_END() + }; + + struct bench_mem_info info = { + .functions = mmap_functions, + .do_op = do_mmap, + .usage = bench_mem_mmap_usage, + .options = bench_mmap_options, + }; + + return bench_mem_common(argc, argv, &info); +} diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h index a72e36cb5394..852e48cfd8fe 100644 --- a/tools/perf/bench/mem-memcpy-arch.h +++ b/tools/perf/bench/mem-memcpy-arch.h @@ -1,8 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef ARCH_X86_64 +#ifdef HAVE_ARCH_X86_64_SUPPORT -#define MEMCPY_FN(fn, name, desc) \ - extern void *fn(void *, const void *, size_t); +#define MEMCPY_FN(fn, init, fini, name, desc) \ + void *fn(void *, const void *, size_t); #include "mem-memcpy-x86-64-asm-def.h" diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h index d66ab799b35f..f43038f4448b 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -1,12 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ -MEMCPY_FN(__memcpy, +MEMCPY_FN(memcpy_orig, + mem_alloc, + mem_free, "x86-64-unrolled", "unrolled memcpy() in arch/x86/lib/memcpy_64.S") -MEMCPY_FN(memcpy_c, +MEMCPY_FN(__memcpy, + mem_alloc, + mem_free, "x86-64-movsq", "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") - -MEMCPY_FN(memcpy_c_e, - "x86-64-movsb", - "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S index fcd9cf00600a..1b9fef7efcdc 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm.S +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -1,9 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Various wrappers to make the kernel .S file build in user-space: */ + +// memcpy_orig is being defined as SYM_L_LOCAL but we need it +#define SYM_FUNC_START_LOCAL(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) #define memcpy MEMCPY /* don't hide glibc's memcpy() */ #define altinstr_replacement text #define globl p2align 4; .globl -#define Lmemcpy_c globl memcpy_c; memcpy_c -#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e -#include "../../../arch/x86/lib/memcpy_64.S" +#define _ASM_EXTABLE_FAULT(x, y) +#define _ASM_EXTABLE(x, y) + +#include "../../arch/x86/lib/memcpy_64.S" /* * We need to provide note.GNU-stack section, saying that we want * NOT executable stack. Otherwise the final linking will assume that diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c deleted file mode 100644 index 25fd3f1966f1..000000000000 --- a/tools/perf/bench/mem-memcpy.c +++ /dev/null @@ -1,303 +0,0 @@ -/* - * mem-memcpy.c - * - * memcpy: Simple memory copy in various ways - * - * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> - */ - -#include "../perf.h" -#include "../util/util.h" -#include "../util/parse-options.h" -#include "../util/header.h" -#include "bench.h" -#include "mem-memcpy-arch.h" - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include <errno.h> - -#define K 1024 - -static const char *length_str = "1MB"; -static const char *routine = "default"; -static int iterations = 1; -static bool use_cycle; -static int cycle_fd; -static bool only_prefault; -static bool no_prefault; - -static const struct option options[] = { - OPT_STRING('l', "length", &length_str, "1MB", - "Specify length of memory to copy. " - "Available units: B, KB, MB, GB and TB (upper and lower)"), - OPT_STRING('r', "routine", &routine, "default", - "Specify routine to copy"), - OPT_INTEGER('i', "iterations", &iterations, - "repeat memcpy() invocation this number of times"), - OPT_BOOLEAN('c', "cycle", &use_cycle, - "Use cycles event instead of gettimeofday() for measuring"), - OPT_BOOLEAN('o', "only-prefault", &only_prefault, - "Show only the result with page faults before memcpy()"), - OPT_BOOLEAN('n', "no-prefault", &no_prefault, - "Show only the result without page faults before memcpy()"), - OPT_END() -}; - -typedef void *(*memcpy_t)(void *, const void *, size_t); - -struct routine { - const char *name; - const char *desc; - memcpy_t fn; -}; - -struct routine routines[] = { - { "default", - "Default memcpy() provided by glibc", - memcpy }, -#ifdef ARCH_X86_64 - -#define MEMCPY_FN(fn, name, desc) { name, desc, fn }, -#include "mem-memcpy-x86-64-asm-def.h" -#undef MEMCPY_FN - -#endif - - { NULL, - NULL, - NULL } -}; - -static const char * const bench_mem_memcpy_usage[] = { - "perf bench mem memcpy <options>", - NULL -}; - -static struct perf_event_attr cycle_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES -}; - -static void init_cycle(void) -{ - cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0); - - if (cycle_fd < 0 && errno == ENOSYS) - die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); - else - BUG_ON(cycle_fd < 0); -} - -static u64 get_cycle(void) -{ - int ret; - u64 clk; - - ret = read(cycle_fd, &clk, sizeof(u64)); - BUG_ON(ret != sizeof(u64)); - - return clk; -} - -static double timeval2double(struct timeval *ts) -{ - return (double)ts->tv_sec + - (double)ts->tv_usec / (double)1000000; -} - -static void alloc_mem(void **dst, void **src, size_t length) -{ - *dst = zalloc(length); - if (!*dst) - die("memory allocation failed - maybe length is too large?\n"); - - *src = zalloc(length); - if (!*src) - die("memory allocation failed - maybe length is too large?\n"); -} - -static u64 do_memcpy_cycle(memcpy_t fn, size_t len, bool prefault) -{ - u64 cycle_start = 0ULL, cycle_end = 0ULL; - void *src = NULL, *dst = NULL; - int i; - - alloc_mem(&src, &dst, len); - - if (prefault) - fn(dst, src, len); - - cycle_start = get_cycle(); - for (i = 0; i < iterations; ++i) - fn(dst, src, len); - cycle_end = get_cycle(); - - free(src); - free(dst); - return cycle_end - cycle_start; -} - -static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault) -{ - struct timeval tv_start, tv_end, tv_diff; - void *src = NULL, *dst = NULL; - int i; - - alloc_mem(&src, &dst, len); - - if (prefault) - fn(dst, src, len); - - BUG_ON(gettimeofday(&tv_start, NULL)); - for (i = 0; i < iterations; ++i) - fn(dst, src, len); - BUG_ON(gettimeofday(&tv_end, NULL)); - - timersub(&tv_end, &tv_start, &tv_diff); - - free(src); - free(dst); - return (double)((double)len / timeval2double(&tv_diff)); -} - -#define pf (no_prefault ? 0 : 1) - -#define print_bps(x) do { \ - if (x < K) \ - printf(" %14lf B/Sec", x); \ - else if (x < K * K) \ - printf(" %14lfd KB/Sec", x / K); \ - else if (x < K * K * K) \ - printf(" %14lf MB/Sec", x / K / K); \ - else \ - printf(" %14lf GB/Sec", x / K / K / K); \ - } while (0) - -int bench_mem_memcpy(int argc, const char **argv, - const char *prefix __maybe_unused) -{ - int i; - size_t len; - double result_bps[2]; - u64 result_cycle[2]; - - argc = parse_options(argc, argv, options, - bench_mem_memcpy_usage, 0); - - if (use_cycle) - init_cycle(); - - len = (size_t)perf_atoll((char *)length_str); - - result_cycle[0] = result_cycle[1] = 0ULL; - result_bps[0] = result_bps[1] = 0.0; - - if ((s64)len <= 0) { - fprintf(stderr, "Invalid length:%s\n", length_str); - return 1; - } - - /* same to without specifying either of prefault and no-prefault */ - if (only_prefault && no_prefault) - only_prefault = no_prefault = false; - - for (i = 0; routines[i].name; i++) { - if (!strcmp(routines[i].name, routine)) - break; - } - if (!routines[i].name) { - printf("Unknown routine:%s\n", routine); - printf("Available routines...\n"); - for (i = 0; routines[i].name; i++) { - printf("\t%s ... %s\n", - routines[i].name, routines[i].desc); - } - return 1; - } - - if (bench_format == BENCH_FORMAT_DEFAULT) - printf("# Copying %s Bytes ...\n\n", length_str); - - if (!only_prefault && !no_prefault) { - /* show both of results */ - if (use_cycle) { - result_cycle[0] = - do_memcpy_cycle(routines[i].fn, len, false); - result_cycle[1] = - do_memcpy_cycle(routines[i].fn, len, true); - } else { - result_bps[0] = - do_memcpy_gettimeofday(routines[i].fn, - len, false); - result_bps[1] = - do_memcpy_gettimeofday(routines[i].fn, - len, true); - } - } else { - if (use_cycle) { - result_cycle[pf] = - do_memcpy_cycle(routines[i].fn, - len, only_prefault); - } else { - result_bps[pf] = - do_memcpy_gettimeofday(routines[i].fn, - len, only_prefault); - } - } - - switch (bench_format) { - case BENCH_FORMAT_DEFAULT: - if (!only_prefault && !no_prefault) { - if (use_cycle) { - printf(" %14lf Cycle/Byte\n", - (double)result_cycle[0] - / (double)len); - printf(" %14lf Cycle/Byte (with prefault)\n", - (double)result_cycle[1] - / (double)len); - } else { - print_bps(result_bps[0]); - printf("\n"); - print_bps(result_bps[1]); - printf(" (with prefault)\n"); - } - } else { - if (use_cycle) { - printf(" %14lf Cycle/Byte", - (double)result_cycle[pf] - / (double)len); - } else - print_bps(result_bps[pf]); - - printf("%s\n", only_prefault ? " (with prefault)" : ""); - } - break; - case BENCH_FORMAT_SIMPLE: - if (!only_prefault && !no_prefault) { - if (use_cycle) { - printf("%lf %lf\n", - (double)result_cycle[0] / (double)len, - (double)result_cycle[1] / (double)len); - } else { - printf("%lf %lf\n", - result_bps[0], result_bps[1]); - } - } else { - if (use_cycle) { - printf("%lf\n", (double)result_cycle[pf] - / (double)len); - } else - printf("%lf\n", result_bps[pf]); - } - break; - default: - /* reaching this means there's some disaster: */ - die("unknown format: %d\n", bench_format); - break; - } - - return 0; -} diff --git a/tools/perf/bench/mem-memset-arch.h b/tools/perf/bench/mem-memset-arch.h index a040fa77665b..278c5da12d63 100644 --- a/tools/perf/bench/mem-memset-arch.h +++ b/tools/perf/bench/mem-memset-arch.h @@ -1,8 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef ARCH_X86_64 +#ifdef HAVE_ARCH_X86_64_SUPPORT -#define MEMSET_FN(fn, name, desc) \ - extern void *fn(void *, int, size_t); +#define MEMSET_FN(fn, init, fini, name, desc) \ + void *fn(void *, int, size_t); #include "mem-memset-x86-64-asm-def.h" diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h index a71dff97c1f5..80ad1b7ea770 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h @@ -1,12 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ -MEMSET_FN(__memset, +MEMSET_FN(memset_orig, + mem_alloc, + mem_free, "x86-64-unrolled", "unrolled memset() in arch/x86/lib/memset_64.S") -MEMSET_FN(memset_c, +MEMSET_FN(__memset, + mem_alloc, + mem_free, "x86-64-stosq", "movsq-based memset() in arch/x86/lib/memset_64.S") - -MEMSET_FN(memset_c_e, - "x86-64-stosb", - "movsb-based memset() in arch/x86/lib/memset_64.S") diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S index 9e5af89ed13a..abd26c95f1aa 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm.S +++ b/tools/perf/bench/mem-memset-x86-64-asm.S @@ -1,9 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// memset_orig is being defined as SYM_L_LOCAL but we need it +#define SYM_FUNC_START_LOCAL(name) \ + SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) #define memset MEMSET /* don't hide glibc's memset() */ #define altinstr_replacement text #define globl p2align 4; .globl -#define Lmemset_c globl memset_c; memset_c -#define Lmemset_c_e globl memset_c_e; memset_c_e -#include "../../../arch/x86/lib/memset_64.S" +#include "../../arch/x86/lib/memset_64.S" /* * We need to provide note.GNU-stack section, saying that we want diff --git a/tools/perf/bench/mem-memset.c b/tools/perf/bench/mem-memset.c deleted file mode 100644 index 4a2f12081964..000000000000 --- a/tools/perf/bench/mem-memset.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * mem-memset.c - * - * memset: Simple memory set in various ways - * - * Trivial clone of mem-memcpy.c. - */ - -#include "../perf.h" -#include "../util/util.h" -#include "../util/parse-options.h" -#include "../util/header.h" -#include "bench.h" -#include "mem-memset-arch.h" - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/time.h> -#include <errno.h> - -#define K 1024 - -static const char *length_str = "1MB"; -static const char *routine = "default"; -static int iterations = 1; -static bool use_cycle; -static int cycle_fd; -static bool only_prefault; -static bool no_prefault; - -static const struct option options[] = { - OPT_STRING('l', "length", &length_str, "1MB", - "Specify length of memory to set. " - "Available units: B, KB, MB, GB and TB (upper and lower)"), - OPT_STRING('r', "routine", &routine, "default", - "Specify routine to set"), - OPT_INTEGER('i', "iterations", &iterations, - "repeat memset() invocation this number of times"), - OPT_BOOLEAN('c', "cycle", &use_cycle, - "Use cycles event instead of gettimeofday() for measuring"), - OPT_BOOLEAN('o', "only-prefault", &only_prefault, - "Show only the result with page faults before memset()"), - OPT_BOOLEAN('n', "no-prefault", &no_prefault, - "Show only the result without page faults before memset()"), - OPT_END() -}; - -typedef void *(*memset_t)(void *, int, size_t); - -struct routine { - const char *name; - const char *desc; - memset_t fn; -}; - -static const struct routine routines[] = { - { "default", - "Default memset() provided by glibc", - memset }, -#ifdef ARCH_X86_64 - -#define MEMSET_FN(fn, name, desc) { name, desc, fn }, -#include "mem-memset-x86-64-asm-def.h" -#undef MEMSET_FN - -#endif - - { NULL, - NULL, - NULL } -}; - -static const char * const bench_mem_memset_usage[] = { - "perf bench mem memset <options>", - NULL -}; - -static struct perf_event_attr cycle_attr = { - .type = PERF_TYPE_HARDWARE, - .config = PERF_COUNT_HW_CPU_CYCLES -}; - -static void init_cycle(void) -{ - cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0); - - if (cycle_fd < 0 && errno == ENOSYS) - die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); - else - BUG_ON(cycle_fd < 0); -} - -static u64 get_cycle(void) -{ - int ret; - u64 clk; - - ret = read(cycle_fd, &clk, sizeof(u64)); - BUG_ON(ret != sizeof(u64)); - - return clk; -} - -static double timeval2double(struct timeval *ts) -{ - return (double)ts->tv_sec + - (double)ts->tv_usec / (double)1000000; -} - -static void alloc_mem(void **dst, size_t length) -{ - *dst = zalloc(length); - if (!*dst) - die("memory allocation failed - maybe length is too large?\n"); -} - -static u64 do_memset_cycle(memset_t fn, size_t len, bool prefault) -{ - u64 cycle_start = 0ULL, cycle_end = 0ULL; - void *dst = NULL; - int i; - - alloc_mem(&dst, len); - - if (prefault) - fn(dst, -1, len); - - cycle_start = get_cycle(); - for (i = 0; i < iterations; ++i) - fn(dst, i, len); - cycle_end = get_cycle(); - - free(dst); - return cycle_end - cycle_start; -} - -static double do_memset_gettimeofday(memset_t fn, size_t len, bool prefault) -{ - struct timeval tv_start, tv_end, tv_diff; - void *dst = NULL; - int i; - - alloc_mem(&dst, len); - - if (prefault) - fn(dst, -1, len); - - BUG_ON(gettimeofday(&tv_start, NULL)); - for (i = 0; i < iterations; ++i) - fn(dst, i, len); - BUG_ON(gettimeofday(&tv_end, NULL)); - - timersub(&tv_end, &tv_start, &tv_diff); - - free(dst); - return (double)((double)len / timeval2double(&tv_diff)); -} - -#define pf (no_prefault ? 0 : 1) - -#define print_bps(x) do { \ - if (x < K) \ - printf(" %14lf B/Sec", x); \ - else if (x < K * K) \ - printf(" %14lfd KB/Sec", x / K); \ - else if (x < K * K * K) \ - printf(" %14lf MB/Sec", x / K / K); \ - else \ - printf(" %14lf GB/Sec", x / K / K / K); \ - } while (0) - -int bench_mem_memset(int argc, const char **argv, - const char *prefix __maybe_unused) -{ - int i; - size_t len; - double result_bps[2]; - u64 result_cycle[2]; - - argc = parse_options(argc, argv, options, - bench_mem_memset_usage, 0); - - if (use_cycle) - init_cycle(); - - len = (size_t)perf_atoll((char *)length_str); - - result_cycle[0] = result_cycle[1] = 0ULL; - result_bps[0] = result_bps[1] = 0.0; - - if ((s64)len <= 0) { - fprintf(stderr, "Invalid length:%s\n", length_str); - return 1; - } - - /* same to without specifying either of prefault and no-prefault */ - if (only_prefault && no_prefault) - only_prefault = no_prefault = false; - - for (i = 0; routines[i].name; i++) { - if (!strcmp(routines[i].name, routine)) - break; - } - if (!routines[i].name) { - printf("Unknown routine:%s\n", routine); - printf("Available routines...\n"); - for (i = 0; routines[i].name; i++) { - printf("\t%s ... %s\n", - routines[i].name, routines[i].desc); - } - return 1; - } - - if (bench_format == BENCH_FORMAT_DEFAULT) - printf("# Copying %s Bytes ...\n\n", length_str); - - if (!only_prefault && !no_prefault) { - /* show both of results */ - if (use_cycle) { - result_cycle[0] = - do_memset_cycle(routines[i].fn, len, false); - result_cycle[1] = - do_memset_cycle(routines[i].fn, len, true); - } else { - result_bps[0] = - do_memset_gettimeofday(routines[i].fn, - len, false); - result_bps[1] = - do_memset_gettimeofday(routines[i].fn, - len, true); - } - } else { - if (use_cycle) { - result_cycle[pf] = - do_memset_cycle(routines[i].fn, - len, only_prefault); - } else { - result_bps[pf] = - do_memset_gettimeofday(routines[i].fn, - len, only_prefault); - } - } - - switch (bench_format) { - case BENCH_FORMAT_DEFAULT: - if (!only_prefault && !no_prefault) { - if (use_cycle) { - printf(" %14lf Cycle/Byte\n", - (double)result_cycle[0] - / (double)len); - printf(" %14lf Cycle/Byte (with prefault)\n ", - (double)result_cycle[1] - / (double)len); - } else { - print_bps(result_bps[0]); - printf("\n"); - print_bps(result_bps[1]); - printf(" (with prefault)\n"); - } - } else { - if (use_cycle) { - printf(" %14lf Cycle/Byte", - (double)result_cycle[pf] - / (double)len); - } else - print_bps(result_bps[pf]); - - printf("%s\n", only_prefault ? " (with prefault)" : ""); - } - break; - case BENCH_FORMAT_SIMPLE: - if (!only_prefault && !no_prefault) { - if (use_cycle) { - printf("%lf %lf\n", - (double)result_cycle[0] / (double)len, - (double)result_cycle[1] / (double)len); - } else { - printf("%lf %lf\n", - result_bps[0], result_bps[1]); - } - } else { - if (use_cycle) { - printf("%lf\n", (double)result_cycle[pf] - / (double)len); - } else - printf("%lf\n", result_bps[pf]); - } - break; - default: - /* reaching this means there's some disaster: */ - die("unknown format: %d\n", bench_format); - break; - } - - return 0; -} diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 30d1c3225b46..19be2aaf4dc0 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -1,13 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 /* * numa.c * * numa: Simulate NUMA-sensitive workload and measure their NUMA performance */ -#include "../perf.h" -#include "../builtin.h" -#include "../util/util.h" -#include "../util/parse-options.h" +#include <inttypes.h> + +#include <subcmd/parse-options.h> +#include "../util/cloexec.h" #include "bench.h" @@ -15,34 +16,48 @@ #include <sched.h> #include <stdio.h> #include <assert.h> +#include <debug.h> #include <malloc.h> #include <signal.h> #include <stdlib.h> #include <string.h> #include <unistd.h> -#include <pthread.h> #include <sys/mman.h> #include <sys/time.h> +#include <sys/resource.h> #include <sys/wait.h> #include <sys/prctl.h> +#include <sys/stat.h> #include <sys/types.h> - +#include <linux/kernel.h> +#include <linux/time64.h> +#include <linux/numa.h> +#include <linux/zalloc.h> + +#include "../util/header.h" +#include "../util/mutex.h" +#include <api/fs/fs.h> #include <numa.h> #include <numaif.h> +#ifndef RUSAGE_THREAD +# define RUSAGE_THREAD 1 +#endif + /* - * Regular printout to the terminal, supressed if -q is specified: + * Regular printout to the terminal, suppressed if -q is specified: */ #define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0) /* * Debug printf: */ +#undef dprintf #define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0) struct thread_data { int curr_cpu; - cpu_set_t bind_cpumask; + cpu_set_t *bind_cpumask; int bind_node; u8 *process_data; int process_nr; @@ -51,7 +66,10 @@ struct thread_data { unsigned int loops_done; u64 val; u64 runtime_ns; - pthread_mutex_t *process_lock; + u64 system_time_ns; + u64 user_time_ns; + double speed_gbs; + struct mutex *process_lock; }; /* Parameters set by options: */ @@ -101,7 +119,6 @@ struct params { long bytes_thread; int nr_tasks; - bool show_quiet; bool show_convergence; bool measure_convergence; @@ -121,15 +138,16 @@ struct params { struct global_info { u8 *data; - pthread_mutex_t startup_mutex; + struct mutex startup_mutex; + struct cond startup_cond; int nr_tasks_started; - pthread_mutex_t startup_done_mutex; - - pthread_mutex_t start_work_mutex; + struct mutex start_work_mutex; + struct cond start_work_cond; int nr_tasks_working; + bool start_work; - pthread_mutex_t stop_work_mutex; + struct mutex stop_work_mutex; u64 bytes_done; struct thread_data *threads; @@ -159,11 +177,11 @@ static const struct option options[] = { OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"), OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"), - OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run"), - OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run"), + OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run (default: unlimited)"), + OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run (default: 5 secs)"), OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"), - OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"), + OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via reads (can be mixed with -W)"), OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"), OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"), OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"), @@ -178,9 +196,11 @@ static const struct option options[] = { OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"), OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"), OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"), - OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"), + OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details, " + "convergence is reached when each process (all its threads) is running on a single NUMA node."), OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"), - OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "bzero the initial allocations"), + OPT_BOOLEAN('q', "quiet" , &quiet, + "quiet mode (do not show any warnings or messages)"), OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"), /* Special option string parsing callbacks: */ @@ -203,72 +223,163 @@ static const char * const numa_usage[] = { NULL }; -static cpu_set_t bind_to_cpu(int target_cpu) +/* + * To get number of numa nodes present. + */ +static int nr_numa_nodes(void) { - cpu_set_t orig_mask, mask; - int ret; + int i, nr_nodes = 0; - ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); - BUG_ON(ret); + for (i = 0; i < g->p.nr_nodes; i++) { + if (numa_bitmask_isbitset(numa_nodes_ptr, i)) + nr_nodes++; + } + + return nr_nodes; +} + +/* + * To check if given numa node is present. + */ +static int is_node_present(int node) +{ + return numa_bitmask_isbitset(numa_nodes_ptr, node); +} + +/* + * To check given numa node has cpus. + */ +static bool node_has_cpus(int node) +{ + struct bitmask *cpumask = numa_allocate_cpumask(); + bool ret = false; /* fall back to nocpus */ + int cpu; + + BUG_ON(!cpumask); + if (!numa_node_to_cpus(node, cpumask)) { + for (cpu = 0; cpu < (int)cpumask->size; cpu++) { + if (numa_bitmask_isbitset(cpumask, cpu)) { + ret = true; + break; + } + } + } + numa_free_cpumask(cpumask); + + return ret; +} + +static cpu_set_t *bind_to_cpu(int target_cpu) +{ + int nrcpus = numa_num_possible_cpus(); + cpu_set_t *orig_mask, *mask; + size_t size; + + orig_mask = CPU_ALLOC(nrcpus); + BUG_ON(!orig_mask); + size = CPU_ALLOC_SIZE(nrcpus); + CPU_ZERO_S(size, orig_mask); + + if (sched_getaffinity(0, size, orig_mask)) + goto err_out; + + mask = CPU_ALLOC(nrcpus); + if (!mask) + goto err_out; - CPU_ZERO(&mask); + CPU_ZERO_S(size, mask); if (target_cpu == -1) { int cpu; for (cpu = 0; cpu < g->p.nr_cpus; cpu++) - CPU_SET(cpu, &mask); + CPU_SET_S(cpu, size, mask); } else { - BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus); - CPU_SET(target_cpu, &mask); + if (target_cpu < 0 || target_cpu >= g->p.nr_cpus) + goto err; + + CPU_SET_S(target_cpu, size, mask); } - ret = sched_setaffinity(0, sizeof(mask), &mask); - BUG_ON(ret); + if (sched_setaffinity(0, size, mask)) + goto err; return orig_mask; + +err: + CPU_FREE(mask); +err_out: + CPU_FREE(orig_mask); + + /* BUG_ON due to failure in allocation of orig_mask/mask */ + BUG_ON(-1); + return NULL; } -static cpu_set_t bind_to_node(int target_node) +static cpu_set_t *bind_to_node(int target_node) { - int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes; - cpu_set_t orig_mask, mask; + int nrcpus = numa_num_possible_cpus(); + size_t size; + cpu_set_t *orig_mask, *mask; int cpu; - int ret; - BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus); - BUG_ON(!cpus_per_node); + orig_mask = CPU_ALLOC(nrcpus); + BUG_ON(!orig_mask); + size = CPU_ALLOC_SIZE(nrcpus); + CPU_ZERO_S(size, orig_mask); - ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); - BUG_ON(ret); + if (sched_getaffinity(0, size, orig_mask)) + goto err_out; - CPU_ZERO(&mask); + mask = CPU_ALLOC(nrcpus); + if (!mask) + goto err_out; - if (target_node == -1) { + CPU_ZERO_S(size, mask); + + if (target_node == NUMA_NO_NODE) { for (cpu = 0; cpu < g->p.nr_cpus; cpu++) - CPU_SET(cpu, &mask); + CPU_SET_S(cpu, size, mask); } else { - int cpu_start = (target_node + 0) * cpus_per_node; - int cpu_stop = (target_node + 1) * cpus_per_node; + struct bitmask *cpumask = numa_allocate_cpumask(); - BUG_ON(cpu_stop > g->p.nr_cpus); + if (!cpumask) + goto err; - for (cpu = cpu_start; cpu < cpu_stop; cpu++) - CPU_SET(cpu, &mask); + if (!numa_node_to_cpus(target_node, cpumask)) { + for (cpu = 0; cpu < (int)cpumask->size; cpu++) { + if (numa_bitmask_isbitset(cpumask, cpu)) + CPU_SET_S(cpu, size, mask); + } + } + numa_free_cpumask(cpumask); } - ret = sched_setaffinity(0, sizeof(mask), &mask); - BUG_ON(ret); + if (sched_setaffinity(0, size, mask)) + goto err; return orig_mask; + +err: + CPU_FREE(mask); +err_out: + CPU_FREE(orig_mask); + + /* BUG_ON due to failure in allocation of orig_mask/mask */ + BUG_ON(-1); + return NULL; } -static void bind_to_cpumask(cpu_set_t mask) +static void bind_to_cpumask(cpu_set_t *mask) { int ret; + size_t size = CPU_ALLOC_SIZE(numa_num_possible_cpus()); - ret = sched_setaffinity(0, sizeof(mask), &mask); - BUG_ON(ret); + ret = sched_setaffinity(0, size, mask); + if (ret) { + CPU_FREE(mask); + BUG_ON(ret); + } } static void mempol_restore(void) @@ -282,18 +393,22 @@ static void mempol_restore(void) static void bind_to_memnode(int node) { - unsigned long nodemask; + struct bitmask *node_mask; int ret; - if (node == -1) + if (node == NUMA_NO_NODE) return; - BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)); - nodemask = 1L << node; + node_mask = numa_allocate_nodemask(); + BUG_ON(!node_mask); - ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); - dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret); + numa_bitmask_clearall(node_mask); + numa_bitmask_setbit(node_mask, node); + ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + 1); + dprintf("binding to node %d, mask: %016lx => %d\n", node, *node_mask->maskp, ret); + + numa_bitmask_free(node_mask); BUG_ON(ret); } @@ -310,7 +425,7 @@ do { \ static u8 *alloc_data(ssize_t bytes0, int map_flags, int init_zero, int init_cpu0, int thp, int init_random) { - cpu_set_t orig_mask; + cpu_set_t *orig_mask = NULL; ssize_t bytes; u8 *buf; int ret; @@ -320,8 +435,10 @@ static u8 *alloc_data(ssize_t bytes0, int map_flags, /* Allocate and initialize all memory on CPU#0: */ if (init_cpu0) { - orig_mask = bind_to_node(0); - bind_to_memnode(0); + int node = numa_node_of_cpu(0); + + orig_mask = bind_to_node(node); + bind_to_memnode(node); } bytes = bytes0 + HPSIZE; @@ -366,6 +483,7 @@ static u8 *alloc_data(ssize_t bytes0, int map_flags, /* Restore affinity: */ if (init_cpu0) { bind_to_cpumask(orig_mask); + CPU_FREE(orig_mask); mempol_restore(); } @@ -408,18 +526,6 @@ static void * setup_private_data(ssize_t bytes) return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); } -/* - * Return a process-shared (global) mutex: - */ -static void init_global_mutex(pthread_mutex_t *mutex) -{ - pthread_mutexattr_t attr; - - pthread_mutexattr_init(&attr); - pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); - pthread_mutex_init(mutex, &attr); -} - static int parse_cpu_list(const char *arg) { p0.cpu_list_str = strdup(arg); @@ -429,14 +535,65 @@ static int parse_cpu_list(const char *arg) return 0; } -static void parse_setup_cpu_list(void) +/* + * Check whether a CPU is online + * + * Returns: + * 1 -> if CPU is online + * 0 -> if CPU is offline + * -1 -> error case + */ +static int is_cpu_online(unsigned int cpu) +{ + char *str; + size_t strlen; + char buf[256]; + int status = -1; + struct stat statbuf; + + snprintf(buf, sizeof(buf), + "/sys/devices/system/cpu/cpu%d", cpu); + if (stat(buf, &statbuf) != 0) + return 0; + + /* + * Check if /sys/devices/system/cpu/cpux/online file + * exists. Some cases cpu0 won't have online file since + * it is not expected to be turned off generally. + * In kernels without CONFIG_HOTPLUG_CPU, this + * file won't exist + */ + snprintf(buf, sizeof(buf), + "/sys/devices/system/cpu/cpu%d/online", cpu); + if (stat(buf, &statbuf) != 0) + return 1; + + /* + * Read online file using sysfs__read_str. + * If read or open fails, return -1. + * If read succeeds, return value from file + * which gets stored in "str" + */ + snprintf(buf, sizeof(buf), + "devices/system/cpu/cpu%d/online", cpu); + + if (sysfs__read_str(buf, &str, &strlen) < 0) + return status; + + status = atoi(str); + + free(str); + return status; +} + +static int parse_setup_cpu_list(void) { struct thread_data *td; char *str0, *str; int t; if (!g->p.cpu_list_str) - return; + return 0; dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); @@ -500,11 +657,21 @@ static void parse_setup_cpu_list(void) dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul); - BUG_ON(bind_cpu_0 < 0 || bind_cpu_0 >= g->p.nr_cpus); - BUG_ON(bind_cpu_1 < 0 || bind_cpu_1 >= g->p.nr_cpus); + if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) { + printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus); + return -1; + } + + if (is_cpu_online(bind_cpu_0) != 1 || is_cpu_online(bind_cpu_1) != 1) { + printf("\nTest not applicable, bind_cpu_0 or bind_cpu_1 is offline\n"); + return -1; + } + + BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0); BUG_ON(bind_cpu_0 > bind_cpu_1); for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) { + size_t size = CPU_ALLOC_SIZE(g->p.nr_cpus); int i; for (i = 0; i < mul; i++) { @@ -524,10 +691,15 @@ static void parse_setup_cpu_list(void) tprintf("%2d", bind_cpu); } - CPU_ZERO(&td->bind_cpumask); + td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus); + BUG_ON(!td->bind_cpumask); + CPU_ZERO_S(size, td->bind_cpumask); for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) { - BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus); - CPU_SET(cpu, &td->bind_cpumask); + if (cpu < 0 || cpu >= g->p.nr_cpus) { + CPU_FREE(td->bind_cpumask); + BUG_ON(-1); + } + CPU_SET_S(cpu, size, td->bind_cpumask); } t++; } @@ -541,6 +713,7 @@ out: printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t); free(str0); + return 0; } static int parse_cpus_opt(const struct option *opt __maybe_unused, @@ -561,14 +734,14 @@ static int parse_node_list(const char *arg) return 0; } -static void parse_setup_node_list(void) +static int parse_setup_node_list(void) { struct thread_data *td; char *str0, *str; int t; if (!g->p.node_list_str) - return; + return 0; dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); @@ -619,15 +792,19 @@ static void parse_setup_node_list(void) dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step); - BUG_ON(bind_node_0 < 0 || bind_node_0 >= g->p.nr_nodes); - BUG_ON(bind_node_1 < 0 || bind_node_1 >= g->p.nr_nodes); + if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) { + printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes); + return -1; + } + + BUG_ON(bind_node_0 < 0 || bind_node_1 < 0); BUG_ON(bind_node_0 > bind_node_1); for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) { int i; for (i = 0; i < mul; i++) { - if (t >= g->p.nr_tasks) { + if (t >= g->p.nr_tasks || !node_has_cpus(bind_node)) { printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node); goto out; } @@ -651,6 +828,7 @@ out: printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t); free(str0); + return 0; } static int parse_nodes_opt(const struct option *opt __maybe_unused, @@ -660,12 +838,8 @@ static int parse_nodes_opt(const struct option *opt __maybe_unused, return -1; return parse_node_list(arg); - - return 0; } -#define BIT(x) (1ul << x) - static inline uint32_t lfsr_32(uint32_t lfsr) { const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31); @@ -678,7 +852,7 @@ static inline uint32_t lfsr_32(uint32_t lfsr) * kernel (KSM, zero page, etc.) cannot optimize away RAM * accesses: */ -static inline u64 access_data(u64 *data __attribute__((unused)), u64 val) +static inline u64 access_data(u64 *data, u64 val) { if (g->p.data_reads) val += *data; @@ -726,7 +900,7 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val if (g->p.data_rand_walk) { u32 lfsr = nr + loop + val; - int j; + long j; for (i = 0; i < words/1024; i++) { long start, end; @@ -744,12 +918,12 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val } } } else if (!g->p.data_backwards || (nr + loop) & 1) { + /* Process data forwards: */ d0 = data + off; d = data + off + 1; d1 = data + words; - /* Process data forwards: */ for (;;) { if (unlikely(d >= d1)) d = data; @@ -767,7 +941,6 @@ static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val d = data + off - 1; d1 = data + words; - /* Process data forwards: */ for (;;) { if (unlikely(d < data)) d = data + words-1; @@ -793,8 +966,6 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked) prctl(0, bytes_worked); } -#define MAX_NR_NODES 64 - /* * Count the number of nodes a process's threads * are spread out on. @@ -805,10 +976,15 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked) */ static int count_process_nodes(int process_nr) { - char node_present[MAX_NR_NODES] = { 0, }; + char *node_present; int nodes; int n, t; + node_present = (char *)malloc(g->p.nr_nodes * sizeof(char)); + BUG_ON(!node_present); + for (nodes = 0; nodes < g->p.nr_nodes; nodes++) + node_present[nodes] = 0; + for (t = 0; t < g->p.nr_threads; t++) { struct thread_data *td; int task_nr; @@ -818,14 +994,20 @@ static int count_process_nodes(int process_nr) td = g->threads + task_nr; node = numa_node_of_cpu(td->curr_cpu); + if (node < 0) /* curr_cpu was likely still -1 */ { + free(node_present); + return 0; + } + node_present[node] = 1; } nodes = 0; - for (n = 0; n < MAX_NR_NODES; n++) + for (n = 0; n < g->p.nr_nodes; n++) nodes += node_present[n]; + free(node_present); return nodes; } @@ -872,6 +1054,11 @@ static void calc_convergence_compression(int *strong) for (p = 0; p < g->p.nr_proc; p++) { unsigned int nodes = count_process_nodes(p); + if (!nodes) { + *strong = 0; + return; + } + nodes_min = min(nodes, nodes_min); nodes_max = max(nodes, nodes_max); } @@ -889,7 +1076,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence) { unsigned int loops_done_min, loops_done_max; int process_groups; - int nodes[MAX_NR_NODES]; + int *nodes; int distance; int nr_min; int nr_max; @@ -903,6 +1090,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence) if (!g->p.show_convergence && !g->p.measure_convergence) return; + nodes = (int *)malloc(g->p.nr_nodes * sizeof(int)); + BUG_ON(!nodes); for (node = 0; node < g->p.nr_nodes; node++) nodes[node] = 0; @@ -933,6 +1122,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence) sum = 0; for (node = 0; node < g->p.nr_nodes; node++) { + if (!is_node_present(node)) + continue; nr = nodes[node]; nr_min = min(nr, nr_min); nr_max = max(nr, nr_max); @@ -942,8 +1133,10 @@ static void calc_convergence(double runtime_ns_max, double *convergence) BUG_ON(sum > g->p.nr_tasks); - if (0 && (sum < g->p.nr_tasks)) + if (0 && (sum < g->p.nr_tasks)) { + free(nodes); return; + } /* * Count the number of distinct process groups present @@ -953,8 +1146,11 @@ static void calc_convergence(double runtime_ns_max, double *convergence) process_groups = 0; for (node = 0; node < g->p.nr_nodes; node++) { - int processes = count_node_processes(node); + int processes; + if (!is_node_present(node)) + continue; + processes = count_node_processes(node); nr = nodes[node]; tprintf(" %2d/%-2d", nr, processes); @@ -979,7 +1175,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence) if (strong && process_groups == g->p.nr_proc) { if (!*convergence) { *convergence = runtime_ns_max; - tprintf(" (%6.1fs converged)\n", *convergence/1e9); + tprintf(" (%6.1fs converged)\n", *convergence / NSEC_PER_SEC); if (g->p.measure_convergence) { g->all_converged = true; g->stop_work = true; @@ -987,17 +1183,19 @@ static void calc_convergence(double runtime_ns_max, double *convergence) } } else { if (*convergence) { - tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9); + tprintf(" (%6.1fs de-converged)", runtime_ns_max / NSEC_PER_SEC); *convergence = 0; } tprintf("\n"); } + + free(nodes); } static void show_summary(double runtime_ns_max, int l, double *convergence) { tprintf("\r # %5.1f%% [%.1f mins]", - (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0); + (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max / NSEC_PER_SEC / 60.0); calc_convergence(runtime_ns_max, convergence); @@ -1021,9 +1219,10 @@ static void *worker_thread(void *__tdata) u8 *global_data; u8 *process_data; u8 *thread_data; - u64 bytes_done; + u64 bytes_done, secs; long work_done; u32 l; + struct rusage rusage; bind_to_cpumask(td->bind_cpumask); bind_to_memnode(td->bind_node); @@ -1050,19 +1249,22 @@ static void *worker_thread(void *__tdata) } if (g->p.serialize_startup) { - pthread_mutex_lock(&g->startup_mutex); + mutex_lock(&g->startup_mutex); g->nr_tasks_started++; - pthread_mutex_unlock(&g->startup_mutex); + /* The last thread wakes the main process. */ + if (g->nr_tasks_started == g->p.nr_tasks) + cond_signal(&g->startup_cond); + + mutex_unlock(&g->startup_mutex); /* Here we will wait for the main process to start us all at once: */ - pthread_mutex_lock(&g->start_work_mutex); + mutex_lock(&g->start_work_mutex); + g->start_work = false; g->nr_tasks_working++; + while (!g->start_work) + cond_wait(&g->start_work_cond, &g->start_work_mutex); - /* Last one wake the main process: */ - if (g->nr_tasks_working == g->p.nr_tasks) - pthread_mutex_unlock(&g->startup_done_mutex); - - pthread_mutex_unlock(&g->start_work_mutex); + mutex_unlock(&g->start_work_mutex); } gettimeofday(&start0, NULL); @@ -1081,17 +1283,17 @@ static void *worker_thread(void *__tdata) val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val); if (g->p.sleep_usecs) { - pthread_mutex_lock(td->process_lock); + mutex_lock(td->process_lock); usleep(g->p.sleep_usecs); - pthread_mutex_unlock(td->process_lock); + mutex_unlock(td->process_lock); } /* * Amount of work to be done under a process-global lock: */ if (g->p.bytes_process_locked) { - pthread_mutex_lock(td->process_lock); + mutex_lock(td->process_lock); val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val); - pthread_mutex_unlock(td->process_lock); + mutex_unlock(td->process_lock); } work_done = g->p.bytes_global + g->p.bytes_process + @@ -1110,7 +1312,7 @@ static void *worker_thread(void *__tdata) /* Check whether our max runtime timed out: */ if (g->p.nr_secs) { timersub(&stop, &start0, &diff); - if (diff.tv_sec >= g->p.nr_secs) { + if ((u32)diff.tv_sec >= g->p.nr_secs) { g->stop_work = true; break; } @@ -1125,7 +1327,7 @@ static void *worker_thread(void *__tdata) * by migrating to CPU#0: */ if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) { - cpu_set_t orig_mask; + cpu_set_t *orig_mask; int target_cpu; int this_cpu; @@ -1149,15 +1351,16 @@ static void *worker_thread(void *__tdata) printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu); bind_to_cpumask(orig_mask); + CPU_FREE(orig_mask); } if (details >= 3) { timersub(&stop, &start, &diff); - runtime_ns_max = diff.tv_sec * 1000000000; - runtime_ns_max += diff.tv_usec * 1000; + runtime_ns_max = diff.tv_sec * NSEC_PER_SEC; + runtime_ns_max += diff.tv_usec * NSEC_PER_USEC; if (details >= 0) { - printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016lx]\n", + printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n", process_nr, thread_nr, runtime_ns_max / bytes_done, val); } fflush(stdout); @@ -1166,22 +1369,30 @@ static void *worker_thread(void *__tdata) continue; timersub(&stop, &start0, &diff); - runtime_ns_max = diff.tv_sec * 1000000000ULL; - runtime_ns_max += diff.tv_usec * 1000ULL; + runtime_ns_max = diff.tv_sec * NSEC_PER_SEC; + runtime_ns_max += diff.tv_usec * NSEC_PER_USEC; show_summary(runtime_ns_max, l, &convergence); } gettimeofday(&stop, NULL); timersub(&stop, &start0, &diff); - td->runtime_ns = diff.tv_sec * 1000000000ULL; - td->runtime_ns += diff.tv_usec * 1000ULL; + td->runtime_ns = diff.tv_sec * NSEC_PER_SEC; + td->runtime_ns += diff.tv_usec * NSEC_PER_USEC; + secs = td->runtime_ns / NSEC_PER_SEC; + td->speed_gbs = secs ? bytes_done / secs / 1e9 : 0; + + getrusage(RUSAGE_THREAD, &rusage); + td->system_time_ns = rusage.ru_stime.tv_sec * NSEC_PER_SEC; + td->system_time_ns += rusage.ru_stime.tv_usec * NSEC_PER_USEC; + td->user_time_ns = rusage.ru_utime.tv_sec * NSEC_PER_SEC; + td->user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC; free_data(thread_data, g->p.bytes_thread); - pthread_mutex_lock(&g->stop_work_mutex); + mutex_lock(&g->stop_work_mutex); g->bytes_done += bytes_done; - pthread_mutex_unlock(&g->stop_work_mutex); + mutex_unlock(&g->stop_work_mutex); return NULL; } @@ -1191,7 +1402,7 @@ static void *worker_thread(void *__tdata) */ static void worker_process(int process_nr) { - pthread_mutex_t process_lock; + struct mutex process_lock; struct thread_data *td; pthread_t *pthreads; u8 *process_data; @@ -1199,7 +1410,7 @@ static void worker_process(int process_nr) int ret; int t; - pthread_mutex_init(&process_lock, NULL); + mutex_init(&process_lock); set_taskname("process %d", process_nr); /* @@ -1252,7 +1463,7 @@ static void print_summary(void) printf("\n ###\n"); printf(" # %d %s will execute (on %d nodes, %d CPUs):\n", - g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", g->p.nr_nodes, g->p.nr_cpus); + g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", nr_numa_nodes(), g->p.nr_cpus); printf(" # %5dx %5ldMB global shared mem operations\n", g->p.nr_loops, g->p.bytes_global/1024/1024); printf(" # %5dx %5ldMB process shared mem operations\n", @@ -1274,21 +1485,31 @@ static void init_thread_data(void) for (t = 0; t < g->p.nr_tasks; t++) { struct thread_data *td = g->threads + t; + size_t cpuset_size = CPU_ALLOC_SIZE(g->p.nr_cpus); int cpu; /* Allow all nodes by default: */ - td->bind_node = -1; + td->bind_node = NUMA_NO_NODE; /* Allow all CPUs by default: */ - CPU_ZERO(&td->bind_cpumask); + td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus); + BUG_ON(!td->bind_cpumask); + CPU_ZERO_S(cpuset_size, td->bind_cpumask); for (cpu = 0; cpu < g->p.nr_cpus; cpu++) - CPU_SET(cpu, &td->bind_cpumask); + CPU_SET_S(cpu, cpuset_size, td->bind_cpumask); } } static void deinit_thread_data(void) { ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; + int t; + + /* Free the bind_cpumask allocated for thread_data */ + for (t = 0; t < g->p.nr_tasks; t++) { + struct thread_data *td = g->threads + t; + CPU_FREE(td->bind_cpumask); + } free_data(g->threads, size); } @@ -1305,9 +1526,9 @@ static int init(void) g->p.nr_nodes = numa_max_node() + 1; /* char array in count_process_nodes(): */ - BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0); + BUG_ON(g->p.nr_nodes < 0); - if (g->p.show_quiet && !g->p.show_details) + if (quiet && !g->p.show_details) g->p.show_details = -1; /* Some memory should be specified: */ @@ -1348,16 +1569,17 @@ static int init(void) g->data = setup_shared_data(g->p.bytes_global); /* Startup serialization: */ - init_global_mutex(&g->start_work_mutex); - init_global_mutex(&g->startup_mutex); - init_global_mutex(&g->startup_done_mutex); - init_global_mutex(&g->stop_work_mutex); + mutex_init_pshared(&g->start_work_mutex); + cond_init_pshared(&g->start_work_cond); + mutex_init_pshared(&g->startup_mutex); + cond_init_pshared(&g->startup_cond); + mutex_init_pshared(&g->stop_work_mutex); init_thread_data(); tprintf("#\n"); - parse_setup_cpu_list(); - parse_setup_node_list(); + if (parse_setup_cpu_list() || parse_setup_node_list()) + return -1; tprintf("#\n"); print_summary(); @@ -1385,7 +1607,7 @@ static void print_res(const char *name, double val, if (!name) name = "main,"; - if (g->p.show_quiet) + if (!quiet) printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short); else printf(" %14.3f %s\n", val, txt_long); @@ -1402,7 +1624,7 @@ static int __bench_numa(const char *name) double runtime_sec_min; int wait_stat; double bytes; - int i, t; + int i, t, p; if (init()) return -1; @@ -1410,9 +1632,6 @@ static int __bench_numa(const char *name) pids = zalloc(g->p.nr_proc * sizeof(*pids)); pid = -1; - /* All threads try to acquire it, this way we can wait for them to start up: */ - pthread_mutex_lock(&g->start_work_mutex); - if (g->p.serialize_startup) { tprintf(" #\n"); tprintf(" # Startup synchronization: ..."); fflush(stdout); @@ -1434,36 +1653,47 @@ static int __bench_numa(const char *name) pids[i] = pid; } - /* Wait for all the threads to start up: */ - while (g->nr_tasks_started != g->p.nr_tasks) - usleep(1000); - - BUG_ON(g->nr_tasks_started != g->p.nr_tasks); if (g->p.serialize_startup) { + bool threads_ready = false; double startup_sec; - pthread_mutex_lock(&g->startup_done_mutex); - - /* This will start all threads: */ - pthread_mutex_unlock(&g->start_work_mutex); - - /* This mutex is locked - the last started thread will wake us: */ - pthread_mutex_lock(&g->startup_done_mutex); + /* + * Wait for all the threads to start up. The last thread will + * signal this process. + */ + mutex_lock(&g->startup_mutex); + while (g->nr_tasks_started != g->p.nr_tasks) + cond_wait(&g->startup_cond, &g->startup_mutex); + + mutex_unlock(&g->startup_mutex); + + /* Wait for all threads to be at the start_work_cond. */ + while (!threads_ready) { + mutex_lock(&g->start_work_mutex); + threads_ready = (g->nr_tasks_working == g->p.nr_tasks); + mutex_unlock(&g->start_work_mutex); + if (!threads_ready) + usleep(1); + } gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); - startup_sec = diff.tv_sec * 1000000000.0; - startup_sec += diff.tv_usec * 1000.0; - startup_sec /= 1e9; + startup_sec = diff.tv_sec * NSEC_PER_SEC; + startup_sec += diff.tv_usec * NSEC_PER_USEC; + startup_sec /= NSEC_PER_SEC; tprintf(" threads initialized in %.6f seconds.\n", startup_sec); tprintf(" #\n"); start = stop; - pthread_mutex_unlock(&g->startup_done_mutex); + /* Start all threads running. */ + mutex_lock(&g->start_work_mutex); + g->start_work = true; + mutex_unlock(&g->start_work_mutex); + cond_broadcast(&g->start_work_cond); } else { gettimeofday(&start, NULL); } @@ -1496,14 +1726,14 @@ static int __bench_numa(const char *name) tprintf("\n ###\n"); tprintf("\n"); - runtime_sec_max = diff.tv_sec * 1000000000.0; - runtime_sec_max += diff.tv_usec * 1000.0; - runtime_sec_max /= 1e9; + runtime_sec_max = diff.tv_sec * NSEC_PER_SEC; + runtime_sec_max += diff.tv_usec * NSEC_PER_USEC; + runtime_sec_max /= NSEC_PER_SEC; - runtime_sec_min = runtime_ns_min/1e9; + runtime_sec_min = runtime_ns_min / NSEC_PER_SEC; bytes = g->bytes_done; - runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9; + runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / NSEC_PER_SEC; if (g->p.measure_convergence) { print_res(name, runtime_sec_max, @@ -1529,7 +1759,7 @@ static int __bench_numa(const char *name) print_res(name, bytes / 1e9, "GB,", "data-total", "GB data processed, total"); - print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks), + print_res(name, runtime_sec_max * NSEC_PER_SEC / (bytes / g->p.nr_tasks), "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime"); print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max, @@ -1538,6 +1768,24 @@ static int __bench_numa(const char *name) print_res(name, bytes / runtime_sec_max / 1e9, "GB/sec,", "total-speed", "GB/sec total speed"); + if (g->p.show_details >= 2) { + char tname[14 + 2 * 11 + 1]; + struct thread_data *td; + for (p = 0; p < g->p.nr_proc; p++) { + for (t = 0; t < g->p.nr_threads; t++) { + memset(tname, 0, sizeof(tname)); + td = g->threads + p*g->p.nr_threads + t; + snprintf(tname, sizeof(tname), "process%d:thread%d", p, t); + print_res(tname, td->speed_gbs, + "GB/sec", "thread-speed", "GB/sec/thread speed"); + print_res(tname, td->system_time_ns / NSEC_PER_SEC, + "secs", "thread-system-time", "system CPU time/thread"); + print_res(tname, td->user_time_ns / NSEC_PER_SEC, + "secs", "thread-user-time", "user CPU time/thread"); + } + } + } + free(pids); deinit(); @@ -1583,6 +1831,11 @@ static void init_params(struct params *p, const char *name, int argc, const char p->data_rand_walk = true; p->nr_loops = -1; p->init_random = true; + p->mb_global_str = "1"; + p->nr_proc = 1; + p->nr_threads = 1; + p->nr_secs = 5; + p->run_all = argc == 1; } static int run_bench_numa(const char *name, const char **argv) @@ -1600,7 +1853,6 @@ static int run_bench_numa(const char *name, const char **argv) return 0; err: - usage_with_options(numa_usage, options); return -1; } @@ -1620,12 +1872,12 @@ err: */ static const char *tests[][MAX_ARGS] = { /* Basic single-stream NUMA bandwidth measurements: */ - { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", + { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024", "-C" , "0", "-M", "0", OPT_BW_RAM }, { "RAM-bw-local-NOTHP,", "mem", "-p", "1", "-t", "1", "-P", "1024", "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP }, - { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", + { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024", "-C" , "0", "-M", "1", OPT_BW_RAM }, /* 2-stream NUMA bandwidth measurements: */ @@ -1642,7 +1894,7 @@ static const char *tests[][MAX_ARGS] = { { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV }, { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV }, { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV }, - { " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, + { " 2x3-convergence,", "mem", "-p", "2", "-t", "3", "-P", "1020", OPT_CONV }, { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV }, { " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV }, { " 4x4-convergence-NOTHP,", @@ -1667,24 +1919,24 @@ static const char *tests[][MAX_ARGS] = { "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP }, { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW }, - { " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, - { " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, - { "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, - { "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, + { " 1x4-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW }, + { " 1x8-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW }, + { "1x16-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW }, + { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW }, - { " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, - { " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, - { " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, - { " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW }, - { " 4x8-bw-thread-NOTHP,", + { " 2x3-bw-process,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW }, + { " 4x4-bw-process,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW }, + { " 4x6-bw-process,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW }, + { " 4x8-bw-process,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW }, + { " 4x8-bw-process-NOTHP,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP }, - { " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, - { " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, + { " 3x3-bw-process,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW }, + { " 5x5-bw-process,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW }, - { "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, - { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, + { "2x16-bw-process,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW }, + { "1x32-bw-process,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW }, - { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW }, + { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW }, { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP }, { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW }, { "numa01-bw-thread-NOTHP,", @@ -1701,8 +1953,7 @@ static int bench_all(void) BUG_ON(ret < 0); for (i = 0; i < nr; i++) { - if (run_bench_numa(tests[i][0], tests[i] + 1)) - return -1; + run_bench_numa(tests[i][0], tests[i] + 1); } printf("\n"); @@ -1710,7 +1961,7 @@ static int bench_all(void) return 0; } -int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused) +int bench_numa(int argc, const char **argv) { init_params(&p0, "main,", argc, argv); argc = parse_options(argc, argv, options, bench_numa_usage, 0); diff --git a/tools/perf/bench/pmu-scan.c b/tools/perf/bench/pmu-scan.c new file mode 100644 index 000000000000..14a464ad8cea --- /dev/null +++ b/tools/perf/bench/pmu-scan.c @@ -0,0 +1,187 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark scanning sysfs files for PMU information. + * + * Copyright 2023 Google LLC. + */ +#include <errno.h> +#include <stdio.h> +#include "bench.h" +#include "util/debug.h" +#include "util/pmu.h" +#include "util/pmus.h" +#include "util/stat.h" +#include <linux/atomic.h> +#include <linux/err.h> +#include <linux/time64.h> +#include <subcmd/parse-options.h> + +static unsigned int iterations = 100; + +struct pmu_scan_result { + char *name; + int nr_aliases; + int nr_formats; + int nr_caps; + bool is_core; +}; + +static const struct option options[] = { + OPT_UINTEGER('i', "iterations", &iterations, + "Number of iterations used to compute average"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench internals pmu-scan <options>", + NULL +}; + +static int nr_pmus; +static struct pmu_scan_result *results; + +static int save_result(void) +{ + struct perf_pmu *pmu = NULL; + struct list_head *list; + struct pmu_scan_result *r; + + while ((pmu = perf_pmus__scan(pmu)) != NULL) { + r = realloc(results, (nr_pmus + 1) * sizeof(*r)); + if (r == NULL) + return -ENOMEM; + + results = r; + r = results + nr_pmus; + + r->name = strdup(pmu->name); + r->is_core = pmu->is_core; + r->nr_caps = pmu->nr_caps; + + r->nr_aliases = perf_pmu__num_events(pmu); + + r->nr_formats = 0; + list_for_each(list, &pmu->format) + r->nr_formats++; + + pr_debug("pmu[%d] name=%s, nr_caps=%d, nr_aliases=%d, nr_formats=%d\n", + nr_pmus, r->name, r->nr_caps, r->nr_aliases, r->nr_formats); + nr_pmus++; + } + + perf_pmus__destroy(); + return 0; +} + +static int check_result(bool core_only) +{ + struct pmu_scan_result *r; + struct perf_pmu *pmu; + struct list_head *list; + int nr; + + for (int i = 0; i < nr_pmus; i++) { + r = &results[i]; + if (core_only && !r->is_core) + continue; + + pmu = perf_pmus__find(r->name); + if (pmu == NULL) { + pr_err("Cannot find PMU %s\n", r->name); + return -1; + } + + if (pmu->nr_caps != (u32)r->nr_caps) { + pr_err("Unmatched number of event caps in %s: expect %d vs got %d\n", + pmu->name, r->nr_caps, pmu->nr_caps); + return -1; + } + + nr = perf_pmu__num_events(pmu); + if (nr != r->nr_aliases) { + pr_err("Unmatched number of event aliases in %s: expect %d vs got %d\n", + pmu->name, r->nr_aliases, nr); + return -1; + } + + nr = 0; + list_for_each(list, &pmu->format) + nr++; + if (nr != r->nr_formats) { + pr_err("Unmatched number of event formats in %s: expect %d vs got %d\n", + pmu->name, r->nr_formats, nr); + return -1; + } + } + return 0; +} + +static void delete_result(void) +{ + for (int i = 0; i < nr_pmus; i++) + free(results[i].name); + free(results); + + results = NULL; + nr_pmus = 0; +} + +static int run_pmu_scan(void) +{ + struct stats stats; + struct timeval start, end, diff; + double time_average, time_stddev; + u64 runtime_us; + int ret; + + init_stats(&stats); + pr_info("Computing performance of sysfs PMU event scan for %u times\n", + iterations); + + if (save_result() < 0) { + pr_err("Failed to initialize PMU scan result\n"); + return -1; + } + + for (int j = 0; j < 2; j++) { + bool core_only = (j == 0); + + for (unsigned int i = 0; i < iterations; i++) { + gettimeofday(&start, NULL); + if (core_only) + perf_pmus__scan_core(NULL); + else + perf_pmus__scan(NULL); + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&stats, runtime_us); + + ret = check_result(core_only); + perf_pmus__destroy(); + if (ret < 0) + break; + } + time_average = avg_stats(&stats); + time_stddev = stddev_stats(&stats); + pr_info(" Average%s PMU scanning took: %.3f usec (+- %.3f usec)\n", + core_only ? " core" : "", time_average, time_stddev); + } + delete_result(); + return 0; +} + +int bench_pmu_scan(int argc, const char **argv) +{ + int err = 0; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + err = run_pmu_scan(); + + return err; +} diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c index cc1190a0849b..93dcd9dba3d0 100644 --- a/tools/perf/bench/sched-messaging.c +++ b/tools/perf/bench/sched-messaging.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * * sched-messaging.c @@ -9,10 +10,7 @@ * */ -#include "../perf.h" -#include "../util/util.h" -#include "../util/parse-options.h" -#include "../builtin.h" +#include <subcmd/parse-options.h> #include "bench.h" /* Test groups of 20 processes spraying to 20 receivers */ @@ -26,35 +24,44 @@ #include <sys/socket.h> #include <sys/wait.h> #include <sys/time.h> -#include <sys/poll.h> +#include <poll.h> #include <limits.h> +#include <err.h> +#include <linux/list.h> +#include <linux/time64.h> #define DATASIZE 100 static bool use_pipes = false; -static unsigned int loops = 100; +static unsigned int nr_loops = 100; static bool thread_mode = false; static unsigned int num_groups = 10; +static unsigned int total_children = 0; +static struct list_head sender_contexts = LIST_HEAD_INIT(sender_contexts); +static struct list_head receiver_contexts = LIST_HEAD_INIT(receiver_contexts); struct sender_context { + struct list_head list; unsigned int num_fds; int ready_out; int wakefd; - int out_fds[0]; + int out_fds[]; }; struct receiver_context { + struct list_head list; unsigned int num_packets; int in_fds[2]; int ready_out; int wakefd; }; -static void barf(const char *msg) -{ - fprintf(stderr, "%s (error: %s)\n", msg, strerror(errno)); - exit(1); -} +union messaging_worker { + pthread_t thread; + pid_t pid; +}; + +static union messaging_worker *worker_tab; static void fdpair(int fds[2]) { @@ -66,42 +73,42 @@ static void fdpair(int fds[2]) return; } - barf(use_pipes ? "pipe()" : "socketpair()"); + err(EXIT_FAILURE, use_pipes ? "pipe()" : "socketpair()"); } /* Block until we're ready to go */ static void ready(int ready_out, int wakefd) { - char dummy; struct pollfd pollfd = { .fd = wakefd, .events = POLLIN }; /* Tell them we're ready. */ - if (write(ready_out, &dummy, 1) != 1) - barf("CLIENT: ready write"); + if (write(ready_out, "R", 1) != 1) + err(EXIT_FAILURE, "CLIENT: ready write"); /* Wait for "GO" signal */ if (poll(&pollfd, 1, -1) != 1) - barf("poll"); + err(EXIT_FAILURE, "poll"); } -/* Sender sprays loops messages down each file descriptor */ +/* Sender sprays nr_loops messages down each file descriptor */ static void *sender(struct sender_context *ctx) { char data[DATASIZE]; unsigned int i, j; ready(ctx->ready_out, ctx->wakefd); + memset(data, 'S', sizeof(data)); /* Now pump to every receiver. */ - for (i = 0; i < loops; i++) { + for (i = 0; i < nr_loops; i++) { for (j = 0; j < ctx->num_fds; j++) { int ret, done = 0; again: ret = write(ctx->out_fds[j], data + done, - sizeof(data)-done); + sizeof(data) - done); if (ret < 0) - barf("SENDER: write"); + err(EXIT_FAILURE, "SENDER: write"); done += ret; if (done < DATASIZE) goto again; @@ -131,7 +138,7 @@ static void *receiver(struct receiver_context* ctx) again: ret = read(ctx->in_fds[0], data + done, DATASIZE - done); if (ret < 0) - barf("SERVER: read"); + err(EXIT_FAILURE, "SERVER: read"); done += ret; if (done < DATASIZE) goto again; @@ -140,48 +147,51 @@ again: return NULL; } -static pthread_t create_worker(void *ctx, void *(*func)(void *)) +static void create_thread_worker(union messaging_worker *worker, + void *ctx, void *(*func)(void *)) { pthread_attr_t attr; - pthread_t childid; - int err; - - if (!thread_mode) { - /* process mode */ - /* Fork the receiver. */ - switch (fork()) { - case -1: - barf("fork()"); - break; - case 0: - (*func) (ctx); - exit(0); - break; - default: - break; - } - - return (pthread_t)0; - } + int ret; if (pthread_attr_init(&attr) != 0) - barf("pthread_attr_init:"); + err(EXIT_FAILURE, "pthread_attr_init:"); #ifndef __ia64__ if (pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN) != 0) - barf("pthread_attr_setstacksize"); + err(EXIT_FAILURE, "pthread_attr_setstacksize"); #endif - err = pthread_create(&childid, &attr, func, ctx); - if (err != 0) { - fprintf(stderr, "pthread_create failed: %s (%d)\n", - strerror(err), err); - exit(-1); + ret = pthread_create(&worker->thread, &attr, func, ctx); + if (ret != 0) + err(EXIT_FAILURE, "pthread_create failed"); + + pthread_attr_destroy(&attr); +} + +static void create_process_worker(union messaging_worker *worker, + void *ctx, void *(*func)(void *)) +{ + /* Fork the receiver. */ + worker->pid = fork(); + + if (worker->pid == -1) { + err(EXIT_FAILURE, "fork()"); + } else if (worker->pid == 0) { + (*func) (ctx); + exit(0); } - return childid; } -static void reap_worker(pthread_t id) +static void create_worker(union messaging_worker *worker, + void *ctx, void *(*func)(void *)) +{ + if (!thread_mode) + return create_process_worker(worker, ctx, func); + else + return create_thread_worker(worker, ctx, func); +} + +static void reap_worker(union messaging_worker *worker) { int proc_status; void *thread_status; @@ -192,41 +202,43 @@ static void reap_worker(pthread_t id) if (!WIFEXITED(proc_status)) exit(1); } else { - pthread_join(id, &thread_status); + pthread_join(worker->thread, &thread_status); } } /* One group of senders and receivers */ -static unsigned int group(pthread_t *pth, +static unsigned int group(union messaging_worker *worker, unsigned int num_fds, int ready_out, int wakefd) { unsigned int i; - struct sender_context *snd_ctx = malloc(sizeof(struct sender_context) - + num_fds * sizeof(int)); + struct sender_context *snd_ctx = malloc(sizeof(struct sender_context) + + num_fds * sizeof(int)); if (!snd_ctx) - barf("malloc()"); + err(EXIT_FAILURE, "malloc()"); + list_add(&snd_ctx->list, &sender_contexts); for (i = 0; i < num_fds; i++) { int fds[2]; struct receiver_context *ctx = malloc(sizeof(*ctx)); if (!ctx) - barf("malloc()"); + err(EXIT_FAILURE, "malloc()"); + list_add(&ctx->list, &receiver_contexts); /* Create the pipe between client and server */ fdpair(fds); - ctx->num_packets = num_fds * loops; + ctx->num_packets = num_fds * nr_loops; ctx->in_fds[0] = fds[0]; ctx->in_fds[1] = fds[1]; ctx->ready_out = ready_out; ctx->wakefd = wakefd; - pth[i] = create_worker(ctx, (void *)receiver); + create_worker(worker + i, ctx, (void *)receiver); snd_ctx->out_fds[i] = fds[1]; if (!thread_mode) @@ -239,7 +251,7 @@ static unsigned int group(pthread_t *pth, snd_ctx->wakefd = wakefd; snd_ctx->num_fds = num_fds; - pth[num_fds+i] = create_worker(snd_ctx, (void *)sender); + create_worker(worker + num_fds + i, snd_ctx, (void *)sender); } /* Close the fds we have left */ @@ -251,13 +263,24 @@ static unsigned int group(pthread_t *pth, return num_fds * 2; } +static void sig_handler(int sig __maybe_unused) +{ + unsigned int i; + + /* + * When exit abnormally, kill all forked child processes. + */ + for (i = 0; i < total_children; i++) + kill(worker_tab[i].pid, SIGKILL); +} + static const struct option options[] = { OPT_BOOLEAN('p', "pipe", &use_pipes, "Use pipe() instead of socketpair()"), OPT_BOOLEAN('t', "thread", &thread_mode, "Be multi thread instead of multi process"), OPT_UINTEGER('g', "group", &num_groups, "Specify number of groups"), - OPT_UINTEGER('l', "loop", &loops, "Specify number of loops"), + OPT_UINTEGER('l', "nr_loops", &nr_loops, "Specify the number of loops to run (default: 100)"), OPT_END() }; @@ -266,45 +289,48 @@ static const char * const bench_sched_message_usage[] = { NULL }; -int bench_sched_messaging(int argc, const char **argv, - const char *prefix __maybe_unused) +int bench_sched_messaging(int argc, const char **argv) { - unsigned int i, total_children; + unsigned int i; struct timeval start, stop, diff; unsigned int num_fds = 20; int readyfds[2], wakefds[2]; char dummy; - pthread_t *pth_tab; + struct sender_context *pos, *n; argc = parse_options(argc, argv, options, bench_sched_message_usage, 0); - pth_tab = malloc(num_fds * 2 * num_groups * sizeof(pthread_t)); - if (!pth_tab) - barf("main:malloc()"); + worker_tab = malloc(num_fds * 2 * num_groups * sizeof(union messaging_worker)); + if (!worker_tab) + err(EXIT_FAILURE, "main:malloc()"); fdpair(readyfds); fdpair(wakefds); - total_children = 0; + if (!thread_mode) { + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + } + for (i = 0; i < num_groups; i++) - total_children += group(pth_tab+total_children, num_fds, + total_children += group(worker_tab + total_children, num_fds, readyfds[1], wakefds[0]); /* Wait for everyone to be ready */ for (i = 0; i < total_children; i++) if (read(readyfds[0], &dummy, 1) != 1) - barf("Reading for readyfds"); + err(EXIT_FAILURE, "Reading for readyfds"); gettimeofday(&start, NULL); /* Kick them off */ if (write(wakefds[1], &dummy, 1) != 1) - barf("Writing to start them"); + err(EXIT_FAILURE, "Writing to start them"); /* Reap them all */ for (i = 0; i < total_children; i++) - reap_worker(pth_tab[i]); + reap_worker(worker_tab + i); gettimeofday(&stop, NULL); @@ -318,12 +344,12 @@ int bench_sched_messaging(int argc, const char **argv, num_groups, num_groups * 2 * num_fds, thread_mode ? "threads" : "processes"); printf(" %14s: %lu.%03lu [sec]\n", "Total time", - diff.tv_sec, - (unsigned long) (diff.tv_usec/1000)); + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); break; case BENCH_FORMAT_SIMPLE: - printf("%lu.%03lu\n", diff.tv_sec, - (unsigned long) (diff.tv_usec/1000)); + printf("%lu.%03lu\n", (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); break; default: /* reaching here is something disaster */ @@ -332,5 +358,14 @@ int bench_sched_messaging(int argc, const char **argv, break; } + free(worker_tab); + list_for_each_entry_safe(pos, n, &sender_contexts, list) { + list_del_init(&pos->list); + free(pos); + } + list_for_each_entry_safe(pos, n, &receiver_contexts, list) { + list_del_init(&pos->list); + free(pos); + } return 0; } diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c index 69cfba8d4c6c..70139036d68f 100644 --- a/tools/perf/bench/sched-pipe.c +++ b/tools/perf/bench/sched-pipe.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * * sched-pipe.c @@ -7,33 +8,89 @@ * Based on pipe-test-1m.c by Ingo Molnar <mingo@redhat.com> * http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c * Ported to perf by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> - * */ - -#include "../perf.h" -#include "../util/util.h" -#include "../util/parse-options.h" -#include "../builtin.h" +#include <subcmd/parse-options.h> +#include <api/fs/fs.h> #include "bench.h" +#include "util/cgroup.h" #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <signal.h> #include <sys/wait.h> -#include <linux/unistd.h> #include <string.h> #include <errno.h> +#include <fcntl.h> #include <assert.h> +#include <sys/epoll.h> #include <sys/time.h> #include <sys/types.h> +#include <sys/syscall.h> +#include <linux/time64.h> + +#include <pthread.h> + +struct thread_data { + int nr; + int pipe_read; + int pipe_write; + struct epoll_event epoll_ev; + int epoll_fd; + bool cgroup_failed; + pthread_t pthread; +}; #define LOOPS_DEFAULT 1000000 -static int loops = LOOPS_DEFAULT; +static int loops = LOOPS_DEFAULT; + +/* Use processes by default: */ +static bool threaded; + +static bool nonblocking; +static char *cgrp_names[2]; +static struct cgroup *cgrps[2]; + +static int parse_two_cgroups(const struct option *opt __maybe_unused, + const char *str, int unset __maybe_unused) +{ + char *p = strdup(str); + char *q; + int ret = -1; + + if (p == NULL) { + fprintf(stderr, "memory allocation failure\n"); + return -1; + } + + q = strchr(p, ','); + if (q == NULL) { + fprintf(stderr, "it should have two cgroup names: %s\n", p); + goto out; + } + *q = '\0'; + + cgrp_names[0] = strdup(p); + cgrp_names[1] = strdup(q + 1); + + if (cgrp_names[0] == NULL || cgrp_names[1] == NULL) { + fprintf(stderr, "memory allocation failure\n"); + goto out; + } + ret = 0; + +out: + free(p); + return ret; +} static const struct option options[] = { - OPT_INTEGER('l', "loop", &loops, - "Specify number of loops"), + OPT_BOOLEAN('n', "nonblocking", &nonblocking, "Use non-blocking operations"), + OPT_INTEGER('l', "loop", &loops, "Specify number of loops"), + OPT_BOOLEAN('T', "threaded", &threaded, "Specify threads/process based task setup"), + OPT_CALLBACK('G', "cgroups", NULL, "SEND,RECV", + "Put sender and receivers in given cgroups", + parse_two_cgroups), OPT_END() }; @@ -42,78 +99,223 @@ static const char * const bench_sched_pipe_usage[] = { NULL }; -int bench_sched_pipe(int argc, const char **argv, - const char *prefix __maybe_unused) +static int enter_cgroup(int nr) { + char buf[32]; + int fd, len, ret; + int saved_errno; + struct cgroup *cgrp; + pid_t pid; + + if (cgrp_names[nr] == NULL) + return 0; + + if (cgrps[nr] == NULL) { + cgrps[nr] = cgroup__new(cgrp_names[nr], /*do_open=*/true); + if (cgrps[nr] == NULL) + goto err; + } + cgrp = cgrps[nr]; + + if (threaded) + pid = syscall(__NR_gettid); + else + pid = getpid(); + + snprintf(buf, sizeof(buf), "%d\n", pid); + len = strlen(buf); + + /* try cgroup v2 interface first */ + if (threaded) + fd = openat(cgrp->fd, "cgroup.threads", O_WRONLY); + else + fd = openat(cgrp->fd, "cgroup.procs", O_WRONLY); + + /* try cgroup v1 if failed */ + if (fd < 0 && errno == ENOENT) + fd = openat(cgrp->fd, "tasks", O_WRONLY); + + if (fd < 0) + goto err; + + ret = write(fd, buf, len); + close(fd); + + if (ret != len) { + printf("Cannot enter to cgroup: %s\n", cgrp->name); + return -1; + } + return 0; + +err: + saved_errno = errno; + printf("Failed to open cgroup file in %s\n", cgrp_names[nr]); + + if (saved_errno == ENOENT) { + char mnt[PATH_MAX]; + + if (cgroupfs_find_mountpoint(mnt, sizeof(mnt), "perf_event") == 0) + printf(" Hint: create the cgroup first, like 'mkdir %s/%s'\n", + mnt, cgrp_names[nr]); + } else if (saved_errno == EACCES && geteuid() > 0) { + printf(" Hint: try to run as root\n"); + } + + return -1; +} + +static void exit_cgroup(int nr) +{ + cgroup__put(cgrps[nr]); + free(cgrp_names[nr]); +} + +static inline int read_pipe(struct thread_data *td) +{ + int ret, m; +retry: + if (nonblocking) { + ret = epoll_wait(td->epoll_fd, &td->epoll_ev, 1, -1); + if (ret < 0) + return ret; + } + ret = read(td->pipe_read, &m, sizeof(int)); + if (nonblocking && ret < 0 && errno == EWOULDBLOCK) + goto retry; + return ret; +} + +static void *worker_thread(void *__tdata) +{ + struct thread_data *td = __tdata; + int i, ret, m = 0; + + ret = enter_cgroup(td->nr); + if (ret < 0) { + td->cgroup_failed = true; + return NULL; + } + + if (nonblocking) { + td->epoll_ev.events = EPOLLIN; + td->epoll_fd = epoll_create(1); + BUG_ON(td->epoll_fd < 0); + BUG_ON(epoll_ctl(td->epoll_fd, EPOLL_CTL_ADD, td->pipe_read, &td->epoll_ev) < 0); + } + + for (i = 0; i < loops; i++) { + ret = write(td->pipe_write, &m, sizeof(int)); + BUG_ON(ret != sizeof(int)); + ret = read_pipe(td); + BUG_ON(ret != sizeof(int)); + } + + return NULL; +} + +int bench_sched_pipe(int argc, const char **argv) +{ + struct thread_data threads[2] = {}; + struct thread_data *td; int pipe_1[2], pipe_2[2]; - int m = 0, i; struct timeval start, stop, diff; unsigned long long result_usec = 0; + int nr_threads = 2; + int t; /* * why does "ret" exist? * discarding returned value of read(), write() * causes error in building environment for perf */ - int __maybe_unused ret, wait_stat; + int __maybe_unused ret, wait_stat, flags = 0; pid_t pid, retpid __maybe_unused; - argc = parse_options(argc, argv, options, - bench_sched_pipe_usage, 0); + argc = parse_options(argc, argv, options, bench_sched_pipe_usage, 0); - BUG_ON(pipe(pipe_1)); - BUG_ON(pipe(pipe_2)); + if (nonblocking) + flags |= O_NONBLOCK; - pid = fork(); - assert(pid >= 0); + BUG_ON(pipe2(pipe_1, flags)); + BUG_ON(pipe2(pipe_2, flags)); gettimeofday(&start, NULL); - if (!pid) { - for (i = 0; i < loops; i++) { - ret = read(pipe_1[0], &m, sizeof(int)); - ret = write(pipe_2[1], &m, sizeof(int)); + for (t = 0; t < nr_threads; t++) { + td = threads + t; + + td->nr = t; + + if (t == 0) { + td->pipe_read = pipe_1[0]; + td->pipe_write = pipe_2[1]; + } else { + td->pipe_write = pipe_1[1]; + td->pipe_read = pipe_2[0]; + } + } + + if (threaded) { + for (t = 0; t < nr_threads; t++) { + td = threads + t; + + ret = pthread_create(&td->pthread, NULL, worker_thread, td); + BUG_ON(ret); + } + + for (t = 0; t < nr_threads; t++) { + td = threads + t; + + ret = pthread_join(td->pthread, NULL); + BUG_ON(ret); } } else { - for (i = 0; i < loops; i++) { - ret = write(pipe_1[1], &m, sizeof(int)); - ret = read(pipe_2[0], &m, sizeof(int)); + pid = fork(); + assert(pid >= 0); + + if (!pid) { + worker_thread(threads + 0); + exit(0); + } else { + worker_thread(threads + 1); } + + retpid = waitpid(pid, &wait_stat, 0); + assert((retpid == pid) && WIFEXITED(wait_stat)); } gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); - if (pid) { - retpid = waitpid(pid, &wait_stat, 0); - assert((retpid == pid) && WIFEXITED(wait_stat)); - } else { - exit(0); - } + exit_cgroup(0); + exit_cgroup(1); + + if (threads[0].cgroup_failed || threads[1].cgroup_failed) + return 0; switch (bench_format) { case BENCH_FORMAT_DEFAULT: - printf("# Executed %d pipe operations between two tasks\n\n", - loops); + printf("# Executed %d pipe operations between two %s\n\n", + loops, threaded ? "threads" : "processes"); - result_usec = diff.tv_sec * 1000000; + result_usec = diff.tv_sec * USEC_PER_SEC; result_usec += diff.tv_usec; printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", - diff.tv_sec, - (unsigned long) (diff.tv_usec/1000)); + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); printf(" %14lf usecs/op\n", (double)result_usec / (double)loops); printf(" %14d ops/sec\n", (int)((double)loops / - ((double)result_usec / (double)1000000))); + ((double)result_usec / (double)USEC_PER_SEC))); break; case BENCH_FORMAT_SIMPLE: printf("%lu.%03lu\n", - diff.tv_sec, - (unsigned long) (diff.tv_usec / 1000)); + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); break; default: diff --git a/tools/perf/bench/sched-seccomp-notify.c b/tools/perf/bench/sched-seccomp-notify.c new file mode 100644 index 000000000000..269c1f4a6852 --- /dev/null +++ b/tools/perf/bench/sched-seccomp-notify.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <subcmd/parse-options.h> +#include "bench.h" + +#include <uapi/linux/filter.h> +#include <sys/types.h> +#include <sys/time.h> +#include <linux/unistd.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <linux/time64.h> +#include <uapi/linux/seccomp.h> +#include <sys/prctl.h> + +#include <unistd.h> +#include <limits.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sys/wait.h> +#include <string.h> +#include <errno.h> +#include <err.h> +#include <inttypes.h> + +#define LOOPS_DEFAULT 1000000UL +static uint64_t loops = LOOPS_DEFAULT; +static bool sync_mode; + +static const struct option options[] = { + OPT_U64('l', "loop", &loops, "Specify number of loops"), + OPT_BOOLEAN('s', "sync-mode", &sync_mode, + "Enable the synchronous mode for seccomp notifications"), + OPT_END() +}; + +static const char * const bench_seccomp_usage[] = { + "perf bench sched secccomp-notify <options>", + NULL +}; + +static int seccomp(unsigned int op, unsigned int flags, void *args) +{ + return syscall(__NR_seccomp, op, flags, args); +} + +static int user_notif_syscall(int nr, unsigned int flags) +{ + struct sock_filter filter[] = { + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, + offsetof(struct seccomp_data, nr)), + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF), + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), + }; + + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + + return seccomp(SECCOMP_SET_MODE_FILTER, flags, &prog); +} + +#define USER_NOTIF_MAGIC INT_MAX +static void user_notification_sync_loop(int listener) +{ + struct seccomp_notif_resp resp; + struct seccomp_notif req; + uint64_t nr; + + for (nr = 0; nr < loops; nr++) { + memset(&req, 0, sizeof(req)); + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req)) + err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_RECV failed"); + + if (req.data.nr != __NR_gettid) + errx(EXIT_FAILURE, "unexpected syscall: %d", req.data.nr); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + resp.flags = 0; + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp)) + err(EXIT_FAILURE, "SECCOMP_IOCTL_NOTIF_SEND failed"); + } +} + +#ifndef SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP +#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0) +#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64) +#endif +int bench_sched_seccomp_notify(int argc, const char **argv) +{ + struct timeval start, stop, diff; + unsigned long long result_usec = 0; + int status, listener; + pid_t pid; + long ret; + + argc = parse_options(argc, argv, options, bench_seccomp_usage, 0); + + gettimeofday(&start, NULL); + + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + listener = user_notif_syscall(__NR_gettid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listener < 0) + err(EXIT_FAILURE, "can't create a notification descriptor"); + + pid = fork(); + if (pid < 0) + err(EXIT_FAILURE, "fork"); + if (pid == 0) { + if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0)) + err(EXIT_FAILURE, "can't set the parent death signal"); + while (1) { + ret = syscall(__NR_gettid); + if (ret == USER_NOTIF_MAGIC) + continue; + break; + } + _exit(1); + } + + if (sync_mode) { + if (ioctl(listener, SECCOMP_IOCTL_NOTIF_SET_FLAGS, + SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP, 0)) + err(EXIT_FAILURE, + "can't set SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP"); + } + user_notification_sync_loop(listener); + + kill(pid, SIGKILL); + if (waitpid(pid, &status, 0) != pid) + err(EXIT_FAILURE, "waitpid(%d) failed", pid); + if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) + errx(EXIT_FAILURE, "unexpected exit code: %d", status); + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Executed %" PRIu64 " system calls\n\n", + loops); + + result_usec = diff.tv_sec * USEC_PER_SEC; + result_usec += diff.tv_usec; + + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); + + printf(" %14lf usecs/op\n", + (double)result_usec / (double)loops); + printf(" %14d ops/sec\n", + (int)((double)loops / + ((double)result_usec / (double)USEC_PER_SEC))); + break; + + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / USEC_PER_MSEC)); + break; + + default: + /* reaching here is something disaster */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + break; + } + + return 0; +} diff --git a/tools/perf/bench/synthesize.c b/tools/perf/bench/synthesize.c new file mode 100644 index 000000000000..265d49a913d9 --- /dev/null +++ b/tools/perf/bench/synthesize.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Benchmark synthesis of perf events such as at the start of a 'perf + * record'. Synthesis is done on the current process and the 'dummy' event + * handlers are invoked that support dump_trace but otherwise do nothing. + * + * Copyright 2019 Google LLC. + */ +#include <errno.h> +#include <stdio.h> +#include "bench.h" +#include "../util/debug.h" +#include "../util/session.h" +#include "../util/stat.h" +#include "../util/synthetic-events.h" +#include "../util/target.h" +#include "../util/thread_map.h" +#include "../util/tool.h" +#include "../util/util.h" +#include <linux/atomic.h> +#include <linux/err.h> +#include <linux/time64.h> +#include <subcmd/parse-options.h> + +static unsigned int min_threads = 1; +static unsigned int max_threads = UINT_MAX; +static unsigned int single_iterations = 10000; +static unsigned int multi_iterations = 10; +static bool run_st; +static bool run_mt; + +static const struct option options[] = { + OPT_BOOLEAN('s', "st", &run_st, "Run single threaded benchmark"), + OPT_BOOLEAN('t', "mt", &run_mt, "Run multi-threaded benchmark"), + OPT_UINTEGER('m', "min-threads", &min_threads, + "Minimum number of threads in multithreaded bench"), + OPT_UINTEGER('M', "max-threads", &max_threads, + "Maximum number of threads in multithreaded bench"), + OPT_UINTEGER('i', "single-iterations", &single_iterations, + "Number of iterations used to compute single-threaded average"), + OPT_UINTEGER('I', "multi-iterations", &multi_iterations, + "Number of iterations used to compute multi-threaded average"), + OPT_END() +}; + +static const char *const bench_usage[] = { + "perf bench internals synthesize <options>", + NULL +}; + +static atomic_t event_count; + +static int process_synthesized_event(const struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_sample *sample __maybe_unused, + struct machine *machine __maybe_unused) +{ + atomic_inc(&event_count); + return 0; +} + +static int do_run_single_threaded(struct perf_session *session, + struct perf_thread_map *threads, + struct target *target, bool data_mmap) +{ + const unsigned int nr_threads_synthesize = 1; + struct timeval start, end, diff; + u64 runtime_us; + unsigned int i; + double time_average, time_stddev, event_average, event_stddev; + int err; + struct stats time_stats, event_stats; + + init_stats(&time_stats); + init_stats(&event_stats); + + for (i = 0; i < single_iterations; i++) { + atomic_set(&event_count, 0); + gettimeofday(&start, NULL); + err = __machine__synthesize_threads(&session->machines.host, + NULL, + target, threads, + process_synthesized_event, + true, data_mmap, + nr_threads_synthesize); + if (err) + return err; + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + update_stats(&event_stats, atomic_read(&event_count)); + } + + time_average = avg_stats(&time_stats); + time_stddev = stddev_stats(&time_stats); + printf(" Average %ssynthesis took: %.3f usec (+- %.3f usec)\n", + data_mmap ? "data " : "", time_average, time_stddev); + + event_average = avg_stats(&event_stats); + event_stddev = stddev_stats(&event_stats); + printf(" Average num. events: %.3f (+- %.3f)\n", + event_average, event_stddev); + + printf(" Average time per event %.3f usec\n", + time_average / event_average); + return 0; +} + +static int run_single_threaded(void) +{ + struct perf_session *session; + struct target target = { + .pid = "self", + }; + struct perf_thread_map *threads; + struct perf_env host_env; + int err; + + perf_set_singlethreaded(); + perf_env__init(&host_env); + session = __perf_session__new(/*data=*/NULL, /*tool=*/NULL, + /*trace_event_repipe=*/false, &host_env); + if (IS_ERR(session)) { + pr_err("Session creation failed.\n"); + perf_env__exit(&host_env); + return PTR_ERR(session); + } + threads = thread_map__new_by_pid(getpid()); + if (!threads) { + pr_err("Thread map creation failed.\n"); + err = -ENOMEM; + goto err_out; + } + + puts( +"Computing performance of single threaded perf event synthesis by\n" +"synthesizing events on the perf process itself:"); + + err = do_run_single_threaded(session, threads, &target, false); + if (err) + goto err_out; + + err = do_run_single_threaded(session, threads, &target, true); + +err_out: + if (threads) + perf_thread_map__put(threads); + + perf_session__delete(session); + perf_env__exit(&host_env); + return err; +} + +static int do_run_multi_threaded(struct target *target, + unsigned int nr_threads_synthesize) +{ + struct timeval start, end, diff; + u64 runtime_us; + unsigned int i; + double time_average, time_stddev, event_average, event_stddev; + int err = 0; + struct stats time_stats, event_stats; + struct perf_session *session; + struct perf_env host_env; + + perf_env__init(&host_env); + init_stats(&time_stats); + init_stats(&event_stats); + for (i = 0; i < multi_iterations; i++) { + session = __perf_session__new(/*data=*/NULL, /*tool=*/NULL, + /*trace_event_repipe=*/false, &host_env); + if (IS_ERR(session)) { + err = PTR_ERR(session); + goto err_out; + } + atomic_set(&event_count, 0); + gettimeofday(&start, NULL); + err = __machine__synthesize_threads(&session->machines.host, + NULL, + target, NULL, + process_synthesized_event, + true, false, + nr_threads_synthesize); + if (err) { + perf_session__delete(session); + goto err_out; + } + + gettimeofday(&end, NULL); + timersub(&end, &start, &diff); + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec; + update_stats(&time_stats, runtime_us); + update_stats(&event_stats, atomic_read(&event_count)); + perf_session__delete(session); + } + + time_average = avg_stats(&time_stats); + time_stddev = stddev_stats(&time_stats); + printf(" Average synthesis took: %.3f usec (+- %.3f usec)\n", + time_average, time_stddev); + + event_average = avg_stats(&event_stats); + event_stddev = stddev_stats(&event_stats); + printf(" Average num. events: %.3f (+- %.3f)\n", + event_average, event_stddev); + + printf(" Average time per event %.3f usec\n", + time_average / event_average); +err_out: + perf_env__exit(&host_env); + return err; +} + +static int run_multi_threaded(void) +{ + struct target target = { + .cpu_list = "0" + }; + unsigned int nr_threads_synthesize; + int err; + + if (max_threads == UINT_MAX) + max_threads = sysconf(_SC_NPROCESSORS_ONLN); + + puts( +"Computing performance of multi threaded perf event synthesis by\n" +"synthesizing events on CPU 0:"); + + for (nr_threads_synthesize = min_threads; + nr_threads_synthesize <= max_threads; + nr_threads_synthesize++) { + if (nr_threads_synthesize == 1) + perf_set_singlethreaded(); + else + perf_set_multithreaded(); + + printf(" Number of synthesis threads: %u\n", + nr_threads_synthesize); + + err = do_run_multi_threaded(&target, nr_threads_synthesize); + if (err) + return err; + } + perf_set_singlethreaded(); + return 0; +} + +int bench_synthesize(int argc, const char **argv) +{ + int err = 0; + + argc = parse_options(argc, argv, options, bench_usage, 0); + if (argc) { + usage_with_options(bench_usage, options); + exit(EXIT_FAILURE); + } + + /* + * If neither single threaded or multi-threaded are specified, default + * to running just single threaded. + */ + if (!run_st && !run_mt) + run_st = true; + + if (run_st) + err = run_single_threaded(); + + if (!err && run_mt) + err = run_multi_threaded(); + + return err; +} diff --git a/tools/perf/bench/syscall.c b/tools/perf/bench/syscall.c new file mode 100644 index 000000000000..e7dc216f717f --- /dev/null +++ b/tools/perf/bench/syscall.c @@ -0,0 +1,188 @@ +/* + * + * syscall.c + * + * syscall: Benchmark for system call performance + */ +#include "../perf.h" +#include "../util/util.h" +#include <subcmd/parse-options.h> +#include "../builtin.h" +#include "bench.h" + +#include <stdio.h> +#include <sys/time.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <unistd.h> +#include <stdlib.h> + +#ifndef __NR_fork +#define __NR_fork -1 +#endif + +static int loops; + +static const struct option options[] = { + OPT_INTEGER('l', "loop", &loops, "Specify number of loops"), + OPT_END() +}; + +static const char * const bench_syscall_usage[] = { + "perf bench syscall <options>", + NULL +}; + +static void test_fork(void) +{ + pid_t pid = fork(); + + if (pid < 0) { + fprintf(stderr, "fork failed\n"); + exit(1); + } else if (pid == 0) { + exit(0); + } else { + if (waitpid(pid, NULL, 0) < 0) { + fprintf(stderr, "waitpid failed\n"); + exit(1); + } + } +} + +static void test_execve(void) +{ + const char *pathname = "/bin/true"; + char *const argv[] = { (char *)pathname, NULL }; + pid_t pid = fork(); + + if (pid < 0) { + fprintf(stderr, "fork failed\n"); + exit(1); + } else if (pid == 0) { + execve(pathname, argv, NULL); + fprintf(stderr, "execve /bin/true failed\n"); + exit(1); + } else { + if (waitpid(pid, NULL, 0) < 0) { + fprintf(stderr, "waitpid failed\n"); + exit(1); + } + } +} + +static int bench_syscall_common(int argc, const char **argv, int syscall) +{ + struct timeval start, stop, diff; + unsigned long long result_usec = 0; + const char *name = NULL; + int i; + + switch (syscall) { + case __NR_fork: + case __NR_execve: + /* Limit default loop to 10000 times to save time */ + loops = 10000; + break; + default: + loops = 10000000; + break; + } + + /* Options -l and --loops override default above */ + argc = parse_options(argc, argv, options, bench_syscall_usage, 0); + + gettimeofday(&start, NULL); + + for (i = 0; i < loops; i++) { + switch (syscall) { + case __NR_getppid: + getppid(); + break; + case __NR_getpgid: + getpgid(0); + break; + case __NR_fork: + test_fork(); + break; + case __NR_execve: + test_execve(); + default: + break; + } + } + + gettimeofday(&stop, NULL); + timersub(&stop, &start, &diff); + + switch (syscall) { + case __NR_getppid: + name = "getppid()"; + break; + case __NR_getpgid: + name = "getpgid()"; + break; + case __NR_fork: + name = "fork()"; + break; + case __NR_execve: + name = "execve()"; + break; + default: + break; + } + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + printf("# Executed %'d %s calls\n", loops, name); + + result_usec = diff.tv_sec * 1000000; + result_usec += diff.tv_usec; + + printf(" %14s: %lu.%03lu [sec]\n\n", "Total time", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec/1000)); + + printf(" %14lf usecs/op\n", + (double)result_usec / (double)loops); + printf(" %'14d ops/sec\n", + (int)((double)loops / + ((double)result_usec / (double)1000000))); + break; + + case BENCH_FORMAT_SIMPLE: + printf("%lu.%03lu\n", + (unsigned long) diff.tv_sec, + (unsigned long) (diff.tv_usec / 1000)); + break; + + default: + /* reaching here is something disaster */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + break; + } + + return 0; +} + +int bench_syscall_basic(int argc, const char **argv) +{ + return bench_syscall_common(argc, argv, __NR_getppid); +} + +int bench_syscall_getpgid(int argc, const char **argv) +{ + return bench_syscall_common(argc, argv, __NR_getpgid); +} + +int bench_syscall_fork(int argc, const char **argv) +{ + return bench_syscall_common(argc, argv, __NR_fork); +} + +int bench_syscall_execve(int argc, const char **argv) +{ + return bench_syscall_common(argc, argv, __NR_execve); +} diff --git a/tools/perf/bench/uprobe.c b/tools/perf/bench/uprobe.c new file mode 100644 index 000000000000..0b90275862e1 --- /dev/null +++ b/tools/perf/bench/uprobe.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +/* + * uprobe.c + * + * uprobe benchmarks + * + * Copyright (C) 2023, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> + */ +#include "../perf.h" +#include "../util/util.h" +#include <subcmd/parse-options.h> +#include "../builtin.h" +#include "bench.h" +#include <linux/compiler.h> +#include <linux/time64.h> + +#include <inttypes.h> +#include <stdio.h> +#include <sys/time.h> +#include <sys/types.h> +#include <time.h> +#include <unistd.h> +#include <stdlib.h> + +#define LOOPS_DEFAULT 1000 +static int loops = LOOPS_DEFAULT; + +enum bench_uprobe { + BENCH_UPROBE__BASELINE, + BENCH_UPROBE__EMPTY, + BENCH_UPROBE__TRACE_PRINTK, + BENCH_UPROBE__EMPTY_RET, + BENCH_UPROBE__TRACE_PRINTK_RET, +}; + +static const struct option options[] = { + OPT_INTEGER('l', "loop", &loops, "Specify number of loops"), + OPT_END() +}; + +static const char * const bench_uprobe_usage[] = { + "perf bench uprobe <options>", + NULL +}; + +#ifdef HAVE_BPF_SKEL +#include "bpf_skel/bench_uprobe.skel.h" + +#define bench_uprobe__attach_uprobe(prog) \ + skel->links.prog = bpf_program__attach_uprobe_opts(/*prog=*/skel->progs.prog, \ + /*pid=*/-1, \ + /*binary_path=*/"libc.so.6", \ + /*func_offset=*/0, \ + /*opts=*/&uprobe_opts); \ + if (!skel->links.prog) { \ + err = -errno; \ + fprintf(stderr, "Failed to attach bench uprobe \"%s\": %s\n", #prog, strerror(errno)); \ + goto cleanup; \ + } + +struct bench_uprobe_bpf *skel; + +static int bench_uprobe__setup_bpf_skel(enum bench_uprobe bench) +{ + DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); + int err; + + /* Load and verify BPF application */ + skel = bench_uprobe_bpf__open(); + if (!skel) { + fprintf(stderr, "Failed to open and load uprobes bench BPF skeleton\n"); + return -1; + } + + err = bench_uprobe_bpf__load(skel); + if (err) { + fprintf(stderr, "Failed to load and verify BPF skeleton\n"); + goto cleanup; + } + + uprobe_opts.func_name = "usleep"; + switch (bench) { + case BENCH_UPROBE__BASELINE: break; + case BENCH_UPROBE__EMPTY: bench_uprobe__attach_uprobe(empty); break; + case BENCH_UPROBE__TRACE_PRINTK: bench_uprobe__attach_uprobe(trace_printk); break; + case BENCH_UPROBE__EMPTY_RET: bench_uprobe__attach_uprobe(empty_ret); break; + case BENCH_UPROBE__TRACE_PRINTK_RET: bench_uprobe__attach_uprobe(trace_printk_ret); break; + default: + fprintf(stderr, "Invalid bench: %d\n", bench); + goto cleanup; + } + + return err; +cleanup: + bench_uprobe_bpf__destroy(skel); + skel = NULL; + return err; +} + +static void bench_uprobe__teardown_bpf_skel(void) +{ + if (skel) { + bench_uprobe_bpf__destroy(skel); + skel = NULL; + } +} +#else +static int bench_uprobe__setup_bpf_skel(enum bench_uprobe bench __maybe_unused) { return 0; } +static void bench_uprobe__teardown_bpf_skel(void) {}; +#endif + +static int bench_uprobe_format__default_fprintf(const char *name, const char *unit, u64 diff, FILE *fp) +{ + static u64 baseline, previous; + s64 diff_to_baseline = diff - baseline, + diff_to_previous = diff - previous; + int printed = fprintf(fp, "# Executed %'d %s calls\n", loops, name); + + printed += fprintf(fp, " %14s: %'" PRIu64 " %ss", "Total time", diff, unit); + + if (baseline) { + printed += fprintf(fp, " %s%'" PRId64 " to baseline", diff_to_baseline > 0 ? "+" : "", diff_to_baseline); + + if (previous != baseline) + fprintf(stdout, " %s%'" PRId64 " to previous", diff_to_previous > 0 ? "+" : "", diff_to_previous); + } + + printed += fprintf(fp, "\n\n %'.3f %ss/op", (double)diff / (double)loops, unit); + + if (baseline) { + printed += fprintf(fp, " %'.3f %ss/op to baseline", (double)diff_to_baseline / (double)loops, unit); + + if (previous != baseline) + printed += fprintf(fp, " %'.3f %ss/op to previous", (double)diff_to_previous / (double)loops, unit); + } else { + baseline = diff; + } + + fputc('\n', fp); + + previous = diff; + + return printed + 1; +} + +static int bench_uprobe(int argc, const char **argv, enum bench_uprobe bench) +{ + const char *name = "usleep(1000)", *unit = "usec"; + struct timespec start, end; + u64 diff; + int i; + + argc = parse_options(argc, argv, options, bench_uprobe_usage, 0); + + if (bench != BENCH_UPROBE__BASELINE && bench_uprobe__setup_bpf_skel(bench) < 0) + return 0; + + clock_gettime(CLOCK_REALTIME, &start); + + for (i = 0; i < loops; i++) { + usleep(USEC_PER_MSEC); + } + + clock_gettime(CLOCK_REALTIME, &end); + + diff = end.tv_sec * NSEC_PER_SEC + end.tv_nsec - (start.tv_sec * NSEC_PER_SEC + start.tv_nsec); + diff /= NSEC_PER_USEC; + + switch (bench_format) { + case BENCH_FORMAT_DEFAULT: + bench_uprobe_format__default_fprintf(name, unit, diff, stdout); + break; + + case BENCH_FORMAT_SIMPLE: + printf("%" PRIu64 "\n", diff); + break; + + default: + /* reaching here is something of a disaster */ + fprintf(stderr, "Unknown format:%d\n", bench_format); + exit(1); + } + + if (bench != BENCH_UPROBE__BASELINE) + bench_uprobe__teardown_bpf_skel(); + + return 0; +} + +int bench_uprobe_baseline(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__BASELINE); +} + +int bench_uprobe_empty(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__EMPTY); +} + +int bench_uprobe_trace_printk(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK); +} + +int bench_uprobe_empty_ret(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__EMPTY_RET); +} + +int bench_uprobe_trace_printk_ret(int argc, const char **argv) +{ + return bench_uprobe(argc, argv, BENCH_UPROBE__TRACE_PRINTK_RET); +} |
