diff options
Diffstat (limited to 'tools/perf/builtin-trace.c')
-rw-r--r-- | tools/perf/builtin-trace.c | 1287 |
1 files changed, 998 insertions, 289 deletions
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 90eaff8c0f6e..6ac51925ea42 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -19,6 +19,7 @@ #ifdef HAVE_LIBBPF_SUPPORT #include <bpf/bpf.h> #include <bpf/libbpf.h> +#include <bpf/btf.h> #ifdef HAVE_BPF_SKEL #include "bpf_skel/augmented_raw_syscalls.skel.h" #endif @@ -38,6 +39,7 @@ #include "util/synthetic-events.h" #include "util/evlist.h" #include "util/evswitch.h" +#include "util/hashmap.h" #include "util/mmap.h" #include <subcmd/pager.h> #include <subcmd/exec-cmd.h> @@ -62,8 +64,9 @@ #include "print_binary.h" #include "string2.h" #include "syscalltbl.h" -#include "rb_resort.h" #include "../perf.h" +#include "trace_augment.h" +#include "dwarf-regs.h" #include <errno.h> #include <inttypes.h> @@ -84,9 +87,10 @@ #include <linux/ctype.h> #include <perf/mmap.h> +#include <tools/libc_compat.h> #ifdef HAVE_LIBTRACEEVENT -#include <traceevent/event-parse.h> +#include <event-parse.h> #endif #ifndef O_CLOEXEC @@ -101,6 +105,12 @@ /* * strtoul: Go from a string to a value, i.e. for msr: MSR_FS_BASE to 0xc0000100 + * + * We have to explicitely mark the direction of the flow of data, if from the + * kernel to user space or the other way around, since the BPF collector we + * have so far copies only from user to kernel space, mark the arguments that + * go that direction, so that we don“t end up collecting the previous contents + * for syscall args that goes from kernel to user space. */ struct syscall_arg_fmt { size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); @@ -109,7 +119,12 @@ struct syscall_arg_fmt { void *parm; const char *name; u16 nr_entries; // for arrays + bool from_user; bool show_zero; +#ifdef HAVE_LIBBPF_SUPPORT + const struct btf_type *type; + int type_id; /* used in btf_dump */ +#endif }; struct syscall_fmt { @@ -126,11 +141,19 @@ struct syscall_fmt { bool hexret; }; +enum summary_mode { + SUMMARY__NONE = 0, + SUMMARY__BY_TOTAL, + SUMMARY__BY_THREAD, +}; + struct trace { struct perf_tool tool; - struct syscalltbl *sctbl; struct { - struct syscall *table; + /** Sorted sycall numbers used by the trace. */ + struct syscall **table; + /** Size of table. */ + size_t table_size; struct { struct evsel *sys_enter, *sys_exit, @@ -140,6 +163,9 @@ struct trace { #ifdef HAVE_BPF_SKEL struct augmented_raw_syscalls_bpf *skel; #endif +#ifdef HAVE_LIBBPF_SUPPORT + struct btf *btf; +#endif struct record_opts opts; struct evlist *evlist; struct machine *host; @@ -161,14 +187,25 @@ struct trace { pid_t *entries; struct bpf_map *map; } filter_pids; + /* + * TODO: The map is from an ID (aka system call number) to struct + * syscall_stats. If there is >1 e_machine, such as i386 and x86-64 + * processes, then the stats here will gather wrong the statistics for + * the non EM_HOST system calls. A fix would be to add the e_machine + * into the key, but this would make the code inconsistent with the + * per-thread version. + */ + struct hashmap *syscall_stats; double duration_filter; double runtime_ms; + unsigned long pfmaj, pfmin; struct { u64 vfs_getname, proc_getname; } stats; unsigned int max_stack; unsigned int min_stack; + enum summary_mode summary_mode; int raw_augmented_syscalls_args_size; bool raw_augmented_syscalls; bool fd_path_disabled; @@ -196,6 +233,7 @@ struct trace { bool show_string_prefix; bool force; bool vfs_getname; + bool force_btf; int trace_pgfaults; char *perfconfig_events; struct { @@ -204,6 +242,20 @@ struct trace { } oe; }; +static void trace__load_vmlinux_btf(struct trace *trace __maybe_unused) +{ +#ifdef HAVE_LIBBPF_SUPPORT + if (trace->btf != NULL) + return; + + trace->btf = btf__load_vmlinux_btf(); + if (verbose > 0) { + fprintf(trace->output, trace->btf ? "vmlinux BTF loaded\n" : + "Failed to load vmlinux BTF\n"); + } +#endif +} + struct tp_field { int offset; union { @@ -358,7 +410,12 @@ static struct syscall_arg_fmt *evsel__syscall_arg_fmt(struct evsel *evsel) } if (et->fmt == NULL) { - et->fmt = calloc(evsel->tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt)); + const struct tep_event *tp_format = evsel__tp_format(evsel); + + if (tp_format == NULL) + goto out_delete; + + et->fmt = calloc(tp_format->format.nr_fields, sizeof(struct syscall_arg_fmt)); if (et->fmt == NULL) goto out_delete; } @@ -765,7 +822,7 @@ static const char *fcntl_cmds[] = { static DEFINE_STRARRAY(fcntl_cmds, "F_"); static const char *fcntl_linux_specific_cmds[] = { - "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC", + "SETLEASE", "GETLEASE", "NOTIFY", "DUPFD_QUERY", [5] = "CANCELLK", "DUPFD_CLOEXEC", "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS", "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT", }; @@ -830,6 +887,15 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, #define SCA_FILENAME syscall_arg__scnprintf_filename +// 'argname' is just documentational at this point, to remove the previous comment with that info +#define SCA_FILENAME_FROM_USER(argname) \ + { .scnprintf = SCA_FILENAME, \ + .from_user = true, } + +static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg); + +#define SCA_BUF syscall_arg__scnprintf_buf + static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, struct syscall_arg *arg) { @@ -887,6 +953,177 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags +#ifdef HAVE_LIBBPF_SUPPORT +static void syscall_arg_fmt__cache_btf_enum(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) +{ + int id; + + type = strstr(type, "enum "); + if (type == NULL) + return; + + type += 5; // skip "enum " to get the enumeration name + + id = btf__find_by_name(btf, type); + if (id < 0) + return; + + arg_fmt->type = btf__type_by_id(btf, id); +} + +static bool syscall_arg__strtoul_btf_enum(char *bf, size_t size, struct syscall_arg *arg, u64 *val) +{ + const struct btf_type *bt = arg->fmt->type; + struct btf *btf = arg->trace->btf; + struct btf_enum *be = btf_enum(bt); + + for (int i = 0; i < btf_vlen(bt); ++i, ++be) { + const char *name = btf__name_by_offset(btf, be->name_off); + int max_len = max(size, strlen(name)); + + if (strncmp(name, bf, max_len) == 0) { + *val = be->val; + return true; + } + } + + return false; +} + +static bool syscall_arg__strtoul_btf_type(char *bf, size_t size, struct syscall_arg *arg, u64 *val) +{ + const struct btf_type *bt; + char *type = arg->type_name; + struct btf *btf; + + trace__load_vmlinux_btf(arg->trace); + + btf = arg->trace->btf; + if (btf == NULL) + return false; + + if (arg->fmt->type == NULL) { + // See if this is an enum + syscall_arg_fmt__cache_btf_enum(arg->fmt, btf, type); + } + + // Now let's see if we have a BTF type resolved + bt = arg->fmt->type; + if (bt == NULL) + return false; + + // If it is an enum: + if (btf_is_enum(arg->fmt->type)) + return syscall_arg__strtoul_btf_enum(bf, size, arg, val); + + return false; +} + +static size_t btf_enum_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, int val) +{ + struct btf_enum *be = btf_enum(type); + const int nr_entries = btf_vlen(type); + + for (int i = 0; i < nr_entries; ++i, ++be) { + if (be->val == val) { + return scnprintf(bf, size, "%s", + btf__name_by_offset(btf, be->name_off)); + } + } + + return 0; +} + +struct trace_btf_dump_snprintf_ctx { + char *bf; + size_t printed, size; +}; + +static void trace__btf_dump_snprintf(void *vctx, const char *fmt, va_list args) +{ + struct trace_btf_dump_snprintf_ctx *ctx = vctx; + + ctx->printed += vscnprintf(ctx->bf + ctx->printed, ctx->size - ctx->printed, fmt, args); +} + +static size_t btf_struct_scnprintf(const struct btf_type *type, struct btf *btf, char *bf, size_t size, struct syscall_arg *arg) +{ + struct trace_btf_dump_snprintf_ctx ctx = { + .bf = bf, + .size = size, + }; + struct augmented_arg *augmented_arg = arg->augmented.args; + int type_id = arg->fmt->type_id, consumed; + struct btf_dump *btf_dump; + + LIBBPF_OPTS(btf_dump_opts, dump_opts); + LIBBPF_OPTS(btf_dump_type_data_opts, dump_data_opts); + + if (arg == NULL || arg->augmented.args == NULL) + return 0; + + dump_data_opts.compact = true; + dump_data_opts.skip_names = !arg->trace->show_arg_names; + + btf_dump = btf_dump__new(btf, trace__btf_dump_snprintf, &ctx, &dump_opts); + if (btf_dump == NULL) + return 0; + + /* pretty print the struct data here */ + if (btf_dump__dump_type_data(btf_dump, type_id, arg->augmented.args->value, type->size, &dump_data_opts) == 0) + return 0; + + consumed = sizeof(*augmented_arg) + augmented_arg->size; + arg->augmented.args = ((void *)arg->augmented.args) + consumed; + arg->augmented.size -= consumed; + + btf_dump__free(btf_dump); + + return ctx.printed; +} + +static size_t trace__btf_scnprintf(struct trace *trace, struct syscall_arg *arg, char *bf, + size_t size, int val, char *type) +{ + struct syscall_arg_fmt *arg_fmt = arg->fmt; + + if (trace->btf == NULL) + return 0; + + if (arg_fmt->type == NULL) { + // Check if this is an enum and if we have the BTF type for it. + syscall_arg_fmt__cache_btf_enum(arg_fmt, trace->btf, type); + } + + // Did we manage to find a BTF type for the syscall/tracepoint argument? + if (arg_fmt->type == NULL) + return 0; + + if (btf_is_enum(arg_fmt->type)) + return btf_enum_scnprintf(arg_fmt->type, trace->btf, bf, size, val); + else if (btf_is_struct(arg_fmt->type) || btf_is_union(arg_fmt->type)) + return btf_struct_scnprintf(arg_fmt->type, trace->btf, bf, size, arg); + + return 0; +} + +#else // HAVE_LIBBPF_SUPPORT +static size_t trace__btf_scnprintf(struct trace *trace __maybe_unused, struct syscall_arg *arg __maybe_unused, + char *bf __maybe_unused, size_t size __maybe_unused, int val __maybe_unused, + char *type __maybe_unused) +{ + return 0; +} + +static bool syscall_arg__strtoul_btf_type(char *bf __maybe_unused, size_t size __maybe_unused, + struct syscall_arg *arg __maybe_unused, u64 *val __maybe_unused) +{ + return false; +} +#endif // HAVE_LIBBPF_SUPPORT + +#define STUL_BTF_TYPE syscall_arg__strtoul_btf_type + #define STRARRAY(name, array) \ { .scnprintf = SCA_STRARRAY, \ .strtoul = STUL_STRARRAY, \ @@ -897,7 +1134,6 @@ static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, .strtoul = STUL_STRARRAY_FLAGS, \ .parm = &strarray__##array, } -#include "trace/beauty/arch_errno_names.c" #include "trace/beauty/eventfd.c" #include "trace/beauty/futex_op.c" #include "trace/beauty/futex_val3.c" @@ -921,16 +1157,17 @@ static const struct syscall_fmt syscall_fmts[] = { [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, }, { .name = "bind", .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ }, - [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, + [1] = SCA_SOCKADDR_FROM_USER(umyaddr), [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, }, { .name = "bpf", - .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, + .arg = { [0] = STRARRAY(cmd, bpf_cmd), + [1] = { .from_user = true /* attr */, }, } }, { .name = "brk", .hexret = true, .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, }, { .name = "clock_gettime", .arg = { [0] = STRARRAY(clk_id, clockid), }, }, { .name = "clock_nanosleep", - .arg = { [2] = { .scnprintf = SCA_TIMESPEC, /* rqtp */ }, }, }, + .arg = { [2] = SCA_TIMESPEC_FROM_USER(req), }, }, { .name = "clone", .errpid = true, .nr_args = 5, .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, }, [1] = { .name = "child_stack", .scnprintf = SCA_HEX, }, @@ -941,12 +1178,21 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, }, { .name = "connect", .arg = { [0] = { .scnprintf = SCA_INT, /* fd */ }, - [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, + [1] = SCA_SOCKADDR_FROM_USER(servaddr), [2] = { .scnprintf = SCA_INT, /* addrlen */ }, }, }, { .name = "epoll_ctl", .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, { .name = "eventfd2", .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, }, + { .name = "faccessat", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), + [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, + { .name = "faccessat2", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), + [2] = { .scnprintf = SCA_ACCMODE, /* mode */ }, + [3] = { .scnprintf = SCA_FACCESSAT2_FLAGS, /* flags */ }, }, }, { .name = "fchmodat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, { .name = "fchownat", @@ -966,10 +1212,9 @@ static const struct syscall_fmt syscall_fmts[] = { [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, }, { .name = "fspick", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* path */ }, + [1] = SCA_FILENAME_FROM_USER(path), [2] = { .scnprintf = SCA_FSPICK_FLAGS, /* flags */ }, }, }, { .name = "fstat", .alias = "newfstat", }, - { .name = "fstatat", .alias = "newfstatat", }, { .name = "futex", .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, }, @@ -1025,32 +1270,36 @@ static const struct syscall_fmt syscall_fmts[] = { #if defined(__s390x__) .alias = "old_mmap", #endif - .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, + .arg = { [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ .strtoul = STUL_STRARRAY_FLAGS, .parm = &strarray__mmap_flags, }, [5] = { .scnprintf = SCA_HEX, /* offset */ }, }, }, { .name = "mount", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ }, + .arg = { [0] = SCA_FILENAME_FROM_USER(devname), [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */ .mask_val = SCAMV_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "move_mount", .arg = { [0] = { .scnprintf = SCA_FDAT, /* from_dfd */ }, - [1] = { .scnprintf = SCA_FILENAME, /* from_pathname */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), [2] = { .scnprintf = SCA_FDAT, /* to_dfd */ }, - [3] = { .scnprintf = SCA_FILENAME, /* to_pathname */ }, + [3] = SCA_FILENAME_FROM_USER(pathname), [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, }, { .name = "mprotect", .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, - [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, }, + [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, }, }, { .name = "mq_unlink", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(u_name), }, }, { .name = "mremap", .hexret = true, .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, }, { .name = "name_to_handle_at", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, - { .name = "newfstatat", - .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, + { .name = "nanosleep", + .arg = { [0] = SCA_TIMESPEC_FROM_USER(req), }, }, + { .name = "newfstatat", .alias = "fstatat", + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), + [3] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "open", .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, { .name = "open_by_handle_at", @@ -1060,7 +1309,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, { .name = "perf_event_open", - .arg = { [0] = { .scnprintf = SCA_PERF_ATTR, /* attr */ }, + .arg = { [0] = SCA_PERF_ATTR_FROM_USER(attr), [2] = { .scnprintf = SCA_INT, /* cpu */ }, [3] = { .scnprintf = SCA_FD, /* group_fd */ }, [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, }, @@ -1072,7 +1321,7 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, }, { .name = "pkey_mprotect", .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, - [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, + [2] = { .scnprintf = SCA_MMAP_PROT, .show_zero = true, /* prot */ }, [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, }, { .name = "poll", .timeout = true, }, { .name = "ppoll", .timeout = true, }, @@ -1085,7 +1334,8 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "pread", .alias = "pread64", }, { .name = "preadv", .alias = "pread", }, { .name = "prlimit64", - .arg = { [1] = STRARRAY(resource, rlimit_resources), }, }, + .arg = { [1] = STRARRAY(resource, rlimit_resources), + [2] = { .from_user = true /* new_rlim */, }, }, }, { .name = "pwrite", .alias = "pwrite64", }, { .name = "readlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, @@ -1102,6 +1352,8 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ }, [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, }, + { .name = "rseq", .errpid = true, + .arg = { [0] = { .from_user = true /* rseq */, }, }, }, { .name = "rt_sigaction", .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, { .name = "rt_sigprocmask", @@ -1123,12 +1375,15 @@ static const struct syscall_fmt syscall_fmts[] = { .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, { .name = "sendto", .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, - [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, }, + [4] = SCA_SOCKADDR_FROM_USER(addr), }, }, + { .name = "set_robust_list", .errpid = true, + .arg = { [0] = { .from_user = true /* head */, }, }, }, { .name = "set_tid_address", .errpid = true, }, { .name = "setitimer", .arg = { [0] = STRARRAY(which, itimers), }, }, { .name = "setrlimit", - .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, + .arg = { [0] = STRARRAY(resource, rlimit_resources), + [1] = { .from_user = true /* rlim */, }, }, }, { .name = "setsockopt", .arg = { [1] = STRARRAY(level, socket_level), }, }, { .name = "socket", @@ -1142,12 +1397,12 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "stat", .alias = "newstat", }, { .name = "statx", .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ }, - [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } , + [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ } , [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, }, { .name = "swapoff", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, }, { .name = "swapon", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(specialfile), }, }, { .name = "symlinkat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, { .name = "sync_file_range", @@ -1157,16 +1412,20 @@ static const struct syscall_fmt syscall_fmts[] = { { .name = "tkill", .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, { .name = "umount2", .alias = "umount", - .arg = { [0] = { .scnprintf = SCA_FILENAME, /* name */ }, }, }, + .arg = { [0] = SCA_FILENAME_FROM_USER(name), }, }, { .name = "uname", .alias = "newuname", }, { .name = "unlinkat", - .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, + .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, + [1] = SCA_FILENAME_FROM_USER(pathname), + [2] = { .scnprintf = SCA_FS_AT_FLAGS, /* flags */ }, }, }, { .name = "utimensat", .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, }, { .name = "wait4", .errpid = true, .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, { .name = "waitid", .errpid = true, .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, + { .name = "write", + .arg = { [1] = { .scnprintf = SCA_BUF /* buf */, .from_user = true, }, }, }, }; static int syscall_fmt__cmp(const void *name, const void *fmtp) @@ -1207,23 +1466,39 @@ static const struct syscall_fmt *syscall_fmt__find_by_alias(const char *alias) return __syscall_fmt__find_by_alias(syscall_fmts, nmemb, alias); } -/* - * is_exit: is this "exit" or "exit_group"? - * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter. - * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc. - * nonexistent: Just a hole in the syscall table, syscall id not allocated +/** + * struct syscall */ struct syscall { + /** @e_machine: The ELF machine associated with the entry. */ + int e_machine; + /** @id: id value from the tracepoint, the system call number. */ + int id; struct tep_event *tp_format; int nr_args; + /** + * @args_size: sum of the sizes of the syscall arguments, anything + * after that is augmented stuff: pathname for openat, etc. + */ + int args_size; struct { struct bpf_program *sys_enter, *sys_exit; } bpf_prog; + /** @is_exit: is this "exit" or "exit_group"? */ bool is_exit; + /** + * @is_open: is this "open" or "openat"? To associate the fd returned in + * sys_exit with the pathname in sys_enter. + */ bool is_open; + /** + * @nonexistent: Name lookup failed. Just a hole in the syscall table, + * syscall id not allocated. + */ bool nonexistent; + bool use_btf; struct tep_format_field *args; const char *name; const struct syscall_fmt *fmt; @@ -1280,16 +1555,48 @@ struct thread_trace { struct file *table; } files; - struct intlist *syscall_stats; + struct hashmap *syscall_stats; }; -static struct thread_trace *thread_trace__new(void) +static size_t syscall_id_hash(long key, void *ctx __maybe_unused) +{ + return key; +} + +static bool syscall_id_equal(long key1, long key2, void *ctx __maybe_unused) +{ + return key1 == key2; +} + +static struct hashmap *alloc_syscall_stats(void) +{ + return hashmap__new(syscall_id_hash, syscall_id_equal, NULL); +} + +static void delete_syscall_stats(struct hashmap *syscall_stats) +{ + struct hashmap_entry *pos; + size_t bkt; + + if (syscall_stats == NULL) + return; + + hashmap__for_each_entry(syscall_stats, pos, bkt) + zfree(&pos->pvalue); + hashmap__free(syscall_stats); +} + +static struct thread_trace *thread_trace__new(struct trace *trace) { struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); if (ttrace) { ttrace->files.max = -1; - ttrace->syscall_stats = intlist__new(NULL); + if (trace->summary) { + ttrace->syscall_stats = alloc_syscall_stats(); + if (IS_ERR(ttrace->syscall_stats)) + zfree(&ttrace); + } } return ttrace; @@ -1304,14 +1611,14 @@ static void thread_trace__delete(void *pttrace) if (!ttrace) return; - intlist__delete(ttrace->syscall_stats); + delete_syscall_stats(ttrace->syscall_stats); ttrace->syscall_stats = NULL; thread_trace__free_files(ttrace); zfree(&ttrace->entry_str); free(ttrace); } -static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) +static struct thread_trace *thread__trace(struct thread *thread, struct trace *trace) { struct thread_trace *ttrace; @@ -1319,7 +1626,7 @@ static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) goto fail; if (thread__priv(thread) == NULL) - thread__set_priv(thread, thread_trace__new()); + thread__set_priv(thread, thread_trace__new(trace)); if (thread__priv(thread) == NULL) goto fail; @@ -1329,7 +1636,7 @@ static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) return ttrace; fail: - color_fprintf(fp, PERF_COLOR_RED, + color_fprintf(trace->output, PERF_COLOR_RED, "WARNING: not enough memory, dropping samples!\n"); return NULL; } @@ -1537,6 +1844,32 @@ static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, return 0; } +#define MAX_CONTROL_CHAR 31 +#define MAX_ASCII 127 + +static size_t syscall_arg__scnprintf_buf(char *bf, size_t size, struct syscall_arg *arg) +{ + struct augmented_arg *augmented_arg = arg->augmented.args; + unsigned char *orig = (unsigned char *)augmented_arg->value; + size_t printed = 0; + int consumed; + + if (augmented_arg == NULL) + return 0; + + for (int j = 0; j < augmented_arg->size; ++j) { + bool control_char = orig[j] <= MAX_CONTROL_CHAR || orig[j] >= MAX_ASCII; + /* print control characters (0~31 and 127), and non-ascii characters in \(digits) */ + printed += scnprintf(bf + printed, size - printed, control_char ? "\\%d" : "%c", (int)orig[j]); + } + + consumed = sizeof(*augmented_arg) + augmented_arg->size; + arg->augmented.args = ((void *)arg->augmented.args) + consumed; + arg->augmented.size -= consumed; + + return printed; +} + static bool trace__filter_duration(struct trace *trace, double t) { return t < (trace->duration_filter * NSEC_PER_MSEC); @@ -1612,7 +1945,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine, switch (event->header.type) { case PERF_RECORD_LOST: color_fprintf(trace->output, PERF_COLOR_RED, - "LOST %" PRIu64 " events!\n", event->lost.lost); + "LOST %" PRIu64 " events!\n", (u64)event->lost.lost); ret = machine__process_lost_event(machine, event, sample); break; default: @@ -1623,7 +1956,7 @@ static int trace__process_event(struct trace *trace, struct machine *machine, return ret; } -static int trace__tool_process(struct perf_tool *tool, +static int trace__tool_process(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct machine *machine) @@ -1730,7 +2063,8 @@ static const struct syscall_arg_fmt *syscall_arg_fmt__find_by_name(const char *n } static struct tep_format_field * -syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field) +syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field *field, + bool *use_btf) { struct tep_format_field *last_field = NULL; int len; @@ -1743,11 +2077,15 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field len = strlen(field->name); + // As far as heuristics (or intention) goes this seems to hold true, and makes sense! + if ((field->flags & TEP_FIELD_IS_POINTER) && strstarts(field->type, "const ")) + arg->from_user = true; + if (strcmp(field->type, "const char *") == 0 && ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) || - strstr(field->name, "path") != NULL)) + strstr(field->name, "path") != NULL)) { arg->scnprintf = SCA_FILENAME; - else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr")) + } else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr")) arg->scnprintf = SCA_PTR; else if (strcmp(field->type, "pid_t") == 0) arg->scnprintf = SCA_PID; @@ -1768,6 +2106,9 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field * 7 unsigned long */ arg->scnprintf = SCA_FD; + } else if (strstr(field->type, "enum") && use_btf != NULL) { + *use_btf = true; + arg->strtoul = STUL_BTF_TYPE; } else { const struct syscall_arg_fmt *fmt = syscall_arg_fmt__find_by_name(field->name); @@ -1784,7 +2125,8 @@ syscall_arg_fmt__init_array(struct syscall_arg_fmt *arg, struct tep_format_field static int syscall__set_arg_fmts(struct syscall *sc) { - struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args); + struct tep_format_field *last_field = syscall_arg_fmt__init_array(sc->arg_fmt, sc->args, + &sc->use_btf); if (last_field) sc->args_size = last_field->offset + last_field->size; @@ -1792,40 +2134,21 @@ static int syscall__set_arg_fmts(struct syscall *sc) return 0; } -static int trace__read_syscall_info(struct trace *trace, int id) +static int syscall__read_info(struct syscall *sc, struct trace *trace) { char tp_name[128]; - struct syscall *sc; - const char *name = syscalltbl__name(trace->sctbl, id); - -#ifdef HAVE_SYSCALL_TABLE_SUPPORT - if (trace->syscalls.table == NULL) { - trace->syscalls.table = calloc(trace->sctbl->syscalls.max_id + 1, sizeof(*sc)); - if (trace->syscalls.table == NULL) - return -ENOMEM; - } -#else - if (id > trace->sctbl->syscalls.max_id || (id == 0 && trace->syscalls.table == NULL)) { - // When using libaudit we don't know beforehand what is the max syscall id - struct syscall *table = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc)); - - if (table == NULL) - return -ENOMEM; - - // Need to memset from offset 0 and +1 members if brand new - if (trace->syscalls.table == NULL) - memset(table, 0, (id + 1) * sizeof(*sc)); - else - memset(table + trace->sctbl->syscalls.max_id + 1, 0, (id - trace->sctbl->syscalls.max_id) * sizeof(*sc)); + const char *name; + int err; - trace->syscalls.table = table; - trace->sctbl->syscalls.max_id = id; - } -#endif - sc = trace->syscalls.table + id; if (sc->nonexistent) return -EEXIST; + if (sc->name) { + /* Info already read. */ + return 0; + } + + name = syscalltbl__name(sc->e_machine, sc->id); if (name == NULL) { sc->nonexistent = true; return -EEXIST; @@ -1848,11 +2171,16 @@ static int trace__read_syscall_info(struct trace *trace, int id) */ if (IS_ERR(sc->tp_format)) { sc->nonexistent = true; - return PTR_ERR(sc->tp_format); + err = PTR_ERR(sc->tp_format); + sc->tp_format = NULL; + return err; } - if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? - RAW_SYSCALL_ARGS_NUM : sc->tp_format->format.nr_fields)) + /* + * The tracepoint format contains __syscall_nr field, so it's one more + * than the actual number of syscall arguments. + */ + if (syscall__alloc_arg_fmts(sc, sc->tp_format->format.nr_fields - 1)) return -ENOMEM; sc->args = sc->tp_format->format.fields; @@ -1869,16 +2197,26 @@ static int trace__read_syscall_info(struct trace *trace, int id) sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat"); - return syscall__set_arg_fmts(sc); + err = syscall__set_arg_fmts(sc); + + /* after calling syscall__set_arg_fmts() we'll know whether use_btf is true */ + if (sc->use_btf) + trace__load_vmlinux_btf(trace); + + return err; } -static int evsel__init_tp_arg_scnprintf(struct evsel *evsel) +static int evsel__init_tp_arg_scnprintf(struct evsel *evsel, bool *use_btf) { struct syscall_arg_fmt *fmt = evsel__syscall_arg_fmt(evsel); if (fmt != NULL) { - syscall_arg_fmt__init_array(fmt, evsel->tp_format->format.fields); - return 0; + const struct tep_event *tp_format = evsel__tp_format(evsel); + + if (tp_format) { + syscall_arg_fmt__init_array(fmt, tp_format->format.fields, use_btf); + return 0; + } } return -ENOMEM; @@ -1910,10 +2248,14 @@ static int trace__validate_ev_qualifier(struct trace *trace) strlist__for_each_entry(pos, trace->ev_qualifier) { const char *sc = pos->s; - int id = syscalltbl__id(trace->sctbl, sc), match_next = -1; + /* + * TODO: Assume more than the validation/warnings are all for + * the same binary type as perf. + */ + int id = syscalltbl__id(EM_HOST, sc), match_next = -1; if (id < 0) { - id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next); + id = syscalltbl__strglobmatch_first(EM_HOST, sc, &match_next); if (id >= 0) goto matches; @@ -1933,7 +2275,7 @@ matches: continue; while (1) { - id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next); + id = syscalltbl__strglobmatch_next(EM_HOST, sc, &match_next); if (id < 0) break; if (nr_allocated == nr_used) { @@ -2036,7 +2378,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, unsigned char *args, void *augmented_args, int augmented_args_size, struct trace *trace, struct thread *thread) { - size_t printed = 0; + size_t printed = 0, btf_printed; unsigned long val; u8 bit = 1; struct syscall_arg arg = { @@ -2052,6 +2394,7 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, .show_string_prefix = trace->show_string_prefix, }; struct thread_trace *ttrace = thread__priv(thread); + void *default_scnprintf; /* * Things like fcntl will set this in its 'cmd' formatter to pick the @@ -2077,17 +2420,15 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, val = syscall_arg_fmt__mask_val(&sc->arg_fmt[arg.idx], &arg, val); /* - * Suppress this argument if its value is zero and - * and we don't have a string associated in an - * strarray for it. - */ - if (val == 0 && - !trace->show_zeros && - !(sc->arg_fmt && - (sc->arg_fmt[arg.idx].show_zero || - sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY || - sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) && - sc->arg_fmt[arg.idx].parm)) + * Suppress this argument if its value is zero and show_zero + * property isn't set. + * + * If it has a BTF type, then override the zero suppression knob + * as the common case is for zero in an enum to have an associated entry. + */ + if (val == 0 && !trace->show_zeros && + !(sc->arg_fmt && sc->arg_fmt[arg.idx].show_zero) && + !(sc->arg_fmt && sc->arg_fmt[arg.idx].strtoul == STUL_BTF_TYPE)) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); @@ -2095,6 +2436,17 @@ static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); + default_scnprintf = sc->arg_fmt[arg.idx].scnprintf; + + if (trace->force_btf || default_scnprintf == NULL || default_scnprintf == SCA_PTR) { + btf_printed = trace__btf_scnprintf(trace, &arg, bf + printed, + size - printed, val, field->type); + if (btf_printed) { + printed += btf_printed; + continue; + } + } + printed += syscall_arg_fmt__scnprintf_val(&sc->arg_fmt[arg.idx], bf + printed, size - printed, &arg, val); } @@ -2121,13 +2473,92 @@ next_arg: return printed; } +static struct syscall *syscall__new(int e_machine, int id) +{ + struct syscall *sc = zalloc(sizeof(*sc)); + + if (!sc) + return NULL; + + sc->e_machine = e_machine; + sc->id = id; + return sc; +} + +static void syscall__delete(struct syscall *sc) +{ + if (!sc) + return; + + free(sc->arg_fmt); + free(sc); +} + +static int syscall__bsearch_cmp(const void *key, const void *entry) +{ + const struct syscall *a = key, *b = *((const struct syscall **)entry); + + if (a->e_machine != b->e_machine) + return a->e_machine - b->e_machine; + + return a->id - b->id; +} + +static int syscall__cmp(const void *va, const void *vb) +{ + const struct syscall *a = *((const struct syscall **)va); + const struct syscall *b = *((const struct syscall **)vb); + + if (a->e_machine != b->e_machine) + return a->e_machine - b->e_machine; + + return a->id - b->id; +} + +static struct syscall *trace__find_syscall(struct trace *trace, int e_machine, int id) +{ + struct syscall key = { + .e_machine = e_machine, + .id = id, + }; + struct syscall *sc, **tmp; + + if (trace->syscalls.table) { + struct syscall **sc_entry = bsearch(&key, trace->syscalls.table, + trace->syscalls.table_size, + sizeof(trace->syscalls.table[0]), + syscall__bsearch_cmp); + + if (sc_entry) + return *sc_entry; + } + + sc = syscall__new(e_machine, id); + if (!sc) + return NULL; + + tmp = reallocarray(trace->syscalls.table, trace->syscalls.table_size + 1, + sizeof(trace->syscalls.table[0])); + if (!tmp) { + syscall__delete(sc); + return NULL; + } + + trace->syscalls.table = tmp; + trace->syscalls.table[trace->syscalls.table_size++] = sc; + qsort(trace->syscalls.table, trace->syscalls.table_size, sizeof(trace->syscalls.table[0]), + syscall__cmp); + return sc; +} + typedef int (*tracepoint_handler)(struct trace *trace, struct evsel *evsel, union perf_event *event, struct perf_sample *sample); -static struct syscall *trace__syscall_info(struct trace *trace, - struct evsel *evsel, int id) +static struct syscall *trace__syscall_info(struct trace *trace, struct evsel *evsel, + int e_machine, int id) { + struct syscall *sc; int err = 0; if (id < 0) { @@ -2152,39 +2583,20 @@ static struct syscall *trace__syscall_info(struct trace *trace, err = -EINVAL; -#ifdef HAVE_SYSCALL_TABLE_SUPPORT - if (id > trace->sctbl->syscalls.max_id) { -#else - if (id >= trace->sctbl->syscalls.max_id) { - /* - * With libaudit we don't know beforehand what is the max_id, - * so we let trace__read_syscall_info() figure that out as we - * go on reading syscalls. - */ - err = trace__read_syscall_info(trace, id); - if (err) -#endif - goto out_cant_read; - } - - if ((trace->syscalls.table == NULL || trace->syscalls.table[id].name == NULL) && - (err = trace__read_syscall_info(trace, id)) != 0) - goto out_cant_read; - - if (trace->syscalls.table && trace->syscalls.table[id].nonexistent) - goto out_cant_read; - - return &trace->syscalls.table[id]; + sc = trace__find_syscall(trace, e_machine, id); + if (sc) + err = syscall__read_info(sc, trace); -out_cant_read: - if (verbose > 0) { + if (err && verbose > 0) { char sbuf[STRERR_BUFSIZE]; - fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, str_error_r(-err, sbuf, sizeof(sbuf))); - if (id <= trace->sctbl->syscalls.max_id && trace->syscalls.table[id].name != NULL) - fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); + + fprintf(trace->output, "Problems reading syscall %d: %d (%s)", id, -err, + str_error_r(-err, sbuf, sizeof(sbuf))); + if (sc && sc->name) + fprintf(trace->output, "(%s)", sc->name); fputs(" information\n", trace->output); } - return NULL; + return err ? NULL : sc; } struct syscall_stats { @@ -2195,24 +2607,26 @@ struct syscall_stats { }; static void thread__update_stats(struct thread *thread, struct thread_trace *ttrace, - int id, struct perf_sample *sample, long err, bool errno_summary) + int id, struct perf_sample *sample, long err, + struct trace *trace) { - struct int_node *inode; - struct syscall_stats *stats; + struct hashmap *syscall_stats = ttrace->syscall_stats; + struct syscall_stats *stats = NULL; u64 duration = 0; - inode = intlist__findnew(ttrace->syscall_stats, id); - if (inode == NULL) - return; + if (trace->summary_mode == SUMMARY__BY_TOTAL) + syscall_stats = trace->syscall_stats; - stats = inode->priv; - if (stats == NULL) { + if (!hashmap__find(syscall_stats, id, &stats)) { stats = zalloc(sizeof(*stats)); if (stats == NULL) return; init_stats(&stats->stats); - inode->priv = stats; + if (hashmap__add(syscall_stats, id, stats) < 0) { + free(stats); + return; + } } if (ttrace->entry_time && sample->time > ttrace->entry_time) @@ -2223,7 +2637,7 @@ static void thread__update_stats(struct thread *thread, struct thread_trace *ttr if (err < 0) { ++stats->nr_failures; - if (!errno_summary) + if (!trace->errno_summary) return; err = -err; @@ -2294,7 +2708,6 @@ static int trace__fprintf_sample(struct trace *trace, struct evsel *evsel, static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sample, int *augmented_args_size, int raw_augmented_args_size) { - void *augmented_args = NULL; /* * For now with BPF raw_augmented we hook into raw_syscalls:sys_enter * and there we get all 6 syscall args plus the tracepoint common fields @@ -2312,18 +2725,24 @@ static void *syscall__augmented_args(struct syscall *sc, struct perf_sample *sam int args_size = raw_augmented_args_size ?: sc->args_size; *augmented_args_size = sample->raw_size - args_size; - if (*augmented_args_size > 0) - augmented_args = sample->raw_data + args_size; + if (*augmented_args_size > 0) { + static uintptr_t argbuf[1024]; /* assuming single-threaded */ - return augmented_args; -} + if ((size_t)(*augmented_args_size) > sizeof(argbuf)) + return NULL; -static void syscall__exit(struct syscall *sc) -{ - if (!sc) - return; + /* + * The perf ring-buffer is 8-byte aligned but sample->raw_data + * is not because it's preceded by u32 size. Later, beautifier + * will use the augmented args with stricter alignments like in + * some struct. To make sure it's aligned, let's copy the args + * into a static buffer as it's single-threaded for now. + */ + memcpy(argbuf, sample->raw_data + args_size, *augmented_args_size); - zfree(&sc->arg_fmt); + return argbuf; + } + return NULL; } static int trace__sys_enter(struct trace *trace, struct evsel *evsel, @@ -2335,16 +2754,17 @@ static int trace__sys_enter(struct trace *trace, struct evsel *evsel, int printed = 0; struct thread *thread; int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; - int augmented_args_size = 0; + int augmented_args_size = 0, e_machine; void *augmented_args = NULL; - struct syscall *sc = trace__syscall_info(trace, evsel, id); + struct syscall *sc; struct thread_trace *ttrace; - if (sc == NULL) - return -1; - thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - ttrace = thread__trace(thread, trace->output); + e_machine = thread__e_machine(thread, trace->host); + sc = trace__syscall_info(trace, evsel, e_machine, id); + if (sc == NULL) + goto out_put; + ttrace = thread__trace(thread, trace); if (ttrace == NULL) goto out_put; @@ -2411,16 +2831,19 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel, struct thread_trace *ttrace; struct thread *thread; int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; - struct syscall *sc = trace__syscall_info(trace, evsel, id); + struct syscall *sc; char msg[1024]; void *args, *augmented_args = NULL; - int augmented_args_size; + int augmented_args_size, e_machine; + size_t printed = 0; - if (sc == NULL) - return -1; thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - ttrace = thread__trace(thread, trace->output); + e_machine = thread__e_machine(thread, trace->host); + sc = trace__syscall_info(trace, evsel, e_machine, id); + if (sc == NULL) + return -1; + ttrace = thread__trace(thread, trace); /* * We need to get ttrace just to make sure it is there when syscall__scnprintf_args() * and the rest of the beautifiers accessing it via struct syscall_arg touches it. @@ -2430,8 +2853,8 @@ static int trace__fprintf_sys_enter(struct trace *trace, struct evsel *evsel, args = perf_evsel__sc_tp_ptr(evsel, args, sample); augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size); - syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); - fprintf(trace->output, "%s", msg); + printed += syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); + fprintf(trace->output, "%.*s", (int)printed, msg); err = 0; out_put: thread__put(thread); @@ -2484,15 +2907,16 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel, bool duration_calculated = false; struct thread *thread; int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0; - int alignment = trace->args_alignment; - struct syscall *sc = trace__syscall_info(trace, evsel, id); + int alignment = trace->args_alignment, e_machine; + struct syscall *sc; struct thread_trace *ttrace; - if (sc == NULL) - return -1; - thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - ttrace = thread__trace(thread, trace->output); + e_machine = thread__e_machine(thread, trace->host); + sc = trace__syscall_info(trace, evsel, e_machine, id); + if (sc == NULL) + goto out_put; + ttrace = thread__trace(thread, trace); if (ttrace == NULL) goto out_put; @@ -2501,7 +2925,7 @@ static int trace__sys_exit(struct trace *trace, struct evsel *evsel, ret = perf_evsel__sc_tp_uint(evsel, ret, sample); if (trace->summary) - thread__update_stats(thread, ttrace, id, sample, ret, trace->errno_summary); + thread__update_stats(thread, ttrace, id, sample, ret, trace); if (!trace->fd_path_disabled && sc->is_open && ret >= 0 && ttrace->filename.pending_open) { trace__set_fd_pathname(thread, ret, ttrace->filename.name); @@ -2681,7 +3105,7 @@ static int trace__sched_stat_runtime(struct trace *trace, struct evsel *evsel, struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); - struct thread_trace *ttrace = thread__trace(thread, trace->output); + struct thread_trace *ttrace = thread__trace(thread, trace); if (ttrace == NULL) goto out_dump; @@ -2739,9 +3163,10 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, { char bf[2048]; size_t size = sizeof(bf); - struct tep_format_field *field = evsel->tp_format->format.fields; + const struct tep_event *tp_format = evsel__tp_format(evsel); + struct tep_format_field *field = tp_format ? tp_format->format.fields : NULL; struct syscall_arg_fmt *arg = __evsel__syscall_arg_fmt(evsel); - size_t printed = 0; + size_t printed = 0, btf_printed; unsigned long val; u8 bit = 1; struct syscall_arg syscall_arg = { @@ -2782,17 +3207,8 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, */ val = syscall_arg_fmt__mask_val(arg, &syscall_arg, val); - /* - * Suppress this argument if its value is zero and - * we don't have a string associated in an - * strarray for it. - */ - if (val == 0 && - !trace->show_zeros && - !((arg->show_zero || - arg->scnprintf == SCA_STRARRAY || - arg->scnprintf == SCA_STRARRAYS) && - arg->parm)) + /* Suppress this argument if its value is zero and show_zero property isn't set. */ + if (val == 0 && !trace->show_zeros && !arg->show_zero && arg->strtoul != STUL_BTF_TYPE) continue; printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : ""); @@ -2800,10 +3216,16 @@ static size_t trace__fprintf_tp_fields(struct trace *trace, struct evsel *evsel, if (trace->show_arg_names) printed += scnprintf(bf + printed, size - printed, "%s: ", field->name); + btf_printed = trace__btf_scnprintf(trace, &syscall_arg, bf + printed, size - printed, val, field->type); + if (btf_printed) { + printed += btf_printed; + continue; + } + printed += syscall_arg_fmt__scnprintf_val(arg, bf + printed, size - printed, &syscall_arg, val); } - return printed + fprintf(trace->output, "%s", bf); + return fprintf(trace->output, "%.*s", (int)printed, bf); } static int trace__event_handler(struct trace *trace, struct evsel *evsel, @@ -2812,13 +3234,8 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, { struct thread *thread; int callchain_ret = 0; - /* - * Check if we called perf_evsel__disable(evsel) due to, for instance, - * this event's max_events having been hit and this is an entry coming - * from the ring buffer that we should discard, since the max events - * have already been considered/printed. - */ - if (evsel->disabled) + + if (evsel->nr_events_printed >= evsel->max_events) return 0; thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); @@ -2845,7 +3262,8 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, if (evsel == trace->syscalls.events.bpf_output) { int id = perf_evsel__sc_tp_uint(evsel, id, sample); - struct syscall *sc = trace__syscall_info(trace, evsel, id); + int e_machine = thread ? thread__e_machine(thread, trace->host) : EM_HOST; + struct syscall *sc = trace__syscall_info(trace, evsel, e_machine, id); if (sc) { fprintf(trace->output, "%s(", sc->name); @@ -2865,11 +3283,13 @@ static int trace__event_handler(struct trace *trace, struct evsel *evsel, if (evsel__is_bpf_output(evsel)) { bpf_output__fprintf(trace, sample); - } else if (evsel->tp_format) { - if (strncmp(evsel->tp_format->name, "sys_enter_", 10) || - trace__fprintf_sys_enter(trace, evsel, sample)) { + } else { + const struct tep_event *tp_format = evsel__tp_format(evsel); + + if (tp_format && (strncmp(tp_format->name, "sys_enter_", 10) || + trace__fprintf_sys_enter(trace, evsel, sample))) { if (trace->libtraceevent_print) { - event_format__fprintf(evsel->tp_format, sample->cpu, + event_format__fprintf(tp_format, sample->cpu, sample->raw_data, sample->raw_size, trace->output); } else { @@ -2903,7 +3323,7 @@ static void print_location(FILE *f, struct perf_sample *sample, { if ((verbose > 0 || print_dso) && al->map) - fprintf(f, "%s@", map__dso(al->map)->long_name); + fprintf(f, "%s@", dso__long_name(map__dso(al->map))); if ((verbose > 0 || print_sym) && al->sym) fprintf(f, "%s+0x%" PRIx64, al->sym->name, @@ -2940,14 +3360,17 @@ static int trace__pgfault(struct trace *trace, } } - ttrace = thread__trace(thread, trace->output); + ttrace = thread__trace(thread, trace); if (ttrace == NULL) goto out_put; - if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ) + if (evsel->core.attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ) { ttrace->pfmaj++; - else + trace->pfmaj++; + } else { ttrace->pfmin++; + trace->pfmin++; + } if (trace->summary_only) goto out; @@ -3010,7 +3433,7 @@ static void trace__set_base_time(struct trace *trace, trace->base_time = sample->time; } -static int trace__process_sample(struct perf_tool *tool, +static int trace__process_sample(const struct perf_tool *tool, union perf_event *event, struct perf_sample *sample, struct evsel *evsel, @@ -3106,6 +3529,7 @@ out_free: } static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp); +static size_t trace__fprintf_total_summary(struct trace *trace, FILE *fp); static bool evlist__add_vfs_getname(struct evlist *evlist) { @@ -3277,6 +3701,23 @@ out_enomem: } #ifdef HAVE_BPF_SKEL +static int syscall_arg_fmt__cache_btf_struct(struct syscall_arg_fmt *arg_fmt, struct btf *btf, char *type) +{ + int id; + + if (arg_fmt->type != NULL) + return -1; + + id = btf__find_by_name(btf, type); + if (id < 0) + return -1; + + arg_fmt->type = btf__type_by_id(btf, id); + arg_fmt->type_id = id; + + return 0; +} + static struct bpf_program *trace__find_bpf_program_by_title(struct trace *trace, const char *name) { struct bpf_program *pos, *prog = NULL; @@ -3329,9 +3770,9 @@ out_unaugmented: return trace->skel->progs.syscall_unaugmented; } -static void trace__init_syscall_bpf_progs(struct trace *trace, int id) +static void trace__init_syscall_bpf_progs(struct trace *trace, int e_machine, int id) { - struct syscall *sc = trace__syscall_info(trace, NULL, id); + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, id); if (sc == NULL) return; @@ -3340,23 +3781,107 @@ static void trace__init_syscall_bpf_progs(struct trace *trace, int id) sc->bpf_prog.sys_exit = trace__find_syscall_bpf_prog(trace, sc, sc->fmt ? sc->fmt->bpf_prog_name.sys_exit : NULL, "exit"); } -static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int id) +static int trace__bpf_prog_sys_enter_fd(struct trace *trace, int e_machine, int id) { - struct syscall *sc = trace__syscall_info(trace, NULL, id); + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, id); return sc ? bpf_program__fd(sc->bpf_prog.sys_enter) : bpf_program__fd(trace->skel->progs.syscall_unaugmented); } -static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int id) +static int trace__bpf_prog_sys_exit_fd(struct trace *trace, int e_machine, int id) { - struct syscall *sc = trace__syscall_info(trace, NULL, id); + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, id); return sc ? bpf_program__fd(sc->bpf_prog.sys_exit) : bpf_program__fd(trace->skel->progs.syscall_unaugmented); } -static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, struct syscall *sc) +static int trace__bpf_sys_enter_beauty_map(struct trace *trace, int e_machine, int key, unsigned int *beauty_array) { - struct tep_format_field *field, *candidate_field; - int id; + struct tep_format_field *field; + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, key); + const struct btf_type *bt; + char *struct_offset, *tmp, name[32]; + bool can_augment = false; + int i, cnt; + + if (sc == NULL) + return -1; + + trace__load_vmlinux_btf(trace); + if (trace->btf == NULL) + return -1; + + for (i = 0, field = sc->args; field; ++i, field = field->next) { + // XXX We're only collecting pointer payloads _from_ user space + if (!sc->arg_fmt[i].from_user) + continue; + + struct_offset = strstr(field->type, "struct "); + if (struct_offset == NULL) + struct_offset = strstr(field->type, "union "); + else + struct_offset++; // "union" is shorter + + if (field->flags & TEP_FIELD_IS_POINTER && struct_offset) { /* struct or union (think BPF's attr arg) */ + struct_offset += 6; + + /* for 'struct foo *', we only want 'foo' */ + for (tmp = struct_offset, cnt = 0; *tmp != ' ' && *tmp != '\0'; ++tmp, ++cnt) { + } + strncpy(name, struct_offset, cnt); + name[cnt] = '\0'; + + /* cache struct's btf_type and type_id */ + if (syscall_arg_fmt__cache_btf_struct(&sc->arg_fmt[i], trace->btf, name)) + continue; + + bt = sc->arg_fmt[i].type; + beauty_array[i] = bt->size; + can_augment = true; + } else if (field->flags & TEP_FIELD_IS_POINTER && /* string */ + strcmp(field->type, "const char *") == 0 && + (strstr(field->name, "name") || + strstr(field->name, "path") || + strstr(field->name, "file") || + strstr(field->name, "root") || + strstr(field->name, "key") || + strstr(field->name, "special") || + strstr(field->name, "type") || + strstr(field->name, "description"))) { + beauty_array[i] = 1; + can_augment = true; + } else if (field->flags & TEP_FIELD_IS_POINTER && /* buffer */ + strstr(field->type, "char *") && + (strstr(field->name, "buf") || + strstr(field->name, "val") || + strstr(field->name, "msg"))) { + int j; + struct tep_format_field *field_tmp; + + /* find the size of the buffer that appears in pairs with buf */ + for (j = 0, field_tmp = sc->args; field_tmp; ++j, field_tmp = field_tmp->next) { + if (!(field_tmp->flags & TEP_FIELD_IS_POINTER) && /* only integers */ + (strstr(field_tmp->name, "count") || + strstr(field_tmp->name, "siz") || /* size, bufsiz */ + (strstr(field_tmp->name, "len") && strcmp(field_tmp->name, "filename")))) { + /* filename's got 'len' in it, we don't want that */ + beauty_array[i] = -(j + 1); + can_augment = true; + break; + } + } + } + } + + if (can_augment) + return 0; + + return -1; +} + +static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace, + struct syscall *sc) +{ + struct tep_format_field *field, *candidate_field; /* * We're only interested in syscalls that have a pointer: */ @@ -3368,12 +3893,13 @@ static struct bpf_program *trace__find_usable_bpf_prog_entry(struct trace *trace return NULL; try_to_find_pair: - for (id = 0; id < trace->sctbl->syscalls.nr_entries; ++id) { - struct syscall *pair = trace__syscall_info(trace, NULL, id); + for (int i = 0, num_idx = syscalltbl__num_idx(sc->e_machine); i < num_idx; ++i) { + int id = syscalltbl__id_at_idx(sc->e_machine, i); + struct syscall *pair = trace__syscall_info(trace, NULL, sc->e_machine, id); struct bpf_program *pair_prog; bool is_candidate = false; - if (pair == NULL || pair == sc || + if (pair == NULL || pair->id == sc->id || pair->bpf_prog.sys_enter == trace->skel->progs.syscall_unaugmented) continue; @@ -3444,7 +3970,8 @@ try_to_find_pair: goto next_candidate; } - pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, sc->name); + pr_debug("Reusing \"%s\" BPF sys_enter augmenter for \"%s\"\n", pair->name, + sc->name); return pair_prog; next_candidate: continue; @@ -3453,29 +3980,40 @@ try_to_find_pair: return NULL; } -static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace) +static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace, int e_machine) { int map_enter_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_enter); int map_exit_fd = bpf_map__fd(trace->skel->maps.syscalls_sys_exit); - int err = 0, key; + int beauty_map_fd = bpf_map__fd(trace->skel->maps.beauty_map_enter); + int err = 0; + unsigned int beauty_array[6]; - for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) { - int prog_fd; + for (int i = 0, num_idx = syscalltbl__num_idx(e_machine); i < num_idx; ++i) { + int prog_fd, key = syscalltbl__id_at_idx(e_machine, i); if (!trace__syscall_enabled(trace, key)) continue; - trace__init_syscall_bpf_progs(trace, key); + trace__init_syscall_bpf_progs(trace, e_machine, key); // It'll get at least the "!raw_syscalls:unaugmented" - prog_fd = trace__bpf_prog_sys_enter_fd(trace, key); + prog_fd = trace__bpf_prog_sys_enter_fd(trace, e_machine, key); err = bpf_map_update_elem(map_enter_fd, &key, &prog_fd, BPF_ANY); if (err) break; - prog_fd = trace__bpf_prog_sys_exit_fd(trace, key); + prog_fd = trace__bpf_prog_sys_exit_fd(trace, e_machine, key); err = bpf_map_update_elem(map_exit_fd, &key, &prog_fd, BPF_ANY); if (err) break; + + /* use beauty_map to tell BPF how many bytes to collect, set beauty_map's value here */ + memset(beauty_array, 0, sizeof(beauty_array)); + err = trace__bpf_sys_enter_beauty_map(trace, e_machine, key, (unsigned int *)beauty_array); + if (err) + continue; + err = bpf_map_update_elem(beauty_map_fd, &key, beauty_array, BPF_ANY); + if (err) + break; } /* @@ -3506,8 +4044,9 @@ static int trace__init_syscalls_bpf_prog_array_maps(struct trace *trace) * first and second arg (this one on the raw_syscalls:sys_exit prog * array tail call, then that one will be used. */ - for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) { - struct syscall *sc = trace__syscall_info(trace, NULL, key); + for (int i = 0, num_idx = syscalltbl__num_idx(e_machine); i < num_idx; ++i) { + int key = syscalltbl__id_at_idx(e_machine, i); + struct syscall *sc = trace__syscall_info(trace, NULL, e_machine, key); struct bpf_program *pair_prog; int prog_fd; @@ -3628,13 +4167,16 @@ static int __trace__deliver_event(struct trace *trace, union perf_event *event) { struct evlist *evlist = trace->evlist; struct perf_sample sample; - int err = evlist__parse_sample(evlist, event, &sample); + int err; + perf_sample__init(&sample, /*all=*/false); + err = evlist__parse_sample(evlist, event, &sample); if (err) fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err); else trace__handle_event(trace, event, &sample); + perf_sample__exit(&sample); return 0; } @@ -3681,22 +4223,31 @@ static int ordered_events__deliver_event(struct ordered_events *oe, return __trace__deliver_event(trace, event->event); } -static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg) +static struct syscall_arg_fmt *evsel__find_syscall_arg_fmt_by_name(struct evsel *evsel, char *arg, + char **type) { - struct tep_format_field *field; struct syscall_arg_fmt *fmt = __evsel__syscall_arg_fmt(evsel); + const struct tep_event *tp_format; + + if (!fmt) + return NULL; - if (evsel->tp_format == NULL || fmt == NULL) + tp_format = evsel__tp_format(evsel); + if (!tp_format) return NULL; - for (field = evsel->tp_format->format.fields; field; field = field->next, ++fmt) - if (strcmp(field->name, arg) == 0) + for (const struct tep_format_field *field = tp_format->format.fields; field; + field = field->next, ++fmt) { + if (strcmp(field->name, arg) == 0) { + *type = field->type; return fmt; + } + } return NULL; } -static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel *evsel) +static int trace__expand_filter(struct trace *trace, struct evsel *evsel) { char *tok, *left = evsel->filter, *new_filter = evsel->filter; @@ -3729,14 +4280,14 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel struct syscall_arg_fmt *fmt; int left_size = tok - left, right_size = right_end - right; - char arg[128]; + char arg[128], *type; while (isspace(left[left_size - 1])) --left_size; scnprintf(arg, sizeof(arg), "%.*s", left_size, left); - fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg); + fmt = evsel__find_syscall_arg_fmt_by_name(evsel, arg, &type); if (fmt == NULL) { pr_err("\"%s\" not found in \"%s\", can't set filter \"%s\"\n", arg, evsel->name, evsel->filter); @@ -3749,6 +4300,9 @@ static int trace__expand_filter(struct trace *trace __maybe_unused, struct evsel if (fmt->strtoul) { u64 val; struct syscall_arg syscall_arg = { + .trace = trace, + .fmt = fmt, + .type_name = type, .parm = fmt->parm, }; @@ -3893,6 +4447,12 @@ static int trace__run(struct trace *trace, int argc, const char **argv) goto out_delete_evlist; } + if (trace->summary_mode == SUMMARY__BY_TOTAL) { + trace->syscall_stats = alloc_syscall_stats(); + if (trace->syscall_stats == NULL) + goto out_delete_evlist; + } + evlist__config(evlist, &trace->opts, &callchain_param); if (forks) { @@ -3916,21 +4476,31 @@ static int trace__run(struct trace *trace, int argc, const char **argv) * CPU the bpf-output event's file descriptor. */ perf_cpu_map__for_each_cpu(cpu, i, trace->syscalls.events.bpf_output->core.cpus) { + int mycpu = cpu.cpu; + bpf_map__update_elem(trace->skel->maps.__augmented_syscalls__, - &cpu.cpu, sizeof(int), + &mycpu, sizeof(mycpu), xyarray__entry(trace->syscalls.events.bpf_output->core.fd, - cpu.cpu, 0), + mycpu, 0), sizeof(__u32), BPF_ANY); } } + + if (trace->skel) + trace->filter_pids.map = trace->skel->maps.pids_filtered; #endif err = trace__set_filter_pids(trace); if (err < 0) goto out_error_mem; #ifdef HAVE_BPF_SKEL - if (trace->skel && trace->skel->progs.sys_enter) - trace__init_syscalls_bpf_prog_array_maps(trace); + if (trace->skel && trace->skel->progs.sys_enter) { + /* + * TODO: Initialize for all host binary machine types, not just + * those matching the perf binary. + */ + trace__init_syscalls_bpf_prog_array_maps(trace, EM_HOST); + } #endif if (trace->ev_qualifier_ids.nr > 0) { @@ -3955,12 +4525,13 @@ static int trace__run(struct trace *trace, int argc, const char **argv) * So just disable this beautifier (SCA_FD, SCA_FDAT) when 'close' is * not in use. */ - trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(trace->sctbl, "close")); + /* TODO: support for more than just perf binary machine type close. */ + trace->fd_path_disabled = !trace__syscall_enabled(trace, syscalltbl__id(EM_HOST, "close")); err = trace__expand_filters(trace, &evsel); if (err) goto out_delete_evlist; - err = evlist__apply_filters(evlist, &evsel); + err = evlist__apply_filters(evlist, &evsel, &trace->opts.target); if (err < 0) goto out_error_apply_filters; @@ -4050,8 +4621,12 @@ out_disable: ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL); if (!err) { - if (trace->summary) - trace__fprintf_thread_summary(trace, trace->output); + if (trace->summary) { + if (trace->summary_mode == SUMMARY__BY_TOTAL) + trace__fprintf_total_summary(trace, trace->output); + else + trace__fprintf_thread_summary(trace, trace->output); + } if (trace->show_tool_stats) { fprintf(trace->output, "Stats:\n " @@ -4063,6 +4638,7 @@ out_disable: } out_delete_evlist: + delete_syscall_stats(trace->syscall_stats); trace__symbols__exit(trace); evlist__free_syscall_tp_fields(evlist); evlist__delete(evlist); @@ -4122,6 +4698,7 @@ static int trace__replay(struct trace *trace) struct evsel *evsel; int err = -1; + perf_tool__init(&trace->tool, /*ordered_events=*/true); trace->tool.sample = trace__process_sample; trace->tool.mmap = perf_event__process_mmap; trace->tool.mmap2 = perf_event__process_mmap2; @@ -4190,6 +4767,12 @@ static int trace__replay(struct trace *trace) evsel->handler = trace__pgfault; } + if (trace->summary_mode == SUMMARY__BY_TOTAL) { + trace->syscall_stats = alloc_syscall_stats(); + if (trace->syscall_stats == NULL) + goto out; + } + setup_pager(); err = perf_session__process_events(session); @@ -4200,12 +4783,13 @@ static int trace__replay(struct trace *trace) trace__fprintf_thread_summary(trace, trace->output); out: + delete_syscall_stats(trace->syscall_stats); perf_session__delete(session); return err; } -static size_t trace__fprintf_threads_header(FILE *fp) +static size_t trace__fprintf_summary_header(FILE *fp) { size_t printed; @@ -4214,29 +4798,56 @@ static size_t trace__fprintf_threads_header(FILE *fp) return printed; } -DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs, +struct syscall_entry { struct syscall_stats *stats; double msecs; int syscall; -) +}; + +static int entry_cmp(const void *e1, const void *e2) { - struct int_node *source = rb_entry(nd, struct int_node, rb_node); - struct syscall_stats *stats = source->priv; + const struct syscall_entry *entry1 = e1; + const struct syscall_entry *entry2 = e2; - entry->syscall = source->i; - entry->stats = stats; - entry->msecs = stats ? (u64)stats->stats.n * (avg_stats(&stats->stats) / NSEC_PER_MSEC) : 0; + return entry1->msecs > entry2->msecs ? -1 : 1; } -static size_t thread__dump_stats(struct thread_trace *ttrace, - struct trace *trace, FILE *fp) +static struct syscall_entry *syscall__sort_stats(struct hashmap *syscall_stats) +{ + struct syscall_entry *entry; + struct hashmap_entry *pos; + unsigned bkt, i, nr; + + nr = syscall_stats->sz; + entry = malloc(nr * sizeof(*entry)); + if (entry == NULL) + return NULL; + + i = 0; + hashmap__for_each_entry(syscall_stats, pos, bkt) { + struct syscall_stats *ss = pos->pvalue; + struct stats *st = &ss->stats; + + entry[i].stats = ss; + entry[i].msecs = (u64)st->n * (avg_stats(st) / NSEC_PER_MSEC); + entry[i].syscall = pos->key; + i++; + } + assert(i == nr); + + qsort(entry, nr, sizeof(*entry), entry_cmp); + return entry; +} + +static size_t syscall__dump_stats(struct trace *trace, int e_machine, FILE *fp, + struct hashmap *syscall_stats) { size_t printed = 0; struct syscall *sc; - struct rb_node *nd; - DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats); + struct syscall_entry *entries; - if (syscall_stats == NULL) + entries = syscall__sort_stats(syscall_stats); + if (entries == NULL) return 0; printed += fprintf(fp, "\n"); @@ -4245,8 +4856,10 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); - resort_rb__for_each_entry(nd, syscall_stats) { - struct syscall_stats *stats = syscall_stats_entry->stats; + for (size_t i = 0; i < syscall_stats->sz; i++) { + struct syscall_entry *entry = &entries[i]; + struct syscall_stats *stats = entry->stats; + if (stats) { double min = (double)(stats->stats.min) / NSEC_PER_MSEC; double max = (double)(stats->stats.max) / NSEC_PER_MSEC; @@ -4257,10 +4870,13 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, pct = avg ? 100.0 * stddev_stats(&stats->stats) / avg : 0.0; avg /= NSEC_PER_MSEC; - sc = &trace->syscalls.table[syscall_stats_entry->syscall]; + sc = trace__syscall_info(trace, /*evsel=*/NULL, e_machine, entry->syscall); + if (!sc) + continue; + printed += fprintf(fp, " %-15s", sc->name); printed += fprintf(fp, " %8" PRIu64 " %6" PRIu64 " %9.3f %9.3f %9.3f", - n, stats->nr_failures, syscall_stats_entry->msecs, min, avg); + n, stats->nr_failures, entry->msecs, min, avg); printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); if (trace->errno_summary && stats->nr_failures) { @@ -4274,16 +4890,28 @@ static size_t thread__dump_stats(struct thread_trace *ttrace, } } - resort_rb__delete(syscall_stats); + free(entries); printed += fprintf(fp, "\n\n"); return printed; } +static size_t thread__dump_stats(struct thread_trace *ttrace, + struct trace *trace, int e_machine, FILE *fp) +{ + return syscall__dump_stats(trace, e_machine, fp, ttrace->syscall_stats); +} + +static size_t system__dump_stats(struct trace *trace, int e_machine, FILE *fp) +{ + return syscall__dump_stats(trace, e_machine, fp, trace->syscall_stats); +} + static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace) { size_t printed = 0; struct thread_trace *ttrace = thread__priv(thread); + int e_machine = thread__e_machine(thread, trace->host); double ratio; if (ttrace == NULL) @@ -4303,7 +4931,7 @@ static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trac else if (fputc('\n', fp) != EOF) ++printed; - printed += thread__dump_stats(ttrace, trace, fp); + printed += thread__dump_stats(ttrace, trace, e_machine, fp); return printed; } @@ -4333,7 +4961,7 @@ static int trace_nr_events_cmp(void *priv __maybe_unused, static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) { - size_t printed = trace__fprintf_threads_header(fp); + size_t printed = trace__fprintf_summary_header(fp); LIST_HEAD(threads); if (machine__thread_list(trace->host, &threads) == 0) { @@ -4348,6 +4976,28 @@ static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) return printed; } +static size_t trace__fprintf_total_summary(struct trace *trace, FILE *fp) +{ + size_t printed = trace__fprintf_summary_header(fp); + + printed += fprintf(fp, " total, "); + printed += fprintf(fp, "%lu events", trace->nr_events); + + if (trace->pfmaj) + printed += fprintf(fp, ", %lu majfaults", trace->pfmaj); + if (trace->pfmin) + printed += fprintf(fp, ", %lu minfaults", trace->pfmin); + if (trace->sched) + printed += fprintf(fp, ", %.3f msec\n", trace->runtime_ms); + else if (fputc('\n', fp) != EOF) + ++printed; + + /* TODO: get all system e_machines. */ + printed += system__dump_stats(trace, EM_HOST, fp); + + return printed; +} + static int trace__set_duration(const struct option *opt, const char *str, int unset __maybe_unused) { @@ -4441,47 +5091,62 @@ static void evsel__set_syscall_arg_fmt(struct evsel *evsel, const char *name) const struct syscall_fmt *scfmt = syscall_fmt__find(name); if (scfmt) { - int skip = 0; + const struct tep_event *tp_format = evsel__tp_format(evsel); + + if (tp_format) { + int skip = 0; - if (strcmp(evsel->tp_format->format.fields->name, "__syscall_nr") == 0 || - strcmp(evsel->tp_format->format.fields->name, "nr") == 0) - ++skip; + if (strcmp(tp_format->format.fields->name, "__syscall_nr") == 0 || + strcmp(tp_format->format.fields->name, "nr") == 0) + ++skip; - memcpy(fmt + skip, scfmt->arg, (evsel->tp_format->format.nr_fields - skip) * sizeof(*fmt)); + memcpy(fmt + skip, scfmt->arg, + (tp_format->format.nr_fields - skip) * sizeof(*fmt)); + } } } } -static int evlist__set_syscall_tp_fields(struct evlist *evlist) +static int evlist__set_syscall_tp_fields(struct evlist *evlist, bool *use_btf) { struct evsel *evsel; evlist__for_each_entry(evlist, evsel) { - if (evsel->priv || !evsel->tp_format) + const struct tep_event *tp_format; + + if (evsel->priv) continue; - if (strcmp(evsel->tp_format->system, "syscalls")) { - evsel__init_tp_arg_scnprintf(evsel); + tp_format = evsel__tp_format(evsel); + if (!tp_format) + continue; + + if (strcmp(tp_format->system, "syscalls")) { + evsel__init_tp_arg_scnprintf(evsel, use_btf); continue; } if (evsel__init_syscall_tp(evsel)) return -1; - if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) { + if (!strncmp(tp_format->name, "sys_enter_", 10)) { struct syscall_tp *sc = __evsel__syscall_tp(evsel); if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64))) return -1; - evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_enter_") - 1); - } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) { + evsel__set_syscall_arg_fmt(evsel, + tp_format->name + sizeof("sys_enter_") - 1); + } else if (!strncmp(tp_format->name, "sys_exit_", 9)) { struct syscall_tp *sc = __evsel__syscall_tp(evsel); - if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap)) + if (__tp_field__init_uint(&sc->ret, sizeof(u64), + sc->id.offset + sizeof(u64), + evsel->needs_swap)) return -1; - evsel__set_syscall_arg_fmt(evsel, evsel->tp_format->name + sizeof("sys_exit_") - 1); + evsel__set_syscall_arg_fmt(evsel, + tp_format->name + sizeof("sys_exit_") - 1); } } @@ -4520,8 +5185,9 @@ static int trace__parse_events_option(const struct option *opt, const char *str, *sep = '\0'; list = 0; - if (syscalltbl__id(trace->sctbl, s) >= 0 || - syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { + /* TODO: support for more than just perf binary machine type syscalls. */ + if (syscalltbl__id(EM_HOST, s) >= 0 || + syscalltbl__strglobmatch_first(EM_HOST, s, &idx) >= 0) { list = 1; goto do_concat; } @@ -4604,6 +5270,23 @@ static int trace__parse_cgroups(const struct option *opt, const char *str, int u return 0; } +static int trace__parse_summary_mode(const struct option *opt, const char *str, + int unset __maybe_unused) +{ + struct trace *trace = opt->value; + + if (!strcmp(str, "thread")) { + trace->summary_mode = SUMMARY__BY_THREAD; + } else if (!strcmp(str, "total")) { + trace->summary_mode = SUMMARY__BY_TOTAL; + } else { + pr_err("Unknown summary mode: %s\n", str); + return -1; + } + + return 0; +} + static int trace__config(const char *var, const char *value, void *arg) { struct trace *trace = arg; @@ -4650,17 +5333,20 @@ out: static void trace__exit(struct trace *trace) { - int i; - strlist__delete(trace->ev_qualifier); zfree(&trace->ev_qualifier_ids.entries); if (trace->syscalls.table) { - for (i = 0; i <= trace->sctbl->syscalls.max_id; i++) - syscall__exit(&trace->syscalls.table[i]); + for (size_t i = 0; i < trace->syscalls.table_size; i++) + syscall__delete(trace->syscalls.table[i]); zfree(&trace->syscalls.table); } - syscalltbl__delete(trace->sctbl); zfree(&trace->perfconfig_events); + evlist__delete(trace->evlist); + trace->evlist = NULL; +#ifdef HAVE_LIBBPF_SUPPORT + btf__free(trace->btf); + trace->btf = NULL; +#endif } #ifdef HAVE_BPF_SKEL @@ -4751,6 +5437,9 @@ int cmd_trace(int argc, const char **argv) "Show all syscalls and summary with statistics"), OPT_BOOLEAN(0, "errno-summary", &trace.errno_summary, "Show errno stats per syscall, use with -s or -S"), + OPT_CALLBACK(0, "summary-mode", &trace, "mode", + "How to show summary: select thread (default) or total", + trace__parse_summary_mode), OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min", "Trace pagefaults", parse_pagefaults, "maj"), OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"), @@ -4782,6 +5471,8 @@ int cmd_trace(int argc, const char **argv) OPT_INTEGER('D', "delay", &trace.opts.target.initial_delay, "ms to wait before starting measurement after program " "start"), + OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer" + "to customized ones"), OPTS_EVSWITCH(&trace.evswitch), OPT_END() }; @@ -4803,9 +5494,8 @@ int cmd_trace(int argc, const char **argv) sigaction(SIGCHLD, &sigchld_act, NULL); trace.evlist = evlist__new(); - trace.sctbl = syscalltbl__new(); - if (trace.evlist == NULL || trace.sctbl == NULL) { + if (trace.evlist == NULL) { pr_err("Not enough memory to run!\n"); err = -ENOMEM; goto out; @@ -4869,6 +5559,11 @@ int cmd_trace(int argc, const char **argv) if (!trace.trace_syscalls) goto skip_augmentation; + if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) { + pr_debug("Syscall augmentation fails with record, disabling augmentation"); + goto skip_augmentation; + } + trace.skel = augmented_raw_syscalls_bpf__open(); if (!trace.skel) { pr_debug("Failed to open augmented syscalls BPF skeleton"); @@ -4902,7 +5597,7 @@ int cmd_trace(int argc, const char **argv) goto out; } trace.syscalls.events.bpf_output = evlist__last(trace.evlist); - assert(!strcmp(evsel__name(trace.syscalls.events.bpf_output), "__augmented_syscalls__")); + assert(evsel__name_is(trace.syscalls.events.bpf_output, "__augmented_syscalls__")); skip_augmentation: #endif err = -1; @@ -4934,11 +5629,16 @@ skip_augmentation: } if (trace.evlist->core.nr_entries > 0) { + bool use_btf = false; + evlist__set_default_evsel_handler(trace.evlist, trace__event_handler); - if (evlist__set_syscall_tp_fields(trace.evlist)) { + if (evlist__set_syscall_tp_fields(trace.evlist, &use_btf)) { perror("failed to set syscalls:* tracepoint fields"); goto out; } + + if (use_btf) + trace__load_vmlinux_btf(&trace); } if (trace.sort_events) { @@ -4959,7 +5659,7 @@ skip_augmentation: */ if (trace.syscalls.events.bpf_output) { evlist__for_each_entry(trace.evlist, evsel) { - bool raw_syscalls_sys_exit = strcmp(evsel__name(evsel), "raw_syscalls:sys_exit") == 0; + bool raw_syscalls_sys_exit = evsel__name_is(evsel, "raw_syscalls:sys_exit"); if (raw_syscalls_sys_exit) { trace.raw_augmented_syscalls = true; @@ -5023,8 +5723,10 @@ init_augmented_syscall_tp: } } - if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) - return trace__record(&trace, argc-1, &argv[1]); + if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) { + err = trace__record(&trace, argc-1, &argv[1]); + goto out; + } /* Using just --errno-summary will trigger --summary */ if (trace.errno_summary && !trace.summary && !trace.summary_only) @@ -5034,6 +5736,13 @@ init_augmented_syscall_tp: if (trace.summary_only) trace.summary = trace.summary_only; + /* Keep exited threads, otherwise information might be lost for summary */ + if (trace.summary) { + symbol_conf.keep_exited_threads = true; + if (trace.summary_mode == SUMMARY__NONE) + trace.summary_mode = SUMMARY__BY_THREAD; + } + if (output_name != NULL) { err = trace__open_output(&trace, output_name); if (err < 0) { |