Diffstat (limited to 'tools/perf/util/bpf_skel')
-rw-r--r--  tools/perf/util/bpf_skel/func_latency.bpf.c    |  28
-rw-r--r--  tools/perf/util/bpf_skel/kwork_trace.bpf.c     |   2
-rw-r--r--  tools/perf/util/bpf_skel/lock_contention.bpf.c | 352
-rw-r--r--  tools/perf/util/bpf_skel/lock_data.h           |   8
-rw-r--r--  tools/perf/util/bpf_skel/off_cpu.bpf.c         |  98
-rw-r--r--  tools/perf/util/bpf_skel/syscall_summary.bpf.c | 153
-rw-r--r--  tools/perf/util/bpf_skel/syscall_summary.h     |  27
-rw-r--r--  tools/perf/util/bpf_skel/vmlinux/vmlinux.h     |   9
8 files changed, 646 insertions, 31 deletions
diff --git a/tools/perf/util/bpf_skel/func_latency.bpf.c b/tools/perf/util/bpf_skel/func_latency.bpf.c
index fb144811b34f..e731a79a753a 100644
--- a/tools/perf/util/bpf_skel/func_latency.bpf.c
+++ b/tools/perf/util/bpf_skel/func_latency.bpf.c
@@ -50,6 +50,7 @@ const volatile int use_nsec = 0;
 const volatile unsigned int bucket_range;
 const volatile unsigned int min_latency;
 const volatile unsigned int max_latency;
+const volatile unsigned int bucket_num = NUM_BUCKET;
 
 SEC("kprobe/func")
 int BPF_PROG(func_begin)
@@ -101,6 +102,7 @@ int BPF_PROG(func_end)
 	start = bpf_map_lookup_elem(&functime, &tid);
 	if (start) {
 		__s64 delta = bpf_ktime_get_ns() - *start;
+		__u64 val = delta;
 		__u32 key = 0;
 		__u64 *hist;
 
@@ -110,30 +112,27 @@ int BPF_PROG(func_end)
 			return 0;
 
 		if (bucket_range != 0) {
-			delta /= cmp_base;
+			val = delta / cmp_base;
 
 			if (min_latency > 0) {
-				if (delta > min_latency)
-					delta -= min_latency;
+				if (val > min_latency)
+					val -= min_latency;
 				else
 					goto do_lookup;
 			}
 
 			// Less than 1 unit (ms or ns), or, in the future,
 			// than the min latency desired.
-			if (delta > 0) { // 1st entry: [ 1 unit .. bucket_range units )
-				// clang 12 doesn't like s64 / u32 division
-				key = (__u64)delta / bucket_range + 1;
-				if (key >= NUM_BUCKET ||
-				    delta >= max_latency - min_latency)
-					key = NUM_BUCKET - 1;
+			if (val > 0) { // 1st entry: [ 1 unit .. bucket_range units )
+				key = val / bucket_range + 1;
+				if (key >= bucket_num)
+					key = bucket_num - 1;
 			}
 
-			delta += min_latency;
 			goto do_lookup;
 		}
 		// calculate index using delta
-		for (key = 0; key < (NUM_BUCKET - 1); key++) {
+		for (key = 0; key < (bucket_num - 1); key++) {
 			if (delta < (cmp_base << key))
 				break;
 		}
@@ -143,12 +142,9 @@ do_lookup:
 		if (!hist)
 			return 0;
 
-		*hist += 1;
+		__sync_fetch_and_add(hist, 1);
 
-		if (bucket_range == 0)
-			delta /= cmp_base;
-
-		__sync_fetch_and_add(&total, delta);
+		__sync_fetch_and_add(&total, delta); // always in nsec
 		__sync_fetch_and_add(&count, 1);
 
 		if (delta > max)
diff --git a/tools/perf/util/bpf_skel/kwork_trace.bpf.c b/tools/perf/util/bpf_skel/kwork_trace.bpf.c
index cbd79bc4b330..9ce9c8dddc4b 100644
--- a/tools/perf/util/bpf_skel/kwork_trace.bpf.c
+++ b/tools/perf/util/bpf_skel/kwork_trace.bpf.c
@@ -80,7 +80,7 @@ static __always_inline int local_strncmp(const char *s1,
 
 	for (i = 0; i < sz; i++) {
 		ret = (unsigned char)s1[i] - (unsigned char)s2[i];
-		if (ret || !s1[i] || !s2[i])
+		if (ret || !s1[i])
 			break;
 	}
 
diff --git a/tools/perf/util/bpf_skel/lock_contention.bpf.c b/tools/perf/util/bpf_skel/lock_contention.bpf.c
index 6533ea9b044c..96e7d853b9ed 100644
--- a/tools/perf/util/bpf_skel/lock_contention.bpf.c
+++ b/tools/perf/util/bpf_skel/lock_contention.bpf.c
@@ -11,6 +11,12 @@
 /* for collect_lock_syms(). 4096 was rejected by the verifier */
 #define MAX_CPUS 1024
 
+/* for collect_zone_lock(). It should be more than the actual zones. */
+#define MAX_ZONES 10
+
+/* for do_lock_delay(). Arbitrarily set to 1 million. */
+#define MAX_LOOP (1U << 20)
+
 /* lock contention flags from include/trace/events/lock.h */
 #define LCB_F_SPIN (1U << 0)
 #define LCB_F_READ (1U << 1)
@@ -27,6 +33,38 @@ struct {
 	__uint(max_entries, MAX_ENTRIES);
 } stacks SEC(".maps");
 
+/* buffer for owner stacktrace */
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, 1);
+} stack_buf SEC(".maps");
+
+/* a map for tracing owner stacktrace to owner stack id */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64)); // owner stacktrace
+	__uint(value_size, sizeof(__s32)); // owner stack id
+	__uint(max_entries, 1);
+} owner_stacks SEC(".maps");
+
+/* a map for tracing lock address to owner data */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64)); // lock address
+	__uint(value_size, sizeof(struct owner_tracing_data));
+	__uint(max_entries, 1);
+} owner_data SEC(".maps");
+
+/* a map for contention_key (stores owner stack id) to contention data */
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(struct contention_key));
+	__uint(value_size, sizeof(struct contention_data));
+	__uint(max_entries, 1);
+} owner_stat SEC(".maps");
+
 /* maintain timestamp at the beginning of contention */
 struct {
 	__uint(type, BPF_MAP_TYPE_HASH);
@@ -114,6 +152,13 @@ struct {
 	__uint(max_entries, 1);
 } slab_caches SEC(".maps");
 
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(key_size, sizeof(__u64));
+	__uint(value_size, sizeof(__u64));
+	__uint(max_entries, 1);
+} lock_delays SEC(".maps");
+
 struct rw_semaphore___old {
 	struct task_struct *owner;
 } __attribute__((preserve_access_index));
@@ -143,6 +188,8 @@ const volatile int needs_callstack;
 const volatile int stack_skip;
 const volatile int lock_owner;
 const volatile int use_cgroup_v2;
+const volatile int max_stack;
+const volatile int lock_delay;
 
 /* determine the key of lock stat */
 const volatile int aggr_mode;
@@ -164,6 +211,9 @@ int data_fail;
 int task_map_full;
 int data_map_full;
 
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym __weak;
+void bpf_task_release(struct task_struct *p) __ksym __weak;
+
 static inline __u64 get_current_cgroup_id(void)
 {
 	struct task_struct *task;
@@ -348,6 +398,35 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
 	return 0;
 }
 
+static inline long delay_callback(__u64 idx, void *arg)
+{
+	__u64 target = *(__u64 *)arg;
+
+	if (target <= bpf_ktime_get_ns())
+		return 1;
+
+	/* just to kill time */
+	(void)bpf_get_prandom_u32();
+
+	return 0;
+}
+
+static inline void do_lock_delay(__u64 duration)
+{
+	__u64 target = bpf_ktime_get_ns() + duration;
+
+	bpf_loop(MAX_LOOP, delay_callback, &target, /*flags=*/0);
+}
+
+static inline void check_lock_delay(__u64 lock)
+{
+	__u64 *delay;
+
+	delay = bpf_map_lookup_elem(&lock_delays, &lock);
+	if (delay)
+		do_lock_delay(*delay);
+}
+
 static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
 {
 	__u32 pid;
@@ -387,6 +466,61 @@ static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
 	return pelem;
 }
 
+static inline s32 get_owner_stack_id(u64 *stacktrace)
+{
+	s32 *id, new_id;
+	static s64 id_gen = 1;
+
+	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
+	if (id)
+		return *id;
+
+	new_id = (s32)__sync_fetch_and_add(&id_gen, 1);
+
+	bpf_map_update_elem(&owner_stacks, stacktrace, &new_id, BPF_NOEXIST);
+
+	id = bpf_map_lookup_elem(&owner_stacks, stacktrace);
+	if (id)
+		return *id;
+
+	return -1;
+}
+
+static inline void update_contention_data(struct contention_data *data, u64 duration, u32 count)
+{
+	__sync_fetch_and_add(&data->total_time, duration);
+	__sync_fetch_and_add(&data->count, count);
+
+	/* FIXME: need atomic operations */
+	if (data->max_time < duration)
+		data->max_time = duration;
+	if (data->min_time > duration)
+		data->min_time = duration;
+}
+
+static inline void update_owner_stat(u32 id, u64 duration, u32 flags)
+{
+	struct contention_key key = {
+		.stack_id = id,
+		.pid = 0,
+		.lock_addr_or_cgroup = 0,
+	};
+	struct contention_data *data = bpf_map_lookup_elem(&owner_stat, &key);
+
+	if (!data) {
+		struct contention_data first = {
+			.total_time = duration,
+			.max_time = duration,
+			.min_time = duration,
+			.count = 1,
+			.flags = flags,
+		};
+		bpf_map_update_elem(&owner_stat, &key, &first, BPF_NOEXIST);
+	} else {
+		update_contention_data(data, duration, 1);
+	}
+}
+
 SEC("tp_btf/contention_begin")
 int contention_begin(u64 *ctx)
 {
@@ -404,6 +538,72 @@ int contention_begin(u64 *ctx)
 	pelem->flags = (__u32)ctx[1];
 
 	if (needs_callstack) {
+		u32 i = 0;
+		u32 id = 0;
+		int owner_pid;
+		u64 *buf;
+		struct task_struct *task;
+		struct owner_tracing_data *otdata;
+
+		if (!lock_owner)
+			goto skip_owner;
+
+		task = get_lock_owner(pelem->lock, pelem->flags);
+		if (!task)
+			goto skip_owner;
+
+		owner_pid = BPF_CORE_READ(task, pid);
+
+		buf = bpf_map_lookup_elem(&stack_buf, &i);
+		if (!buf)
+			goto skip_owner;
+		for (i = 0; i < max_stack; i++)
+			buf[i] = 0x0;
+
+		if (!bpf_task_from_pid)
+			goto skip_owner;
+
+		task = bpf_task_from_pid(owner_pid);
+		if (!task)
+			goto skip_owner;
+
+		bpf_get_task_stack(task, buf, max_stack * sizeof(unsigned long), 0);
+		bpf_task_release(task);
+
+		otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
+		id = get_owner_stack_id(buf);
+
+		/*
+		 * Contention just happens, or corner case `lock` is owned by process not
+		 * `owner_pid`. For the corner case we treat it as unexpected internal error and
+		 * just ignore the previous tracing record.
+		 */
+		if (!otdata || otdata->pid != owner_pid) {
+			struct owner_tracing_data first = {
+				.pid = owner_pid,
+				.timestamp = pelem->timestamp,
+				.count = 1,
+				.stack_id = id,
+			};
+			bpf_map_update_elem(&owner_data, &pelem->lock, &first, BPF_ANY);
+		}
+		/* Contention is ongoing and new waiter joins */
+		else {
+			__sync_fetch_and_add(&otdata->count, 1);
+
+			/*
+			 * The owner is the same, but the stacktrace might have changed. In this case we
+			 * store/update `owner_stat` based on the current owner stack id.
+			 */
+			if (id != otdata->stack_id) {
+				update_owner_stat(id, pelem->timestamp - otdata->timestamp,
+						  pelem->flags);
+
+				otdata->timestamp = pelem->timestamp;
+				otdata->stack_id = id;
+			}
+		}
+skip_owner:
 		pelem->stack_id = bpf_get_stackid(ctx, &stacks,
 						  BPF_F_FAST_STACK_CMP | stack_skip);
 		if (pelem->stack_id < 0)
@@ -440,6 +640,7 @@ int contention_end(u64 *ctx)
 	struct tstamp_data *pelem;
 	struct contention_key key = {};
 	struct contention_data *data;
+	__u64 timestamp;
 	__u64 duration;
 	bool need_delete = false;
 
@@ -467,12 +668,88 @@ int contention_end(u64 *ctx)
 		need_delete = true;
 	}
 
-	duration = bpf_ktime_get_ns() - pelem->timestamp;
+	timestamp = bpf_ktime_get_ns();
+	duration = timestamp - pelem->timestamp;
 	if ((__s64)duration < 0) {
 		__sync_fetch_and_add(&time_fail, 1);
 		goto out;
 	}
 
+	if (needs_callstack && lock_owner) {
+		struct owner_tracing_data *otdata = bpf_map_lookup_elem(&owner_data, &pelem->lock);
+
+		if (!otdata)
+			goto skip_owner;
+
+		/* Update `owner_stat` */
+		update_owner_stat(otdata->stack_id, timestamp - otdata->timestamp, pelem->flags);
+
+		/* No contention is occurring, delete `lock` entry in `owner_data` */
+		if (otdata->count <= 1)
+			bpf_map_delete_elem(&owner_data, &pelem->lock);
+		/*
+		 * Contention is still ongoing, with a new owner (current task). `owner_data`
+		 * should be updated accordingly.
+		 */
+		else {
+			u32 i = 0;
+			s32 ret = (s32)ctx[1];
+			u64 *buf;
+
+			otdata->timestamp = timestamp;
+			__sync_fetch_and_add(&otdata->count, -1);
+
+			buf = bpf_map_lookup_elem(&stack_buf, &i);
+			if (!buf)
+				goto skip_owner;
+			for (i = 0; i < (u32)max_stack; i++)
+				buf[i] = 0x0;
+
+			/*
+			 * `ret` has the return code of the lock function.
+			 * If `ret` is negative, the current task terminates lock waiting without
+			 * acquiring it. Owner is not changed, but we still need to update the owner
+			 * stack.
+			 */
+			if (ret < 0) {
+				s32 id = 0;
+				struct task_struct *task;
+
+				if (!bpf_task_from_pid)
+					goto skip_owner;
+
+				task = bpf_task_from_pid(otdata->pid);
+				if (!task)
+					goto skip_owner;
+
+				bpf_get_task_stack(task, buf,
						   max_stack * sizeof(unsigned long), 0);
+				bpf_task_release(task);
+
+				id = get_owner_stack_id(buf);
+
+				/*
+				 * If owner stack is changed, update owner stack id for this lock.
+				 */
+				if (id != otdata->stack_id)
+					otdata->stack_id = id;
+			}
+			/*
+			 * Otherwise, update tracing data with the current task, which is the new
+			 * owner.
+			 */
+			else {
+				otdata->pid = pid;
+				/*
+				 * We don't want to retrieve the callstack here, since it is where the
+				 * current task acquires the lock and provides no additional
+				 * information. We simply assign -1 to invalidate it.
+				 */
+				otdata->stack_id = -1;
+			}
+		}
+	}
+skip_owner:
 	switch (aggr_mode) {
 	case LOCK_AGGR_CALLER:
 		key.stack_id = pelem->stack_id;
@@ -556,16 +833,12 @@ int contention_end(u64 *ctx)
 	}
 
 found:
-	__sync_fetch_and_add(&data->total_time, duration);
-	__sync_fetch_and_add(&data->count, 1);
-
-	/* FIXME: need atomic operations */
-	if (data->max_time < duration)
-		data->max_time = duration;
-	if (data->min_time > duration)
-		data->min_time = duration;
+	update_contention_data(data, duration, 1);
 
 out:
+	if (lock_delay)
+		check_lock_delay(pelem->lock);
+
 	pelem->lock = 0;
 	if (need_delete)
 		bpf_map_delete_elem(&tstamp, &pid);
@@ -574,6 +847,11 @@ out:
 
 extern struct rq runqueues __ksym;
 
+const volatile __u64 contig_page_data_addr;
+const volatile __u64 node_data_addr;
+const volatile int nr_nodes;
+const volatile int sizeof_zone;
+
 struct rq___old {
 	raw_spinlock_t lock;
 } __attribute__((preserve_access_index));
@@ -582,6 +860,59 @@ struct rq___new {
 	raw_spinlock_t __lock;
 } __attribute__((preserve_access_index));
 
+static void collect_zone_lock(void)
+{
+	__u64 nr_zones, zone_off;
+	__u64 lock_addr, lock_off;
+	__u32 lock_flag = LOCK_CLASS_ZONE_LOCK;
+
+	zone_off = offsetof(struct pglist_data, node_zones);
+	lock_off = offsetof(struct zone, lock);
+
+	if (contig_page_data_addr) {
+		struct pglist_data *contig_page_data;
+
+		contig_page_data = (void *)(long)contig_page_data_addr;
+		nr_zones = BPF_CORE_READ(contig_page_data, nr_zones);
+
+		for (int i = 0; i < MAX_ZONES; i++) {
+			__u64 zone_addr;
+
+			if (i >= nr_zones)
+				break;
+
+			zone_addr = contig_page_data_addr + (sizeof_zone * i) + zone_off;
+			lock_addr = zone_addr + lock_off;
+
+			bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+		}
+	} else if (nr_nodes > 0) {
+		struct pglist_data **node_data = (void *)(long)node_data_addr;
+
+		for (int i = 0; i < nr_nodes; i++) {
+			struct pglist_data *pgdat = NULL;
+			int err;
+
+			err = bpf_core_read(&pgdat, sizeof(pgdat), &node_data[i]);
+			if (err < 0 || pgdat == NULL)
+				break;
+
+			nr_zones = BPF_CORE_READ(pgdat, nr_zones);
+			for (int k = 0; k < MAX_ZONES; k++) {
+				__u64 zone_addr;
+
+				if (k >= nr_zones)
+					break;
+
+				zone_addr = (__u64)(void *)pgdat + (sizeof_zone * k) + zone_off;
+				lock_addr = zone_addr + lock_off;
+
+				bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
+			}
+		}
+	}
+}
+
 SEC("raw_tp/bpf_test_finish")
 int BPF_PROG(collect_lock_syms)
 {
@@ -603,6 +934,9 @@ int BPF_PROG(collect_lock_syms)
 		lock_flag = LOCK_CLASS_RQLOCK;
 		bpf_map_update_elem(&lock_syms, &lock_addr, &lock_flag, BPF_ANY);
 	}
+
+	collect_zone_lock();
+
 	return 0;
 }
 
diff --git a/tools/perf/util/bpf_skel/lock_data.h b/tools/perf/util/bpf_skel/lock_data.h
index c15f734d7fc4..28c5e5aced7f 100644
--- a/tools/perf/util/bpf_skel/lock_data.h
+++ b/tools/perf/util/bpf_skel/lock_data.h
@@ -3,6 +3,13 @@
 #ifndef UTIL_BPF_SKEL_LOCK_DATA_H
 #define UTIL_BPF_SKEL_LOCK_DATA_H
 
+struct owner_tracing_data {
+	u32 pid;	// Who has the lock.
+	u32 count;	// How many waiters for this lock.
+	u64 timestamp;	// The time when the owner acquired the lock while contention is going on.
+	s32 stack_id;	// Identifier for `owner_stat`, which is stored as the value in `owner_stacks`.
+};
+
 struct tstamp_data {
 	u64 timestamp;
 	u64 lock;
@@ -60,6 +67,7 @@ enum lock_aggr_mode {
 enum lock_class_sym {
 	LOCK_CLASS_NONE,
 	LOCK_CLASS_RQLOCK,
+	LOCK_CLASS_ZONE_LOCK,
 };
 
 struct slab_cache_data {
diff --git a/tools/perf/util/bpf_skel/off_cpu.bpf.c b/tools/perf/util/bpf_skel/off_cpu.bpf.c
index c152116df72f..72763bb8d1de 100644
--- a/tools/perf/util/bpf_skel/off_cpu.bpf.c
+++ b/tools/perf/util/bpf_skel/off_cpu.bpf.c
@@ -18,10 +18,19 @@
 #define MAX_STACKS 32
 #define MAX_ENTRIES 102400
 
+#define MAX_CPUS 4096
+#define MAX_OFFCPU_LEN 37
+
+// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1
+struct __stack {
+	u64 array[MAX_STACKS];
+};
+
 struct tstamp_data {
 	__u32 stack_id;
 	__u32 state;
 	__u64 timestamp;
+	struct __stack stack;
 };
 
 struct offcpu_key {
@@ -39,6 +48,24 @@ struct {
 	__uint(max_entries, MAX_ENTRIES);
 } stacks SEC(".maps");
 
+struct offcpu_data {
+	u64 array[MAX_OFFCPU_LEN];
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+	__uint(key_size, sizeof(int));
+	__uint(value_size, sizeof(int));
+	__uint(max_entries, MAX_CPUS);
+} offcpu_output SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(struct offcpu_data));
+	__uint(max_entries, 1);
+} offcpu_payload SEC(".maps");
+
 struct {
 	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
 	__uint(map_flags, BPF_F_NO_PREALLOC);
@@ -97,6 +124,8 @@ const volatile bool uses_cgroup_v1 = false;
 
 int perf_subsys_id = -1;
 
+__u64 offcpu_thresh_ns;
+
 /*
  * Old kernel used to call it task_struct->state and now it's '__state'.
  * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
@@ -183,6 +212,47 @@ static inline int can_record(struct task_struct *t, int state)
 	return 1;
 }
 
+static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n)
+{
+	int len = 0;
+
+	for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
+		to->array[n + 2 + i] = from->array[i];
+
+	return len;
+}
+
+/**
+ * off_cpu_dump - dump off-cpu samples to ring buffer
+ * @data: payload for dumping off-cpu samples
+ * @key: off-cpu data
+ * @stack: stack trace of the task before being scheduled out
+ *
+ * If the threshold of off-cpu time is reached, acquire tid, period, callchain, and cgroup id
+ * information of the task, and dump it as a raw sample to the perf ring buffer.
+ */
+static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
+			struct __stack *stack, __u64 delta)
+{
+	int n = 0, len = 0;
+
+	data->array[n++] = (u64)key->tgid << 32 | key->pid;
+	data->array[n++] = delta;
+
+	/* data->array[n] is callchain->nr (updated later) */
+	data->array[n + 1] = PERF_CONTEXT_USER;
+	data->array[n + 2] = 0;
+	len = copy_stack(stack, data, n);
+
+	/* update length of callchain */
+	data->array[n] = len + 1;
+	n += len + 2;
+
+	data->array[n++] = key->cgroup_id;
+
+	return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64));
+}
+
 static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
 			struct task_struct *next, int state)
 {
@@ -207,6 +277,16 @@ static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
 	pelem->state = state;
 	pelem->stack_id = stack_id;
 
+	/*
+	 * If stacks are successfully collected by bpf_get_stackid(), collect them once more
+	 * in task_storage for direct off-cpu sample dumping.
+	 */
+	if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) {
+		/*
+		 * This empty if block is used to avoid the 'result unused' warning from bpf_get_stack().
+		 * If the collection fails, continue with the logic for the next task.
+		 */
+	}
 next:
 	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
 
@@ -221,11 +301,19 @@ next:
 		__u64 delta = ts - pelem->timestamp;
 		__u64 *total;
 
-		total = bpf_map_lookup_elem(&off_cpu, &key);
-		if (total)
-			*total += delta;
-		else
-			bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
+		if (delta >= offcpu_thresh_ns) {
+			int zero = 0;
+			struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);
+
+			if (data)
+				off_cpu_dump(ctx, data, &key, &pelem->stack, delta);
+		} else {
+			total = bpf_map_lookup_elem(&off_cpu, &key);
+			if (total)
+				*total += delta;
+			else
+				bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
+		}
 
 		/* prevent to reuse the timestamp later */
 		pelem->timestamp = 0;
diff --git a/tools/perf/util/bpf_skel/syscall_summary.bpf.c b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
new file mode 100644
index 000000000000..1bcd066a5199
--- /dev/null
+++ b/tools/perf/util/bpf_skel/syscall_summary.bpf.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace raw_syscalls tracepoints to collect system call statistics.
+ */
+
+#include "vmlinux.h"
+#include "syscall_summary.h"
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+/* This is to calculate a delta between sys-enter and sys-exit for each thread */
+struct syscall_trace {
+	int nr; /* syscall number is only available at sys-enter */
+	int unused;
+	u64 timestamp;
+};
+
+#define MAX_ENTRIES (128 * 1024)
+
+struct syscall_trace_map {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, int); /* tid */
+	__type(value, struct syscall_trace);
+	__uint(max_entries, MAX_ENTRIES);
+} syscall_trace_map SEC(".maps");
+
+struct syscall_stats_map {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__type(key, struct syscall_key);
+	__type(value, struct syscall_stats);
+	__uint(max_entries, MAX_ENTRIES);
+} syscall_stats_map SEC(".maps");
+
+int enabled; /* controlled from userspace */
+
+const volatile enum syscall_aggr_mode aggr_mode;
+const volatile int use_cgroup_v2;
+
+int perf_subsys_id = -1;
+
+static inline __u64 get_current_cgroup_id(void)
+{
+	struct task_struct *task;
+	struct cgroup *cgrp;
+
+	if (use_cgroup_v2)
+		return bpf_get_current_cgroup_id();
+
+	task = bpf_get_current_task_btf();
+
+	if (perf_subsys_id == -1) {
+#if __has_builtin(__builtin_preserve_enum_value)
+		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
+						     perf_event_cgrp_id);
+#else
+		perf_subsys_id = perf_event_cgrp_id;
+#endif
+	}
+
+	cgrp = BPF_CORE_READ(task, cgroups, subsys[perf_subsys_id], cgroup);
+	return BPF_CORE_READ(cgrp, kn, id);
+}
+
+static void update_stats(int cpu_or_tid, u64 cgroup_id, int nr, s64 duration,
+			 long ret)
+{
+	struct syscall_key key = {
+		.cpu_or_tid = cpu_or_tid,
+		.cgroup = cgroup_id,
+		.nr = nr,
+	};
+	struct syscall_stats *stats;
+
+	stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
+	if (stats == NULL) {
+		struct syscall_stats zero = {};
+
+		bpf_map_update_elem(&syscall_stats_map, &key, &zero, BPF_NOEXIST);
+		stats = bpf_map_lookup_elem(&syscall_stats_map, &key);
+		if (stats == NULL)
+			return;
+	}
+
+	__sync_fetch_and_add(&stats->count, 1);
+	if (ret < 0)
+		__sync_fetch_and_add(&stats->error, 1);
+
+	if (duration > 0) {
+		__sync_fetch_and_add(&stats->total_time, duration);
+		__sync_fetch_and_add(&stats->squared_sum, duration * duration);
+		if (stats->max_time < duration)
+			stats->max_time = duration;
+		if (stats->min_time > duration || stats->min_time == 0)
+			stats->min_time = duration;
+	}
+
+	return;
+}
+
+SEC("tp_btf/sys_enter")
+int sys_enter(u64 *ctx)
+{
+	int tid;
+	struct syscall_trace st;
+
+	if (!enabled)
+		return 0;
+
+	st.nr = ctx[1]; /* syscall number */
+	st.unused = 0;
+	st.timestamp = bpf_ktime_get_ns();
+
+	tid = bpf_get_current_pid_tgid();
+	bpf_map_update_elem(&syscall_trace_map, &tid, &st, BPF_ANY);
+
+	return 0;
+}
+
+SEC("tp_btf/sys_exit")
+int sys_exit(u64 *ctx)
+{
+	int tid;
+	int key = 0;
+	u64 cgroup = 0;
+	long ret = ctx[1]; /* return value of the syscall */
+	struct syscall_trace *st;
+	s64 delta;
+
+	if (!enabled)
+		return 0;
+
+	tid = bpf_get_current_pid_tgid();
+	st = bpf_map_lookup_elem(&syscall_trace_map, &tid);
+	if (st == NULL)
+		return 0;
+
+	if (aggr_mode == SYSCALL_AGGR_THREAD)
+		key = tid;
+	else if (aggr_mode == SYSCALL_AGGR_CGROUP)
+		cgroup = get_current_cgroup_id();
+	else
+		key = bpf_get_smp_processor_id();
+
+	delta = bpf_ktime_get_ns() - st->timestamp;
+	update_stats(key, cgroup, st->nr, delta, ret);
+
+	bpf_map_delete_elem(&syscall_trace_map, &tid);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/perf/util/bpf_skel/syscall_summary.h b/tools/perf/util/bpf_skel/syscall_summary.h
new file mode 100644
index 000000000000..72ccccb45925
--- /dev/null
+++ b/tools/perf/util/bpf_skel/syscall_summary.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Data structures shared between BPF and tools. */
+#ifndef UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
+#define UTIL_BPF_SKEL_SYSCALL_SUMMARY_H
+
+enum syscall_aggr_mode {
+	SYSCALL_AGGR_THREAD,
+	SYSCALL_AGGR_CPU,
+	SYSCALL_AGGR_CGROUP,
+};
+
+struct syscall_key {
+	u64 cgroup;
+	int cpu_or_tid;
+	int nr;
+};
+
+struct syscall_stats {
+	u64 total_time;
+	u64 squared_sum;
+	u64 max_time;
+	u64 min_time;
+	u32 count;
+	u32 error;
+};
+
+#endif /* UTIL_BPF_SKEL_SYSCALL_SUMMARY_H */
diff --git a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
index 7b81d3173917..a59ce912be18 100644
--- a/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
+++ b/tools/perf/util/bpf_skel/vmlinux/vmlinux.h
@@ -203,4 +203,13 @@ struct bpf_iter__kmem_cache {
 	struct kmem_cache *s;
 } __attribute__((preserve_access_index));
 
+struct zone {
+	spinlock_t lock;
+} __attribute__((preserve_access_index));
+
+struct pglist_data {
+	struct zone node_zones[6]; /* value for all possible config */
+	int nr_zones;
+} __attribute__((preserve_access_index));
+
 #endif // __VMLINUX_H