// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD   0x00200000  /* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* create a new thread */
#define CLONE_THREAD  0x10000

#define MAX_STACKS   32
#define MAX_ENTRIES  102400

#define MAX_CPUS  4096
/* 2 (pid/tgid, time) + 1 (nr) + 1 (PERF_CONTEXT_USER) + MAX_STACKS + 1 (cgroup id) */
#define MAX_OFFCPU_LEN 37

// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1
struct __stack {
	u64 array[MAX_STACKS];
};

struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
	struct __stack stack;
};

struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

struct offcpu_data {
	u64 array[MAX_OFFCPU_LEN];
};

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
	__uint(max_entries, MAX_CPUS);
} offcpu_output SEC(".maps");

/* per-cpu scratch buffer used to build one raw off-cpu sample */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct offcpu_data));
	__uint(max_entries, 1);
} offcpu_payload SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

int enabled = 0;

const volatile int has_cpu = 0;
const volatile int has_task = 0;
const volatile int has_cgroup = 0;
const volatile int uses_tgid = 0;
const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

int perf_subsys_id = -1;

__u64 offcpu_thresh_ns;

/*
 * Old kernels used to call it task_struct->state and now it's '__state'.
 * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture new type for compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture old type for compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}

static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (!uses_cgroup_v1)
		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid;

		if (uses_tgid)
			pid = t->tgid;
		else
			pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n)
{
	int len = 0;

	for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
		to->array[n + 2 + i] = from->array[i];

	return len;
}

/**
 * off_cpu_dump - dump off-cpu samples to ring buffer
 * @data: payload for dumping off-cpu samples
 * @key: off-cpu key of the task
 * @stack: stack trace of the task before being scheduled out
 *
 * If the threshold of off-cpu time is reached, acquire tid, period, callchain, and cgroup id
 * information of the task, and dump it as a raw sample to the perf ring buffer.
 */
static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
			struct __stack *stack, __u64 delta)
{
	int n = 0, len = 0;

	data->array[n++] = (u64)key->tgid << 32 | key->pid;
	data->array[n++] = delta;

	/* data->array[n] is callchain->nr (updated later) */
	data->array[n + 1] = PERF_CONTEXT_USER;
	data->array[n + 2] = 0;

	len = copy_stack(stack, data, n);

	/* update length of callchain */
	data->array[n] = len + 1;

	/* move past callchain->nr, the context marker and the copied entries */
	n += len + 2;

	data->array[n++] = key->cgroup_id;

	return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU,
				     data, n * sizeof(u64));
}

static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

	/*
	 * If stacks are successfully collected by bpf_get_stackid(), collect them once more
	 * in task_storage for direct off-cpu sample dumping.
	 */
	if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64),
					  BPF_F_USER_STACK)) {
		/*
		 * This empty if block is used to avoid a 'result unused' warning from bpf_get_stack().
		 * If the collection fails, continue with the logic for the next task.
		 */
	}

next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		if (delta >= offcpu_thresh_ns) {
			int zero = 0;
			struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);

			/* long enough: dump it as a raw sample right away */
			if (data)
				off_cpu_dump(ctx, data, &key, &pelem->stack, delta);
		} else {
			/* otherwise accumulate the time in the off_cpu hash map */
			total = bpf_map_lookup_elem(&off_cpu, &key);
			if (total)
				*total += delta;
			else
				bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
		}

		/* prevent reusing the timestamp later */
		pelem->timestamp = 0;
	}

	return 0;
}

SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
	struct task_struct *task;
	u64 clone_flags;
	u32 pid;
	u8 val = 1;

	if (!uses_tgid)
		return 0;

	task = (struct task_struct *)bpf_get_current_task();

	pid = BPF_CORE_READ(task, tgid);
	if (!bpf_map_lookup_elem(&task_filter, &pid))
		return 0;

	task = (struct task_struct *)ctx[0];
	clone_flags = ctx[1];

	pid = task->tgid;
	if (!(clone_flags & CLONE_THREAD))
		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

	return 0;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
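
/*
 * Illustrative sketch only, not part of the skeleton above: the raw sample
 * emitted by off_cpu_dump() through 'offcpu_output' is a flat array of u64
 * words.  The struct below (its name and the fixed two-entry callchain are
 * assumptions for illustration) shows how one such sample is laid out when
 * the saved user callchain has exactly two entries.
 */
struct offcpu_raw_sample_example {
	__u64 tgid_pid;     /* array[0]: (u64)tgid << 32 | pid */
	__u64 off_cpu_ns;   /* array[1]: time spent off-cpu (delta) */
	__u64 callchain_nr; /* array[2]: len + 1, counting the context marker */
	__u64 callchain[3]; /* PERF_CONTEXT_USER followed by the two user IPs */
	__u64 cgroup_id;    /* last word: key->cgroup_id */
};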