Diffstat (limited to 'tools/sched_ext/include')
 tools/sched_ext/include/scx/bpf_arena_common.bpf.h  | 175
 tools/sched_ext/include/scx/bpf_arena_common.h      |  33
 tools/sched_ext/include/scx/common.bpf.h            | 117
 tools/sched_ext/include/scx/common.h                |   5
 tools/sched_ext/include/scx/compat.bpf.h            | 330
 tools/sched_ext/include/scx/compat.h                |  14
 tools/sched_ext/include/scx/user_exit_info.bpf.h    |  40
 tools/sched_ext/include/scx/user_exit_info.h        |  49
 tools/sched_ext/include/scx/user_exit_info_common.h |  30
 9 files changed, 636 insertions(+), 157 deletions(-)
diff --git a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h new file mode 100644 index 000000000000..4366fb3c91ce --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h @@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef PAGE_SIZE +#define PAGE_SIZE __PAGE_SIZE +/* + * for older kernels try sizeof(struct genradix_node) + * or flexible: + * static inline long __bpf_page_size(void) { + * return bpf_core_enum_value(enum page_size_enum___l, __PAGE_SIZE___l) ?: sizeof(struct genradix_node); + * } + * but generated code is not great. + */ +#endif + +#if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM) +#define __arena __attribute__((address_space(1))) +#define __arena_global __attribute__((address_space(1))) +#define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */ +#define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */ +#else + +/* emit instruction: + * rX = rX .off = BPF_ADDR_SPACE_CAST .imm32 = (dst_as << 16) | src_as + * + * This is a workaround for LLVM compiler versions without + * __BPF_FEATURE_ADDR_SPACE_CAST that do not automatically cast between arena + * pointers and native kernel/userspace ones. In this case we explicitly do so + * with cast_kern() and cast_user(). E.g., in the Linux kernel tree, + * tools/testing/selftests/bpf includes tests that use these macros to implement + * linked lists and hashtables backed by arena memory. In sched_ext, we use + * cast_kern() and cast_user() for compatibility with older LLVM toolchains. + */ +#ifndef bpf_addr_space_cast +#define bpf_addr_space_cast(var, dst_as, src_as)\ + asm volatile(".byte 0xBF; \ + .ifc %[reg], r0; \ + .byte 0x00; \ + .endif; \ + .ifc %[reg], r1; \ + .byte 0x11; \ + .endif; \ + .ifc %[reg], r2; \ + .byte 0x22; \ + .endif; \ + .ifc %[reg], r3; \ + .byte 0x33; \ + .endif; \ + .ifc %[reg], r4; \ + .byte 0x44; \ + .endif; \ + .ifc %[reg], r5; \ + .byte 0x55; \ + .endif; \ + .ifc %[reg], r6; \ + .byte 0x66; \ + .endif; \ + .ifc %[reg], r7; \ + .byte 0x77; \ + .endif; \ + .ifc %[reg], r8; \ + .byte 0x88; \ + .endif; \ + .ifc %[reg], r9; \ + .byte 0x99; \ + .endif; \ + .short %[off]; \ + .long %[as]" \ + : [reg]"+r"(var) \ + : [off]"i"(BPF_ADDR_SPACE_CAST) \ + , [as]"i"((dst_as << 16) | src_as)); +#endif + +#define __arena +#define __arena_global SEC(".addr_space.1") +#define cast_kern(ptr) bpf_addr_space_cast(ptr, 0, 1) +#define cast_user(ptr) bpf_addr_space_cast(ptr, 1, 0) +#endif + +void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt, + int node_id, __u64 flags) __ksym __weak; +void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak; + +/* + * Note that cond_break can only be portably used in the body of a breakable + * construct, whereas can_loop can be used anywhere. 
+ */ +#ifdef TEST +#define can_loop true +#define __cond_break(expr) expr +#else +#ifdef __BPF_FEATURE_MAY_GOTO +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("may_goto %l[l_break]" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long ((%l[l_break] - 1b - 8) / 8) & 0xffff; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#else +#define can_loop \ + ({ __label__ l_break, l_continue; \ + bool ret = true; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: ret = false; \ + l_continue:; \ + ret; \ + }) + +#define __cond_break(expr) \ + ({ __label__ l_break, l_continue; \ + asm volatile goto("1:.byte 0xe5; \ + .byte 0; \ + .long (((%l[l_break] - 1b - 8) / 8) & 0xffff) << 16; \ + .short 0" \ + :::: l_break); \ + goto l_continue; \ + l_break: expr; \ + l_continue:; \ + }) +#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ +#endif /* __BPF_FEATURE_MAY_GOTO */ +#endif /* TEST */ + +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) + + +void bpf_preempt_disable(void) __weak __ksym; +void bpf_preempt_enable(void) __weak __ksym; diff --git a/tools/sched_ext/include/scx/bpf_arena_common.h b/tools/sched_ext/include/scx/bpf_arena_common.h new file mode 100644 index 000000000000..10141db0b59d --- /dev/null +++ b/tools/sched_ext/include/scx/bpf_arena_common.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ +/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */ +#pragma once + +#ifndef arena_container_of +#define arena_container_of(ptr, type, member) \ + ({ \ + void __arena *__mptr = (void __arena *)(ptr); \ + ((type *)(__mptr - offsetof(type, member))); \ + }) +#endif + +/* Provide the definition of PAGE_SIZE. 
*/ +#include <sys/user.h> + +#define __arena +#define __arg_arena +#define cast_kern(ptr) /* nop for user space */ +#define cast_user(ptr) /* nop for user space */ +char __attribute__((weak)) arena[1]; + +#ifndef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) +#endif + +static inline void __arena* bpf_arena_alloc_pages(void *map, void *addr, __u32 page_cnt, + int node_id, __u64 flags) +{ + return NULL; +} +static inline void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) +{ +} diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 8787048c6762..821d5791bd42 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -24,14 +24,26 @@ #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <asm-generic/errno.h> -#include "user_exit_info.h" +#include "user_exit_info.bpf.h" #include "enum_defs.autogen.h" +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ +#define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ +#define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_EXITING 0x00000004 #define CLOCK_MONOTONIC 1 +#ifndef NR_CPUS +#define NR_CPUS 1024 +#endif + +#ifndef NUMA_NO_NODE +#define NUMA_NO_NODE (-1) +#endif + extern int LINUX_KERNEL_VERSION __kconfig; extern const char CONFIG_CC_VERSION_TEXT[64] __kconfig __weak; extern const char CONFIG_LOCALVERSION[64] __kconfig __weak; @@ -48,19 +60,15 @@ static inline void ___vmlinux_h_sanity_check___(void) s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; -void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; -void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; +s32 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed, + struct scx_bpf_select_cpu_and_args *args) __ksym __weak; +bool __scx_bpf_dsq_insert_vtime(struct task_struct *p, struct scx_bpf_dsq_insert_vtime_args *args) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; -bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym __weak; -void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; -void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; -bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; +struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak; int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; @@ -89,7 +97,8 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const 
struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; -struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; +struct rq *scx_bpf_locked_rq(void) __ksym; +struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak; u64 scx_bpf_now(void) __ksym __weak; void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak; @@ -105,6 +114,9 @@ void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __ static inline __attribute__((format(printf, 1, 2))) void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} +#define SCX_STRINGIFY(x) #x +#define SCX_TOSTRING(x) SCX_STRINGIFY(x) + /* * Helper macro for initializing the fmt and variadic argument inputs to both * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to @@ -139,13 +151,15 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments * instead of an array of u64. Invoking this macro will cause the scheduler to * exit in an erroneous state, with diagnostic information being passed to the - * user. + * user. It appends the file and line number to aid debugging. */ #define scx_bpf_error(fmt, args...) \ ({ \ - scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_bstr_preamble( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args) \ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ - ___scx_bpf_bstr_format_checker(fmt, ##args); \ + ___scx_bpf_bstr_format_checker( \ + __FILE__ ":" SCX_TOSTRING(__LINE__) ": " fmt, ##args); \ }) /* @@ -227,6 +241,7 @@ BPF_PROG(name, ##args) * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of * `MEMBER_VPTR(ptr, ->member)`. */ +#ifndef MEMBER_VPTR #define MEMBER_VPTR(base, member) (typeof((base) member) *) \ ({ \ u64 __base = (u64)&(base); \ @@ -243,6 +258,7 @@ BPF_PROG(name, ##args) [max]"i"(sizeof(base) - sizeof((base) member))); \ __addr; \ }) +#endif /* MEMBER_VPTR */ /** * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element @@ -258,6 +274,7 @@ BPF_PROG(name, ##args) * size of the array to compute the max, which will result in rejection by * the verifier. */ +#ifndef ARRAY_ELEM_PTR #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ ({ \ u64 __base = (u64)arr; \ @@ -272,7 +289,7 @@ BPF_PROG(name, ##args) [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ __addr; \ }) - +#endif /* ARRAY_ELEM_PTR */ /* * BPF declarations and helpers @@ -436,8 +453,27 @@ static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) */ static inline bool is_migration_disabled(const struct task_struct *p) { - if (bpf_core_field_exists(p->migration_disabled)) - return p->migration_disabled; + /* + * Testing p->migration_disabled in a BPF code is tricky because the + * migration is _always_ disabled while running the BPF code. + * The prolog (__bpf_prog_enter) and epilog (__bpf_prog_exit) for BPF + * code execution disable and re-enable the migration of the current + * task, respectively. So, the _current_ task of the sched_ext ops is + * always migration-disabled. Moreover, p->migration_disabled could be + * two or greater when a sched_ext ops BPF code (e.g., ops.tick) is + * executed in the middle of the other BPF code execution. + * + * Therefore, we should decide that the _current_ task is + * migration-disabled only when its migration_disabled count is greater + * than one. 
In other words, when p->migration_disabled == 1, there is + * an ambiguity, so we should check if @p is the current task or not. + */ + if (bpf_core_field_exists(p->migration_disabled)) { + if (p->migration_disabled == 1) + return bpf_get_current_task_btf() != p; + else + return p->migration_disabled; + } return false; } @@ -474,7 +510,7 @@ static inline s64 time_delta(u64 after, u64 before) */ static inline bool time_after(u64 a, u64 b) { - return (s64)(b - a) < 0; + return (s64)(b - a) < 0; } /** @@ -498,7 +534,7 @@ static inline bool time_before(u64 a, u64 b) */ static inline bool time_after_eq(u64 a, u64 b) { - return (s64)(a - b) >= 0; + return (s64)(a - b) >= 0; } /** @@ -545,9 +581,15 @@ static inline bool time_in_range_open(u64 a, u64 b, u64 c) */ /* useful compiler attributes */ +#ifndef likely #define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely #define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#ifndef __maybe_unused #define __maybe_unused __attribute__((__unused__)) +#endif /* * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They @@ -631,6 +673,26 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s }) /* + * __calc_avg - Calculate exponential weighted moving average (EWMA) with + * @old and @new values. @decay represents how large the @old value remains. + * With a larger @decay value, the moving average changes slowly, exhibiting + * fewer fluctuations. + */ +#define __calc_avg(old, new, decay) ({ \ + typeof(decay) thr = 1 << (decay); \ + typeof(old) ret; \ + if (((old) < thr) || ((new) < thr)) { \ + if (((old) == 1) && ((new) == 0)) \ + ret = 0; \ + else \ + ret = ((old) - ((old) >> 1)) + ((new) >> 1); \ + } else { \ + ret = ((old) - ((old) >> (decay))) + ((new) >> (decay)); \ + } \ + ret; \ +}) + +/* * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. * @v: The value for which we're computing the base 2 logarithm. */ @@ -661,6 +723,25 @@ static inline u32 log2_u64(u64 v) } /* + * sqrt_u64 - Calculate the square root of value @x using Newton's method. + */ +static inline u64 __sqrt_u64(u64 x) +{ + if (x == 0 || x == 1) + return x; + + u64 r = ((1ULL << 32) > x) ? x : (1ULL << 32); + + for (int i = 0; i < 8; ++i) { + u64 q = x / r; + if (r <= q) + break; + r = (r + q) >> 1; + } + return r; +} + +/* * Return a value proportionally scaled to the task's weight. */ static inline u64 scale_by_task_weight(const struct task_struct *p, u64 value) diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h index 1dc76bd84296..b3c6372bcf81 100644 --- a/tools/sched_ext/include/scx/common.h +++ b/tools/sched_ext/include/scx/common.h @@ -75,8 +75,9 @@ typedef int64_t s64; #include "enums.h" /* not available when building kernel tools/sched_ext */ -#if __has_include(<lib/sdt_task.h>) -#include <lib/sdt_task.h> +#if __has_include(<lib/sdt_task_defs.h>) +#include "bpf_arena_common.h" +#include <lib/sdt_task_defs.h> #endif #endif /* __SCHED_EXT_COMMON_H */ diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 9252e1a00556..f2969c3061a7 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -16,114 +16,92 @@ }) /* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ -#define __COMPAT_scx_bpf_task_cgroup(p) \ - (bpf_ksym_exists(scx_bpf_task_cgroup) ? 
\ - scx_bpf_task_cgroup((p)) : NULL) +struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak; + +#define scx_bpf_task_cgroup(p) \ + (bpf_ksym_exists(scx_bpf_task_cgroup___new) ? \ + scx_bpf_task_cgroup___new((p)) : NULL) /* * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are * renamed to unload the verb. * - * Build error is triggered if old names are used. New binaries work with both - * new and old names. The compat macros will be removed on v6.15 release. - * * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()"). - * Preserve __COMPAT macros until v6.15. */ -void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; -void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; -bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; -void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; -void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; -bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; - -#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ - scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \ - scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags))) - -#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \ - scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \ - scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags))) +bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak; +void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; +bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dsq_move_vtime___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; + +bool scx_bpf_consume___old(u64 dsq_id) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_slice___old(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; +bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; #define scx_bpf_dsq_move_to_local(dsq_id) \ - (bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \ - scx_bpf_dsq_move_to_local((dsq_id)) : \ - scx_bpf_consume___compat((dsq_id))) - -#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \ - (bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \ - scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? 
\ - scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \ + (bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ? \ + scx_bpf_dsq_move_to_local___new((dsq_id)) : \ + scx_bpf_consume___old((dsq_id))) + +#define scx_bpf_dsq_move_set_slice(it__iter, slice) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ? \ + scx_bpf_dsq_move_set_slice___new((it__iter), (slice)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___old) ? \ + scx_bpf_dispatch_from_dsq_set_slice___old((it__iter), (slice)) : \ (void)0)) -#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ - (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \ - scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \ - scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \ - (void) 0)) - -#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_move) ? \ - scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \ - scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ - false)) +#define scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime___new) ? \ + scx_bpf_dsq_move_set_vtime___new((it__iter), (vtime)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___old) ? \ + scx_bpf_dispatch_from_dsq_set_vtime___old((it__iter), (vtime)) : \ + (void)0)) -#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \ - scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \ - (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \ - scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ +#define scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move___new) ? \ + scx_bpf_dsq_move___new((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___old) ? 
\ + scx_bpf_dispatch_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \ false)) -#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ - _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") - -#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \ - _Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()") - -#define scx_bpf_consume(dsq_id) ({ \ - _Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \ - false; \ -}) - -#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ - _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()") - -#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ - _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()") - -#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \ - false; \ -}) - -#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \ - false; \ -}) - -#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()") +#define scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move_vtime___new) ? \ + scx_bpf_dsq_move_vtime___new((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___old) ? \ + scx_bpf_dispatch_vtime_from_dsq___old((it__iter), (p), (dsq_id), (enq_flags)) : \ + false)) -#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()") +/* + * v6.15: 950ad93df2fc ("bpf: add kfunc for populating cpumask bits") + * + * Compat macro will be dropped on v6.19 release. + */ +int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym __weak; -#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \ - false; \ -}) +#define __COMPAT_bpf_cpumask_populate(cpumask, src, size__sz) \ + (bpf_ksym_exists(bpf_cpumask_populate) ? \ + (bpf_cpumask_populate(cpumask, src, size__sz)) : -EOPNOTSUPP) -#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ - _Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \ - false; \ -}) +/* + * v6.19: Introduce lockless peek API for user DSQs. + * + * Preserve the following macro until v6.21. + */ +static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id) +{ + struct task_struct *p = NULL; + struct bpf_iter_scx_dsq it; + + if (bpf_ksym_exists(scx_bpf_dsq_peek)) + return scx_bpf_dsq_peek(dsq_id); + if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0)) + p = bpf_iter_scx_dsq_next(&it); + bpf_iter_scx_dsq_destroy(&it); + return p; +} /** * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on @@ -226,6 +204,178 @@ static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags) scx_bpf_pick_any_cpu(cpus_allowed, flags)) /* + * v6.18: Add a helper to retrieve the current task running on a CPU. 
+ * + * Keep this helper available until v6.20 for compatibility. + */ +static inline struct task_struct *__COMPAT_scx_bpf_cpu_curr(int cpu) +{ + struct rq *rq; + + if (bpf_ksym_exists(scx_bpf_cpu_curr)) + return scx_bpf_cpu_curr(cpu); + + rq = scx_bpf_cpu_rq(cpu); + + return rq ? rq->curr : NULL; +} + +/* + * v6.19: To work around BPF maximum parameter limit, the following kfuncs are + * replaced with variants that pack scalar arguments in a struct. Wrappers are + * provided to maintain source compatibility. + * + * v6.13: scx_bpf_dsq_insert_vtime() renaming is also handled here. See the + * block on dispatch renaming above for more details. + * + * The kernel will carry the compat variants until v6.23 to maintain binary + * compatibility. After v6.23 release, remove the compat handling and move the + * wrappers to common.bpf.h. + */ +s32 scx_bpf_select_cpu_and___compat(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) __ksym __weak; +void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; +void scx_bpf_dsq_insert_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; + +/** + * scx_bpf_select_cpu_and - Pick an idle CPU usable by task @p + * @p: task_struct to select a CPU for + * @prev_cpu: CPU @p was on previously + * @wake_flags: %SCX_WAKE_* flags + * @cpus_allowed: cpumask of allowed CPUs + * @flags: %SCX_PICK_IDLE* flags + * + * Inline wrapper that packs scalar arguments into a struct and calls + * __scx_bpf_select_cpu_and(). See __scx_bpf_select_cpu_and() for details. + */ +static inline s32 +scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags, + const struct cpumask *cpus_allowed, u64 flags) +{ + if (bpf_core_type_exists(struct scx_bpf_select_cpu_and_args)) { + struct scx_bpf_select_cpu_and_args args = { + .prev_cpu = prev_cpu, + .wake_flags = wake_flags, + .flags = flags, + }; + + return __scx_bpf_select_cpu_and(p, cpus_allowed, &args); + } else { + return scx_bpf_select_cpu_and___compat(p, prev_cpu, wake_flags, + cpus_allowed, flags); + } +} + +/** + * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into + * @slice: duration @p can run for in nsecs, 0 to keep the current value + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Inline wrapper that packs scalar arguments into a struct and calls + * __scx_bpf_dsq_insert_vtime(). See __scx_bpf_dsq_insert_vtime() for details. + */ +static inline bool +scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, + u64 enq_flags) +{ + if (bpf_core_type_exists(struct scx_bpf_dsq_insert_vtime_args)) { + struct scx_bpf_dsq_insert_vtime_args args = { + .dsq_id = dsq_id, + .slice = slice, + .vtime = vtime, + .enq_flags = enq_flags, + }; + + return __scx_bpf_dsq_insert_vtime(p, &args); + } else if (bpf_ksym_exists(scx_bpf_dsq_insert_vtime___compat)) { + scx_bpf_dsq_insert_vtime___compat(p, dsq_id, slice, vtime, + enq_flags); + return true; + } else { + scx_bpf_dispatch_vtime___compat(p, dsq_id, slice, vtime, + enq_flags); + return true; + } +} + +/* + * v6.19: scx_bpf_dsq_insert() now returns bool instead of void. Move + * scx_bpf_dsq_insert() decl to common.bpf.h and drop compat helper after v6.22. 
+ * The extra ___compat suffix is to work around libbpf not ignoring __SUFFIX on + * kernel side. The entire suffix can be dropped later. + * + * v6.13: scx_bpf_dsq_insert() renaming is also handled here. See the block on + * dispatch renaming above for more details. + */ +bool scx_bpf_dsq_insert___v2___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dsq_insert___v1(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; + +static inline bool +scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) +{ + if (bpf_ksym_exists(scx_bpf_dsq_insert___v2___compat)) { + return scx_bpf_dsq_insert___v2___compat(p, dsq_id, slice, enq_flags); + } else if (bpf_ksym_exists(scx_bpf_dsq_insert___v1)) { + scx_bpf_dsq_insert___v1(p, dsq_id, slice, enq_flags); + return true; + } else { + scx_bpf_dispatch___compat(p, dsq_id, slice, enq_flags); + return true; + } +} + +/* + * v6.19: scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() added to for + * sub-sched authority checks. Drop the wrappers and move the decls to + * common.bpf.h after v6.22. + */ +bool scx_bpf_task_set_slice___new(struct task_struct *p, u64 slice) __ksym __weak; +bool scx_bpf_task_set_dsq_vtime___new(struct task_struct *p, u64 vtime) __ksym __weak; + +static inline void scx_bpf_task_set_slice(struct task_struct *p, u64 slice) +{ + if (bpf_ksym_exists(scx_bpf_task_set_slice___new)) + scx_bpf_task_set_slice___new(p, slice); + else + p->scx.slice = slice; +} + +static inline void scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime) +{ + if (bpf_ksym_exists(scx_bpf_task_set_dsq_vtime___new)) + scx_bpf_task_set_dsq_vtime___new(p, vtime); + else + p->scx.dsq_vtime = vtime; +} + +/* + * v6.19: The new void variant can be called from anywhere while the older v1 + * variant can only be called from ops.cpu_release(). The double ___ prefixes on + * the v2 variant need to be removed once libbpf is updated to ignore ___ prefix + * on kernel side. Drop the wrapper and move the decl to common.bpf.h after + * v6.22. + */ +u32 scx_bpf_reenqueue_local___v1(void) __ksym __weak; +void scx_bpf_reenqueue_local___v2___compat(void) __ksym __weak; + +static inline bool __COMPAT_scx_bpf_reenqueue_local_from_anywhere(void) +{ + return bpf_ksym_exists(scx_bpf_reenqueue_local___v2___compat); +} + +static inline void scx_bpf_reenqueue_local(void) +{ + if (__COMPAT_scx_bpf_reenqueue_local_from_anywhere()) + scx_bpf_reenqueue_local___v2___compat(); + else + scx_bpf_reenqueue_local___v1(); +} + +/* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). */ diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index 35c67c5174ac..8b4897fc8b99 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -151,6 +151,10 @@ static inline long scx_hotplug_seq(void) * * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is * the current minimum required kernel version. 
+ * + * COMPAT: + * - v6.17: ops.cgroup_set_bandwidth() + * - v6.19: ops.cgroup_set_idle() */ #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ struct __scx_name *__skel; \ @@ -162,6 +166,16 @@ static inline long scx_hotplug_seq(void) SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ SCX_ENUM_INIT(__skel); \ + if (__skel->struct_ops.__ops_name->cgroup_set_bandwidth && \ + !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_bandwidth")) { \ + fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_bandwidth()\n"); \ + __skel->struct_ops.__ops_name->cgroup_set_bandwidth = NULL; \ + } \ + if (__skel->struct_ops.__ops_name->cgroup_set_idle && \ + !__COMPAT_struct_has_field("sched_ext_ops", "cgroup_set_idle")) { \ + fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \ + __skel->struct_ops.__ops_name->cgroup_set_idle = NULL; \ + } \ __skel; \ }) diff --git a/tools/sched_ext/include/scx/user_exit_info.bpf.h b/tools/sched_ext/include/scx/user_exit_info.bpf.h new file mode 100644 index 000000000000..e7ac6611a990 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info.bpf.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ + +#ifndef __USER_EXIT_INFO_BPF_H +#define __USER_EXIT_INFO_BPF_H + +#ifndef LSP +#include "vmlinux.h" +#endif +#include <bpf/bpf_core_read.h> + +#include "user_exit_info_common.h" + +#define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ + struct user_exit_info __name SEC(".data") + +#define UEI_RECORD(__uei_name, __ei) ({ \ + bpf_probe_read_kernel_str(__uei_name.reason, \ + sizeof(__uei_name.reason), (__ei)->reason); \ + bpf_probe_read_kernel_str(__uei_name.msg, \ + sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ + if (bpf_core_field_exists((__ei)->exit_code)) \ + __uei_name.exit_code = (__ei)->exit_code; \ + /* use __sync to force memory barrier */ \ + __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ + (__ei)->kind); \ +}) + +#endif /* __USER_EXIT_INFO_BPF_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 66f856640ee7..399697fa372f 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -10,55 +10,11 @@ #ifndef __USER_EXIT_INFO_H #define __USER_EXIT_INFO_H -#ifdef LSP -#define __bpf__ -#include "../vmlinux.h" -#endif - -enum uei_sizes { - UEI_REASON_LEN = 128, - UEI_MSG_LEN = 1024, - UEI_DUMP_DFL_LEN = 32768, -}; - -struct user_exit_info { - int kind; - s64 exit_code; - char reason[UEI_REASON_LEN]; - char msg[UEI_MSG_LEN]; -}; - -#ifdef __bpf__ - -#ifndef LSP -#include "vmlinux.h" -#endif -#include <bpf/bpf_core_read.h> - -#define UEI_DEFINE(__name) \ - char RESIZABLE_ARRAY(data, __name##_dump); \ - const volatile u32 __name##_dump_len; \ - struct user_exit_info __name SEC(".data") - -#define UEI_RECORD(__uei_name, __ei) ({ \ - bpf_probe_read_kernel_str(__uei_name.reason, \ - sizeof(__uei_name.reason), (__ei)->reason); \ - bpf_probe_read_kernel_str(__uei_name.msg, \ - 
sizeof(__uei_name.msg), (__ei)->msg); \ - bpf_probe_read_kernel_str(__uei_name##_dump, \ - __uei_name##_dump_len, (__ei)->dump); \ - if (bpf_core_field_exists((__ei)->exit_code)) \ - __uei_name.exit_code = (__ei)->exit_code; \ - /* use __sync to force memory barrier */ \ - __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ - (__ei)->kind); \ -}) - -#else /* !__bpf__ */ - #include <stdio.h> #include <stdbool.h> +#include "user_exit_info_common.h" + /* no need to call the following explicitly if SCX_OPS_LOAD() is used */ #define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ @@ -114,5 +70,4 @@ enum uei_ecode_mask { #define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) -#endif /* __bpf__ */ #endif /* __USER_EXIT_INFO_H */ diff --git a/tools/sched_ext/include/scx/user_exit_info_common.h b/tools/sched_ext/include/scx/user_exit_info_common.h new file mode 100644 index 000000000000..2d0981aedd89 --- /dev/null +++ b/tools/sched_ext/include/scx/user_exit_info_common.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Define struct user_exit_info which is shared between BPF and userspace parts + * to communicate exit status and other information. + * + * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2022 Tejun Heo <tj@kernel.org> + * Copyright (c) 2022 David Vernet <dvernet@meta.com> + */ +#ifndef __USER_EXIT_INFO_COMMON_H +#define __USER_EXIT_INFO_COMMON_H + +#ifdef LSP +#include "../vmlinux.h" +#endif + +enum uei_sizes { + UEI_REASON_LEN = 128, + UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, +}; + +struct user_exit_info { + int kind; + s64 exit_code; + char reason[UEI_REASON_LEN]; + char msg[UEI_MSG_LEN]; +}; + +#endif /* __USER_EXIT_INFO_COMMON_H */ |
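
For reference, a minimal usage sketch of the source-compatibility wrappers added in compat.bpf.h above. It is not part of the patch; SHARED_DSQ and the two ops below are hypothetical, and the SCX_OPS_DEFINE() registration is omitted. On v6.19+ kernels scx_bpf_dsq_insert() and scx_bpf_dsq_insert_vtime() report success as a bool, while the compat fallbacks always return true; __COMPAT_scx_bpf_dsq_peek() degrades to the DSQ iterator when scx_bpf_dsq_peek() is unavailable.

#include <scx/common.bpf.h>
#include <scx/compat.bpf.h>	/* harmless if already pulled in via common.bpf.h */

#define SHARED_DSQ 0	/* hypothetical user DSQ, created via scx_bpf_create_dsq() in ops.init() */

void BPF_STRUCT_OPS(sketch_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* The wrapper packs the scalar args into a struct on v6.19+ kernels. */
	if (!scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL,
				      p->scx.dsq_vtime, enq_flags))
		scx_bpf_error("vtime insert failed");
}

void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
{
	/* Lockless peek on v6.19+, iterator-based fallback on older kernels. */
	if (__COMPAT_scx_bpf_dsq_peek(SHARED_DSQ))
		scx_bpf_dsq_move_to_local(SHARED_DSQ);
}

char _license[] SEC("license") = "GPL";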

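Likewise, a minimal sketch of the verifier loop helpers defined in bpf_arena_common.bpf.h, written as if it lived in the same BPF object as the sketch above; struct node and sum_list() are hypothetical and an arena map is assumed to exist elsewhere. can_loop is an expression usable anywhere a bool fits, while cond_break and cond_break_label() must sit inside a breakable construct.

struct node {
	struct node __arena *next;
	u64 val;
};

static u64 sum_list(struct node __arena *head)
{
	struct node __arena *n = head;
	u64 sum = 0;

	/*
	 * can_loop bounds the loop for the verifier via may_goto, or the raw
	 * gotol encoding on toolchains without __BPF_FEATURE_MAY_GOTO.
	 */
	while (n && can_loop) {
		cast_kern(n);	/* nop when LLVM emits address-space casts itself */
		sum += n->val;
		n = n->next;
	}
	return sum;
}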