Diffstat (limited to 'tools')
-rw-r--r-- | tools/sched_ext/include/scx/common.bpf.h        |  34
-rw-r--r-- | tools/sched_ext/include/scx/common.h            |   1
-rw-r--r-- | tools/sched_ext/include/scx/compat.bpf.h        |  95
-rw-r--r-- | tools/sched_ext/include/scx/compat.h            |  16
-rw-r--r-- | tools/sched_ext/include/scx/enum_defs.autogen.h | 120
-rw-r--r-- | tools/sched_ext/scx_central.c                   |  15
-rw-r--r-- | tools/sched_ext/scx_qmap.bpf.c                  |  23
-rw-r--r-- | tools/testing/selftests/sched_ext/Makefile      |   1
-rw-r--r-- | tools/testing/selftests/sched_ext/numa.bpf.c    | 100
-rw-r--r-- | tools/testing/selftests/sched_ext/numa.c        |  59
10 files changed, 458 insertions, 6 deletions
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 7849405614b1..dc4333d23189 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -7,6 +7,13 @@
 #ifndef __SCX_COMMON_BPF_H
 #define __SCX_COMMON_BPF_H
 
+/*
+ * The generated kfunc prototypes in vmlinux.h are missing address space
+ * attributes which cause build failures. For now, suppress the generated
+ * prototypes. See https://github.com/sched-ext/scx/issues/1111.
+ */
+#define BPF_NO_KFUNC_PROTOTYPES
+
 #ifdef LSP
 #define __bpf__
 #include "../vmlinux.h"
@@ -18,6 +25,7 @@
 #include <bpf/bpf_tracing.h>
 #include <asm-generic/errno.h>
 #include "user_exit_info.h"
+#include "enum_defs.autogen.h"
 
 #define PF_WQ_WORKER			0x00000020	/* I'm a workqueue worker */
 #define PF_KTHREAD			0x00200000	/* I am a kernel thread */
@@ -62,21 +70,28 @@ void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym
 u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
 u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
 void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
+u32 scx_bpf_nr_node_ids(void) __ksym __weak;
 u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
+int scx_bpf_cpu_node(s32 cpu) __ksym __weak;
 const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
 const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
 void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
+const struct cpumask *scx_bpf_get_idle_cpumask_node(int node) __ksym __weak;
 const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
+const struct cpumask *scx_bpf_get_idle_smtmask_node(int node) __ksym __weak;
 const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
 void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
 bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
+s32 scx_bpf_pick_idle_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
 s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
+s32 scx_bpf_pick_any_cpu_node(const cpumask_t *cpus_allowed, int node, u64 flags) __ksym __weak;
 s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
 bool scx_bpf_task_running(const struct task_struct *p) __ksym;
 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
 struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak;
 u64 scx_bpf_now(void) __ksym __weak;
+void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
 
 /*
  * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
@@ -84,6 +99,9 @@ u64 scx_bpf_now(void) __ksym __weak;
  */
 #define BPF_FOR_EACH_ITER	(&___it)
 
+#define scx_read_event(e, name)						\
+	(bpf_core_field_exists((e)->name) ? (e)->name : 0)
+
 static inline __attribute__((format(printf, 1, 2)))
 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
 
@@ -584,6 +602,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 	__u.__val;					\
 })
 
+#define READ_ONCE_ARENA(type, x)			\
+({							\
+	union { type __val; char __c[1]; } __u =	\
+		{ .__c = { 0 } };			\
+	__read_once_size((void *)&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
+
+#define WRITE_ONCE_ARENA(type, x, val)			\
+({							\
+	union { type __val; char __c[1]; } __u =	\
+		{ .__val = (val) };			\
+	__write_once_size((void *)&(x), __u.__c, sizeof(x));	\
+	__u.__val;					\
+})
+
 /*
  * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
  * @v: The value for which we're computing the base 2 logarithm.
diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
index dc18b99e55cd..1dc76bd84296 100644
--- a/tools/sched_ext/include/scx/common.h
+++ b/tools/sched_ext/include/scx/common.h
@@ -16,6 +16,7 @@
 #include <stdlib.h>
 #include <stdint.h>
 #include <errno.h>
+#include "enum_defs.autogen.h"
 
 typedef uint8_t u8;
 typedef uint16_t u16;
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 50e1499ae093..9252e1a00556 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -125,12 +125,107 @@ bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter,
 	 false;								\
 })
 
+/**
+ * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is set,
+ * in a compatible way. This __COMPAT helper will be preserved until v6.16.
+ *
+ * @enq_flags: enqueue flags from ops.enqueue()
+ *
+ * Return: True if SCX_ENQ_CPU_SELECTED is set in @enq_flags
+ */
+static inline bool __COMPAT_is_enq_cpu_selected(u64 enq_flags)
+{
+#ifdef HAVE_SCX_ENQ_CPU_SELECTED
+	/*
+	 * This is the case where the BPF code was compiled against a
+	 * vmlinux.h in which the enum SCX_ENQ_CPU_SELECTED exists.
+	 */
+
+	/*
+	 * Temporarily suspend the macro expansion of
+	 * 'SCX_ENQ_CPU_SELECTED'. This avoids 'SCX_ENQ_CPU_SELECTED'
+	 * being rewritten to '__SCX_ENQ_CPU_SELECTED' when it is
+	 * defined by 'scripts/gen_enums.py'.
+	 */
+#pragma push_macro("SCX_ENQ_CPU_SELECTED")
+#undef SCX_ENQ_CPU_SELECTED
+	u64 flag;
+
+	/*
+	 * On kernels without SCX_ENQ_CPU_SELECTED,
+	 * select_task_rq_scx() was never skipped, so this case should
+	 * be treated as if the CPU has already been selected.
+	 */
+	if (!bpf_core_enum_value_exists(enum scx_enq_flags,
+					SCX_ENQ_CPU_SELECTED))
+		return true;
+
+	flag = bpf_core_enum_value(enum scx_enq_flags, SCX_ENQ_CPU_SELECTED);
+	return enq_flags & flag;
+
+	/*
+	 * Once done, resume the macro expansion of 'SCX_ENQ_CPU_SELECTED'.
+	 */
+#pragma pop_macro("SCX_ENQ_CPU_SELECTED")
+#else
+	/*
+	 * This is the case where the BPF code was compiled against a
+	 * vmlinux.h in which the enum SCX_ENQ_CPU_SELECTED does NOT exist.
+	 */
+	return true;
+#endif /* HAVE_SCX_ENQ_CPU_SELECTED */
+}
+
 #define scx_bpf_now()							\
 	(bpf_ksym_exists(scx_bpf_now) ?					\
 	 scx_bpf_now() :						\
 	 bpf_ktime_get_ns())
 
 /*
+ * v6.15: Introduce event counters.
+ *
+ * Preserve the following macro until v6.17.
+ */
+#define __COMPAT_scx_bpf_events(events, size)				\
+	(bpf_ksym_exists(scx_bpf_events) ?				\
+	 scx_bpf_events(events, size) : ({}))
+
+/*
+ * v6.15: Introduce NUMA-aware kfuncs to operate with per-node idle
+ * cpumasks.
+ *
+ * Preserve the following __COMPAT_scx_*_node macros until v6.17.
+ */
+#define __COMPAT_scx_bpf_nr_node_ids()					\
+	(bpf_ksym_exists(scx_bpf_nr_node_ids) ?				\
+	 scx_bpf_nr_node_ids() : 1U)
+
+#define __COMPAT_scx_bpf_cpu_node(cpu)					\
+	(bpf_ksym_exists(scx_bpf_cpu_node) ?				\
+	 scx_bpf_cpu_node(cpu) : 0)
+
+#define __COMPAT_scx_bpf_get_idle_cpumask_node(node)			\
+	(bpf_ksym_exists(scx_bpf_get_idle_cpumask_node) ?		\
+	 scx_bpf_get_idle_cpumask_node(node) :				\
+	 scx_bpf_get_idle_cpumask())
+
+#define __COMPAT_scx_bpf_get_idle_smtmask_node(node)			\
+	(bpf_ksym_exists(scx_bpf_get_idle_smtmask_node) ?		\
+	 scx_bpf_get_idle_smtmask_node(node) :				\
+	 scx_bpf_get_idle_smtmask())
+
+#define __COMPAT_scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags)	\
+	(bpf_ksym_exists(scx_bpf_pick_idle_cpu_node) ?			\
+	 scx_bpf_pick_idle_cpu_node(cpus_allowed, node, flags) :	\
+	 scx_bpf_pick_idle_cpu(cpus_allowed, flags))
+
+#define __COMPAT_scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags)	\
+	(bpf_ksym_exists(scx_bpf_pick_any_cpu_node) ?			\
+	 scx_bpf_pick_any_cpu_node(cpus_allowed, node, flags) :		\
+	 scx_bpf_pick_any_cpu(cpus_allowed, flags))
+
+/*
  * Define sched_ext_ops. This may be expanded to define multiple variants for
  * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
  */
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index b50280e2ba2b..35c67c5174ac 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -106,8 +106,20 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
 	return false;
 }
 
-#define SCX_OPS_SWITCH_PARTIAL						\
-	__COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
+#define SCX_OPS_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_ops_flags", #name)
+
+#define SCX_OPS_KEEP_BUILTIN_IDLE SCX_OPS_FLAG(SCX_OPS_KEEP_BUILTIN_IDLE)
+#define SCX_OPS_ENQ_LAST SCX_OPS_FLAG(SCX_OPS_ENQ_LAST)
+#define SCX_OPS_ENQ_EXITING SCX_OPS_FLAG(SCX_OPS_ENQ_EXITING)
+#define SCX_OPS_SWITCH_PARTIAL SCX_OPS_FLAG(SCX_OPS_SWITCH_PARTIAL)
+#define SCX_OPS_ENQ_MIGRATION_DISABLED SCX_OPS_FLAG(SCX_OPS_ENQ_MIGRATION_DISABLED)
+#define SCX_OPS_ALLOW_QUEUED_WAKEUP SCX_OPS_FLAG(SCX_OPS_ALLOW_QUEUED_WAKEUP)
+#define SCX_OPS_BUILTIN_IDLE_PER_NODE SCX_OPS_FLAG(SCX_OPS_BUILTIN_IDLE_PER_NODE)
+
+#define SCX_PICK_IDLE_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_pick_idle_cpu_flags", #name)
+
+#define SCX_PICK_IDLE_CORE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_CORE)
+#define SCX_PICK_IDLE_IN_NODE SCX_PICK_IDLE_FLAG(SCX_PICK_IDLE_IN_NODE)
 
 static inline long scx_hotplug_seq(void)
 {
diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h
new file mode 100644
index 000000000000..6e6c45f14fe1
--- /dev/null
+++ b/tools/sched_ext/include/scx/enum_defs.autogen.h
@@ -0,0 +1,120 @@
+/*
+ * WARNING: This file is autogenerated from gen_enum_defs.py [1].
+ *
+ * [1] https://github.com/sched-ext/scx/blob/main/scripts/gen_enum_defs.py
+ */
+
+#ifndef __ENUM_DEFS_AUTOGEN_H__
+#define __ENUM_DEFS_AUTOGEN_H__
+
+#define HAVE_SCX_DSP_DFL_MAX_BATCH
+#define HAVE_SCX_DSP_MAX_LOOPS
+#define HAVE_SCX_WATCHDOG_MAX_TIMEOUT
+#define HAVE_SCX_EXIT_BT_LEN
+#define HAVE_SCX_EXIT_MSG_LEN
+#define HAVE_SCX_EXIT_DUMP_DFL_LEN
+#define HAVE_SCX_CPUPERF_ONE
+#define HAVE_SCX_OPS_TASK_ITER_BATCH
+#define HAVE_SCX_CPU_PREEMPT_RT
+#define HAVE_SCX_CPU_PREEMPT_DL
+#define HAVE_SCX_CPU_PREEMPT_STOP
+#define HAVE_SCX_CPU_PREEMPT_UNKNOWN
+#define HAVE_SCX_DEQ_SLEEP
+#define HAVE_SCX_DEQ_CORE_SCHED_EXEC
+#define HAVE_SCX_DSQ_FLAG_BUILTIN
+#define HAVE_SCX_DSQ_FLAG_LOCAL_ON
+#define HAVE_SCX_DSQ_INVALID
+#define HAVE_SCX_DSQ_GLOBAL
+#define HAVE_SCX_DSQ_LOCAL
+#define HAVE_SCX_DSQ_LOCAL_ON
+#define HAVE_SCX_DSQ_LOCAL_CPU_MASK
+#define HAVE_SCX_DSQ_ITER_REV
+#define HAVE___SCX_DSQ_ITER_HAS_SLICE
+#define HAVE___SCX_DSQ_ITER_HAS_VTIME
+#define HAVE___SCX_DSQ_ITER_USER_FLAGS
+#define HAVE___SCX_DSQ_ITER_ALL_FLAGS
+#define HAVE_SCX_DSQ_LNODE_ITER_CURSOR
+#define HAVE___SCX_DSQ_LNODE_PRIV_SHIFT
+#define HAVE_SCX_ENQ_WAKEUP
+#define HAVE_SCX_ENQ_HEAD
+#define HAVE_SCX_ENQ_CPU_SELECTED
+#define HAVE_SCX_ENQ_PREEMPT
+#define HAVE_SCX_ENQ_REENQ
+#define HAVE_SCX_ENQ_LAST
+#define HAVE___SCX_ENQ_INTERNAL_MASK
+#define HAVE_SCX_ENQ_CLEAR_OPSS
+#define HAVE_SCX_ENQ_DSQ_PRIQ
+#define HAVE_SCX_TASK_DSQ_ON_PRIQ
+#define HAVE_SCX_TASK_QUEUED
+#define HAVE_SCX_TASK_RESET_RUNNABLE_AT
+#define HAVE_SCX_TASK_DEQD_FOR_SLEEP
+#define HAVE_SCX_TASK_STATE_SHIFT
+#define HAVE_SCX_TASK_STATE_BITS
+#define HAVE_SCX_TASK_STATE_MASK
+#define HAVE_SCX_TASK_CURSOR
+#define HAVE_SCX_ECODE_RSN_HOTPLUG
+#define HAVE_SCX_ECODE_ACT_RESTART
+#define HAVE_SCX_EXIT_NONE
+#define HAVE_SCX_EXIT_DONE
+#define HAVE_SCX_EXIT_UNREG
+#define HAVE_SCX_EXIT_UNREG_BPF
+#define HAVE_SCX_EXIT_UNREG_KERN
+#define HAVE_SCX_EXIT_SYSRQ
+#define HAVE_SCX_EXIT_ERROR
+#define HAVE_SCX_EXIT_ERROR_BPF
+#define HAVE_SCX_EXIT_ERROR_STALL
+#define HAVE_SCX_KF_UNLOCKED
+#define HAVE_SCX_KF_CPU_RELEASE
+#define HAVE_SCX_KF_DISPATCH
+#define HAVE_SCX_KF_ENQUEUE
+#define HAVE_SCX_KF_SELECT_CPU
+#define HAVE_SCX_KF_REST
+#define HAVE___SCX_KF_RQ_LOCKED
+#define HAVE___SCX_KF_TERMINAL
+#define HAVE_SCX_KICK_IDLE
+#define HAVE_SCX_KICK_PREEMPT
+#define HAVE_SCX_KICK_WAIT
+#define HAVE_SCX_OPI_BEGIN
+#define HAVE_SCX_OPI_NORMAL_BEGIN
+#define HAVE_SCX_OPI_NORMAL_END
+#define HAVE_SCX_OPI_CPU_HOTPLUG_BEGIN
+#define HAVE_SCX_OPI_CPU_HOTPLUG_END
+#define HAVE_SCX_OPI_END
+#define HAVE_SCX_OPS_ENABLING
+#define HAVE_SCX_OPS_ENABLED
+#define HAVE_SCX_OPS_DISABLING
+#define HAVE_SCX_OPS_DISABLED
+#define HAVE_SCX_OPS_KEEP_BUILTIN_IDLE
+#define HAVE_SCX_OPS_ENQ_LAST
+#define HAVE_SCX_OPS_ENQ_EXITING
+#define HAVE_SCX_OPS_SWITCH_PARTIAL
+#define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT
+#define HAVE_SCX_OPS_ALL_FLAGS
+#define HAVE_SCX_OPSS_NONE
+#define HAVE_SCX_OPSS_QUEUEING
+#define HAVE_SCX_OPSS_QUEUED
+#define HAVE_SCX_OPSS_DISPATCHING
+#define HAVE_SCX_OPSS_QSEQ_SHIFT
+#define HAVE_SCX_PICK_IDLE_CORE
+#define HAVE_SCX_OPS_NAME_LEN
+#define HAVE_SCX_SLICE_DFL
+#define HAVE_SCX_SLICE_INF
+#define HAVE_SCX_RQ_ONLINE
+#define HAVE_SCX_RQ_CAN_STOP_TICK
+#define HAVE_SCX_RQ_BAL_PENDING
+#define HAVE_SCX_RQ_BAL_KEEP
+#define HAVE_SCX_RQ_BYPASSING
+#define HAVE_SCX_RQ_IN_WAKEUP
+#define HAVE_SCX_RQ_IN_BALANCE
+#define HAVE_SCX_TASK_NONE
+#define HAVE_SCX_TASK_INIT
+#define HAVE_SCX_TASK_READY
+#define HAVE_SCX_TASK_ENABLED
+#define HAVE_SCX_TASK_NR_STATES
+#define HAVE_SCX_TG_ONLINE
+#define HAVE_SCX_TG_INITED
+#define HAVE_SCX_WAKE_FORK
+#define HAVE_SCX_WAKE_TTWU
+#define HAVE_SCX_WAKE_SYNC
+
+#endif /* __ENUM_DEFS_AUTOGEN_H__ */
diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index 1e9f74525d8f..6ba6e610eeaa 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -10,6 +10,7 @@
 #include <unistd.h>
 #include <inttypes.h>
 #include <signal.h>
+#include <assert.h>
 #include <libgen.h>
 #include <bpf/bpf.h>
 #include <scx/common.h>
@@ -60,14 +61,22 @@ restart:
 	skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
 	skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
 
+	assert(skel->rodata->nr_cpu_ids <= INT32_MAX);
+
 	while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
 			break;
-		case 'c':
-			skel->rodata->central_cpu = strtoul(optarg, NULL, 0);
+		case 'c': {
+			u32 central_cpu = strtoul(optarg, NULL, 0);
+			if (central_cpu >= skel->rodata->nr_cpu_ids) {
+				fprintf(stderr, "invalid central CPU id value, %u given (%u max)\n", central_cpu, skel->rodata->nr_cpu_ids);
+				return -1;
+			}
+			skel->rodata->central_cpu = (s32)central_cpu;
 			break;
+		}
 		case 'v':
 			verbose = true;
 			break;
@@ -96,7 +105,7 @@ restart:
 	 */
 	cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids);
 	SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
-	CPU_ZERO(cpuset);
+	CPU_ZERO_S(CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids), cpuset);
 	CPU_SET(skel->rodata->central_cpu, cpuset);
 	SCX_BUG_ON(sched_setaffinity(0, sizeof(*cpuset), cpuset),
 		   "Failed to affinitize to central CPU %d (max %d)",
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 3a20bb0c014a..26c40ca4f36c 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -231,7 +231,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	}
 
 	/* if select_cpu() wasn't called, try direct dispatch */
-	if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
+	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
 	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
 		__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
 		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
@@ -763,6 +763,8 @@ static void dump_shared_dsq(void)
 
 static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 {
+	struct scx_event_stats events;
+
 	bpf_rcu_read_lock();
 	dispatch_highpri(true);
 	bpf_rcu_read_unlock();
@@ -772,6 +774,25 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 	if (print_shared_dsq)
 		dump_shared_dsq();
 
+	__COMPAT_scx_bpf_events(&events, sizeof(events));
+
+	bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
+		   scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
+	bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
+		   scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
+	bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
+		   scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
+	bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
+		   scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
+	bpf_printk("%35s: %lld", "SCX_EV_ENQ_SLICE_DFL",
+		   scx_read_event(&events, SCX_EV_ENQ_SLICE_DFL));
+	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
+		   scx_read_event(&events, SCX_EV_BYPASS_DURATION));
+	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
+		   scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
+	bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
+		   scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
+
 	bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
 	return 0;
 }
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 011762224600..f4531327b8e7 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -172,6 +172,7 @@ auto-test-targets :=			\
 	maximal				\
 	maybe_null			\
 	minimal				\
+	numa				\
 	prog_run			\
 	reload_loop			\
 	select_cpu_dfl			\
diff --git a/tools/testing/selftests/sched_ext/numa.bpf.c b/tools/testing/selftests/sched_ext/numa.bpf.c
new file mode 100644
index 000000000000..a79d86ed54a1
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/numa.bpf.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates the behavior of the NUMA-aware
+ * functionalities.
+ *
+ * The scheduler creates a separate DSQ for each NUMA node, ensuring tasks
+ * are exclusively processed by CPUs within their respective nodes. Idle
+ * CPUs are selected only within the same node, so task migration can only
+ * occur between CPUs belonging to the same node.
+ *
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+const volatile unsigned int __COMPAT_SCX_PICK_IDLE_IN_NODE;
+
+static bool is_cpu_idle(s32 cpu, int node)
+{
+	const struct cpumask *idle_cpumask;
+	bool idle;
+
+	idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node);
+	idle = bpf_cpumask_test_cpu(cpu, idle_cpumask);
+	scx_bpf_put_cpumask(idle_cpumask);
+
+	return idle;
+}
+
+s32 BPF_STRUCT_OPS(numa_select_cpu,
+		   struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+	int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p));
+	s32 cpu;
+
+	/*
+	 * We could just use __COMPAT_scx_bpf_pick_any_cpu_node() here,
+	 * since it already tries to pick an idle CPU within the node
+	 * first, but let's use both functions for better testing coverage.
+	 */
+	cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
+						  __COMPAT_SCX_PICK_IDLE_IN_NODE);
+	if (cpu < 0)
+		cpu = __COMPAT_scx_bpf_pick_any_cpu_node(p->cpus_ptr, node,
+						__COMPAT_SCX_PICK_IDLE_IN_NODE);
+
+	if (is_cpu_idle(cpu, node))
+		scx_bpf_error("CPU %d should be marked as busy", cpu);
+
+	if (__COMPAT_scx_bpf_cpu_node(cpu) != node)
+		scx_bpf_error("CPU %d should be in node %d", cpu, node);
+
+	return cpu;
+}
+
+void BPF_STRUCT_OPS(numa_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p));
+
+	scx_bpf_dsq_insert(p, node, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(numa_dispatch, s32 cpu, struct task_struct *prev)
+{
+	int node = __COMPAT_scx_bpf_cpu_node(cpu);
+
+	scx_bpf_dsq_move_to_local(node);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init)
+{
+	int node, err;
+
+	bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
+		err = scx_bpf_create_dsq(node, node);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(numa_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops numa_ops = {
+	.select_cpu		= (void *)numa_select_cpu,
+	.enqueue		= (void *)numa_enqueue,
+	.dispatch		= (void *)numa_dispatch,
+	.init			= (void *)numa_init,
+	.exit			= (void *)numa_exit,
+	.name			= "numa",
+};
diff --git a/tools/testing/selftests/sched_ext/numa.c b/tools/testing/selftests/sched_ext/numa.c
new file mode 100644
index 000000000000..b060c3b65c82
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/numa.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "numa.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct numa *skel;
+
+	skel = numa__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+	SCX_ENUM_INIT(skel);
+	skel->rodata->__COMPAT_SCX_PICK_IDLE_IN_NODE = SCX_PICK_IDLE_IN_NODE;
+	skel->struct_ops.numa_ops->flags = SCX_OPS_BUILTIN_IDLE_PER_NODE;
+	SCX_FAIL_IF(numa__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct numa *skel = ctx;
+	struct bpf_link *link;
+
+	link = bpf_map__attach_struct_ops(skel->maps.numa_ops);
+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+	/* Just sleeping is fine, plenty of scheduling events happening */
+	sleep(1);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+	bpf_link__destroy(link);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct numa *skel = ctx;
+
+	numa__destroy(skel);
+}
+
+struct scx_test numa = {
+	.name = "numa",
+	.description = "Verify NUMA-aware functionalities",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&numa)
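
The per-node compat wrappers above compose with the existing pick-idle flags. As a minimal sketch (not part of this patch; the ops name and the fallback policy are illustrative), an ops.select_cpu() implementation could prefer a fully idle SMT core in the previous CPU's node via SCX_PICK_IDLE_CORE before settling for any idle CPU there:

	s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p,
			   s32 prev_cpu, u64 wake_flags)
	{
		int node = __COMPAT_scx_bpf_cpu_node(prev_cpu);
		s32 cpu;

		/* Prefer a fully idle SMT core in @prev_cpu's node... */
		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
							  SCX_PICK_IDLE_CORE);
		if (cpu >= 0)
			return cpu;

		/* ...then any idle CPU in the node... */
		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, 0);
		if (cpu >= 0)
			return cpu;

		/* ...otherwise keep the task where it last ran. */
		return prev_cpu;
	}

On kernels that predate the *_node() kfuncs, the __COMPAT_ wrappers transparently fall back to scx_bpf_pick_idle_cpu(), which ignores the node argument.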
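
Likewise, READ_ONCE_ARENA()/WRITE_ONCE_ARENA() take the element type explicitly instead of deriving it with typeof(), so they can be applied to objects reached through __arena pointers, where typeof() would carry the address-space qualifier into the temporary union. A hypothetical use (the stats array, its struct, and its setup are assumptions, not part of this patch):

	struct node_stats {
		u64 nr_enqueues;
	};

	/* assumed to point into a BPF arena, allocated elsewhere */
	struct node_stats __arena *stats;

	static void bump_enqueue_count(int node)
	{
		u64 v;

		/* explicit type in place of typeof(), which would drop __arena */
		v = READ_ONCE_ARENA(u64, stats[node].nr_enqueues);
		WRITE_ONCE_ARENA(u64, stats[node].nr_enqueues, v + 1);
	}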