diff options
Diffstat (limited to 'tools/sched_ext/scx_qmap.bpf.c')
-rw-r--r-- | tools/sched_ext/scx_qmap.bpf.c | 91 |
1 files changed, 87 insertions, 4 deletions
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 619078355bf5..c75c70d6a8eb 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -13,6 +13,7 @@ * - Sleepable per-task storage allocation using ops.prep_enable(). * - Using ops.cpu_release() to handle a higher priority scheduling class taking * the CPU away. + * - Core-sched support. * * This scheduler is primarily for demonstration and testing of sched_ext * features and unlikely to be useful for actual workloads. @@ -67,9 +68,21 @@ struct { }, }; +/* + * Per-queue sequence numbers to implement core-sched ordering. + * + * Tail seq is assigned to each queued task and incremented. Head seq tracks the + * sequence number of the latest dispatched task. The distance between the a + * task's seq and the associated queue's head seq is called the queue distance + * and used when comparing two tasks for ordering. See qmap_core_sched_before(). + */ +static u64 core_sched_head_seqs[5]; +static u64 core_sched_tail_seqs[5]; + /* Per-task scheduling context */ struct task_ctx { bool force_local; /* Dispatch directly to local_dsq */ + u64 core_sched_seq; }; struct { @@ -93,6 +106,7 @@ struct { /* Statistics */ u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; +u64 nr_core_sched_execed; s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) @@ -159,8 +173,18 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) return; } - /* Is select_cpu() is telling us to enqueue locally? */ - if (tctx->force_local) { + /* + * All enqueued tasks must have their core_sched_seq updated for correct + * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in + * qmap_ops.flags. + */ + tctx->core_sched_seq = core_sched_tail_seqs[idx]++; + + /* + * If qmap_select_cpu() is telling us to or this is the last runnable + * task on the CPU, enqueue locally. + */ + if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { tctx->force_local = false; scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; @@ -204,6 +228,19 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) { __sync_fetch_and_add(&nr_dequeued, 1); + if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) + __sync_fetch_and_add(&nr_core_sched_execed, 1); +} + +static void update_core_sched_head_seq(struct task_struct *p) +{ + struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + int idx = weight_to_idx(p->scx.weight); + + if (tctx) + core_sched_head_seqs[idx] = tctx->core_sched_seq; + else + scx_bpf_error("task_ctx lookup failed"); } void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) @@ -258,6 +295,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (!p) continue; + update_core_sched_head_seq(p); __sync_fetch_and_add(&nr_dispatched, 1); scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); bpf_task_release(p); @@ -275,6 +313,49 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) } } +/* + * The distance from the head of the queue scaled by the weight of the queue. + * The lower the number, the older the task and the higher the priority. + */ +static s64 task_qdist(struct task_struct *p) +{ + int idx = weight_to_idx(p->scx.weight); + struct task_ctx *tctx; + s64 qdist; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); + if (!tctx) { + scx_bpf_error("task_ctx lookup failed"); + return 0; + } + + qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; + + /* + * As queue index increments, the priority doubles. The queue w/ index 3 + * is dispatched twice more frequently than 2. Reflect the difference by + * scaling qdists accordingly. Note that the shift amount needs to be + * flipped depending on the sign to avoid flipping priority direction. + */ + if (qdist >= 0) + return qdist << (4 - idx); + else + return qdist << idx; +} + +/* + * This is called to determine the task ordering when core-sched is picking + * tasks to execute on SMT siblings and should encode about the same ordering as + * the regular scheduling path. Use the priority-scaled distances from the head + * of the queues to compare the two tasks which should be consistent with the + * dispatch path behavior. + */ +bool BPF_STRUCT_OPS(qmap_core_sched_before, + struct task_struct *a, struct task_struct *b) +{ + return task_qdist(a) > task_qdist(b); +} + void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) { u32 cnt; @@ -354,8 +435,8 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) return; - scx_bpf_dump("QMAP: force_local=%d", - taskc->force_local); + scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", + taskc->force_local, taskc->core_sched_seq); } /* @@ -428,6 +509,7 @@ SCX_OPS_DEFINE(qmap_ops, .enqueue = (void *)qmap_enqueue, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, + .core_sched_before = (void *)qmap_core_sched_before, .cpu_release = (void *)qmap_cpu_release, .init_task = (void *)qmap_init_task, .dump = (void *)qmap_dump, @@ -437,5 +519,6 @@ SCX_OPS_DEFINE(qmap_ops, .cpu_offline = (void *)qmap_cpu_offline, .init = (void *)qmap_init, .exit = (void *)qmap_exit, + .flags = SCX_OPS_ENQ_LAST, .timeout_ms = 5000U, .name = "qmap"); |