Diffstat (limited to 'tools/sched_ext')
-rw-r--r--  tools/sched_ext/include/scx/common.bpf.h      |  35
-rw-r--r--  tools/sched_ext/include/scx/compat.bpf.h      | 110
-rw-r--r--  tools/sched_ext/include/scx/user_exit_info.h  |   4
-rw-r--r--  tools/sched_ext/scx_central.bpf.c             |  14
-rw-r--r--  tools/sched_ext/scx_flatcg.bpf.c              |  48
-rw-r--r--  tools/sched_ext/scx_qmap.bpf.c                |  36
-rw-r--r--  tools/sched_ext/scx_show_state.py             |   4
-rw-r--r--  tools/sched_ext/scx_simple.bpf.c              |  16
8 files changed, 204 insertions, 63 deletions
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index f538c75db183..2f36b7b6418d 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -7,7 +7,13 @@ #ifndef __SCX_COMMON_BPF_H #define __SCX_COMMON_BPF_H +#ifdef LSP +#define __bpf__ +#include "../vmlinux/vmlinux.h" +#else #include "vmlinux.h" +#endif + #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include <asm-generic/errno.h> @@ -30,15 +36,15 @@ static inline void ___vmlinux_h_sanity_check___(void) s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; -void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; -void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; +void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; -bool scx_bpf_consume(u64 dsq_id) __ksym; -void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; -void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; -bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym; +void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; +void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; @@ -65,11 +71,11 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; bool scx_bpf_task_running(const struct task_struct *p) __ksym; s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; -struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; +struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; /* - * Use the following as @it__iter when calling - * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops. + * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from + * within bpf_for_each() loops. */ #define BPF_FOR_EACH_ITER (&___it) @@ -309,6 +315,15 @@ void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym; +u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; + +/* + * Access a cpumask in read-only mode (typically to check bits). 
+ */ +static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) +{ + return (const struct cpumask *)mask; +} /* rcu */ void bpf_rcu_read_lock(void) __ksym; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 3d2fe1208900..d56520100a26 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -15,6 +15,116 @@ __ret; \ }) +/* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ +#define __COMPAT_scx_bpf_task_cgroup(p) \ + (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ + scx_bpf_task_cgroup((p)) : NULL) + +/* + * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are + * renamed to unload the verb. + * + * Build error is triggered if old names are used. New binaries work with both + * new and old names. The compat macros will be removed on v6.15 release. + * + * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by + * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()"). + * Preserve __COMPAT macros until v6.15. + */ +void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; +bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; +bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; + +#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ + scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \ + scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags))) + +#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \ + scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \ + scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags))) + +#define scx_bpf_dsq_move_to_local(dsq_id) \ + (bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? \ + scx_bpf_dsq_move_to_local((dsq_id)) : \ + scx_bpf_consume___compat((dsq_id))) + +#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \ + scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \ + scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \ + (void)0)) + +#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \ + scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? \ + scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \ + (void) 0)) + +#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move) ? \ + scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? 
\ + scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ + false)) + +#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \ + scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \ + scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ + false)) + +#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ + _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") + +#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \ + _Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()") + +#define scx_bpf_consume(dsq_id) ({ \ + _Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \ + false; \ +}) + +#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ + _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()") + +#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ + _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()") + +#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ + _Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \ + false; \ +}) + +#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ + _Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \ + false; \ +}) + +#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ + _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()") + +#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ + _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()") + +#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ + _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \ + false; \ +}) + +#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ + _Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \ + false; \ +}) + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). 
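For reference, here is a minimal, hypothetical sketch (not part of the patch) of a scheduler built against these headers. It uses the new names directly; on kernels that predate the rename, the compat macros above transparently resolve scx_bpf_dsq_insert() and scx_bpf_dsq_move_to_local() to scx_bpf_dispatch() and scx_bpf_consume(). The example_* ops, MY_DSQ, and the 5ms slice are invented for illustration.

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

#define MY_DSQ 0					/* hypothetical custom DSQ */
static const u64 my_slice_ns = 5 * 1000 * 1000;		/* arbitrary 5ms slice */

s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
{
	/* create the custom DSQ on any NUMA node */
	return scx_bpf_create_dsq(MY_DSQ, -1);
}

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* older kernels execute scx_bpf_dispatch() via the compat macro */
	scx_bpf_dsq_insert(p, MY_DSQ, my_slice_ns, enq_flags);
}

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	/* older kernels execute scx_bpf_consume() via the compat macro */
	scx_bpf_dsq_move_to_local(MY_DSQ);
}

SCX_OPS_DEFINE(example_ops,
	       .enqueue		= (void *)example_enqueue,
	       .dispatch	= (void *)example_dispatch,
	       .init		= (void *)example_init,
	       .name		= "compat_example");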
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 891693ee604e..8ce2734402e1 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -25,7 +25,11 @@ struct user_exit_info { #ifdef __bpf__ +#ifdef LSP +#include "../vmlinux/vmlinux.h" +#else #include "vmlinux.h" +#endif #include <bpf/bpf_core_read.h> #define UEI_DEFINE(__name) \ diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 8dd8eb73b6b8..e6fad6211f6c 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -118,14 +118,14 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) */ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { __sync_fetch_and_add(&nr_locals, 1); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, - enq_flags | SCX_ENQ_PREEMPT); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); return; } if (bpf_map_push_elem(¢ral_q, &pid, 0)) { __sync_fetch_and_add(&nr_overflows, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); + scx_bpf_dsq_insert(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); return; } @@ -158,7 +158,7 @@ static bool dispatch_to_cpu(s32 cpu) */ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { __sync_fetch_and_add(&nr_mismatches, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); + scx_bpf_dsq_insert(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); /* * We might run out of dispatch buffer slots if we continue dispatching @@ -172,7 +172,7 @@ static bool dispatch_to_cpu(s32 cpu) } /* dispatch to local and mark that @cpu doesn't need more */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); if (cpu != central_cpu) scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); @@ -219,13 +219,13 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) } /* look for a task to run on the central CPU */ - if (scx_bpf_consume(FALLBACK_DSQ_ID)) + if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ_ID)) return; dispatch_to_cpu(central_cpu); } else { bool *gimme; - if (scx_bpf_consume(FALLBACK_DSQ_ID)) + if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ_ID)) return; gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 3ab2b60781a0..4e3afcd260bf 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -49,7 +49,10 @@ /* * Maximum amount of retries to find a valid cgroup. */ -#define CGROUP_MAX_RETRIES 1024 +enum { + FALLBACK_DSQ = 0, + CGROUP_MAX_RETRIES = 1024, +}; char _license[] SEC("license") = "GPL"; @@ -225,7 +228,7 @@ static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) break; /* - * We can be oppotunistic here and not grab the + * We can be opportunistic here and not grab the * cgv_tree_lock and deal with the occasional races. * However, hweight updates are already cached and * relatively low-frequency. Let's just do the @@ -258,8 +261,7 @@ static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) * and thus can't be updated and repositioned. Instead, we collect the * vtime deltas separately and apply it asynchronously here. 
*/ - delta = cgc->cvtime_delta; - __sync_fetch_and_sub(&cgc->cvtime_delta, delta); + delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta); cvtime = cgv_node->cvtime + delta; /* @@ -339,7 +341,7 @@ s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake if (is_idle) { set_bypassed_at(p, taskc); stat_inc(FCG_STAT_LOCAL); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } return cpu; @@ -375,21 +377,23 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) */ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { stat_inc(FCG_STAT_LOCAL); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + enq_flags); } else { stat_inc(FCG_STAT_GLOBAL); - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, FALLBACK_DSQ, SCX_SLICE_DFL, + enq_flags); } return; } - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (!cgc) goto out_release; if (fifo_sched) { - scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); } else { u64 tvtime = p->scx.dsq_vtime; @@ -400,8 +404,8 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) tvtime = cgc->tvtime_now - SCX_SLICE_DFL; - scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, - tvtime, enq_flags); + scx_bpf_dsq_insert_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, + tvtime, enq_flags); } cgrp_enqueued(cgrp, cgc); @@ -509,7 +513,7 @@ void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) { struct cgroup *cgrp; - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); update_active_weight_sums(cgrp, true); bpf_cgroup_release(cgrp); } @@ -522,7 +526,7 @@ void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) if (fifo_sched) return; - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (cgc) { /* @@ -565,7 +569,7 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) if (!taskc->bypassed_at) return; - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); cgc = find_cgrp_ctx(cgrp); if (cgc) { __sync_fetch_and_add(&cgc->cvtime_delta, @@ -579,7 +583,7 @@ void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) { struct cgroup *cgrp; - cgrp = scx_bpf_task_cgroup(p); + cgrp = __COMPAT_scx_bpf_task_cgroup(p); update_active_weight_sums(cgrp, false); bpf_cgroup_release(cgrp); } @@ -661,7 +665,7 @@ static bool try_pick_next_cgroup(u64 *cgidp) goto out_free; } - if (!scx_bpf_consume(cgid)) { + if (!scx_bpf_dsq_move_to_local(cgid)) { bpf_cgroup_release(cgrp); stat_inc(FCG_STAT_PNC_EMPTY); goto out_stash; @@ -741,7 +745,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) goto pick_next_cgroup; if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { - if (scx_bpf_consume(cpuc->cur_cgid)) { + if (scx_bpf_dsq_move_to_local(cpuc->cur_cgid)) { stat_inc(FCG_STAT_CNS_KEEP); return; } @@ -781,7 +785,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) pick_next_cgroup: cpuc->cur_at = now; - if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { + if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ)) { cpuc->cur_cgid = 0; return; } @@ -838,7 +842,7 @@ int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, int 
ret; /* - * Technically incorrect as cgroup ID is full 64bit while dq ID is + * Technically incorrect as cgroup ID is full 64bit while dsq ID is * 63bit. Should not be a problem in practice and easy to spot in the * unlikely case that it breaks. */ @@ -926,6 +930,11 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; } +s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init) +{ + return scx_bpf_create_dsq(FALLBACK_DSQ, -1); +} + void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) { UEI_RECORD(uei, ei); @@ -944,6 +953,7 @@ SCX_OPS_DEFINE(flatcg_ops, .cgroup_init = (void *)fcg_cgroup_init, .cgroup_exit = (void *)fcg_cgroup_exit, .cgroup_move = (void *)fcg_cgroup_move, + .init = (void *)fcg_init, .exit = (void *)fcg_exit, .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, .name = "flatcg"); diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 83c8f54c1e31..ee264947e0c3 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -226,15 +226,15 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) */ if (tctx->force_local) { tctx->force_local = false; - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; } - /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */ - if (!(enq_flags & SCX_ENQ_WAKEUP) && + /* if select_cpu() wasn't called, try direct dispatch */ + if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { __sync_fetch_and_add(&nr_ddsp_from_enq, 1); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); return; } @@ -247,7 +247,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) if (enq_flags & SCX_ENQ_REENQ) { s32 cpu; - scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags); + scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags); cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); @@ -262,7 +262,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ if (bpf_map_push_elem(ring, &pid, 0)) { - scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags); + scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags); return; } @@ -294,10 +294,10 @@ static void update_core_sched_head_seq(struct task_struct *p) } /* - * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly - * selective priority boosting mechanism by scanning SHARED_DSQ looking for - * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This - * makes minor difference only when dsp_batch is larger than 1. + * To demonstrate the use of scx_bpf_dsq_move(), implement silly selective + * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks, + * moving them to HIGHPRI_DSQ and then consuming them first. This makes minor + * difference only when dsp_batch is larger than 1. * * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and * non-rq-lock holding BPF programs. 
As demonstration, this function is called @@ -318,11 +318,11 @@ static bool dispatch_highpri(bool from_timer) if (tctx->highpri) { /* exercise the set_*() and vtime interface too */ - scx_bpf_dispatch_from_dsq_set_slice( + __COMPAT_scx_bpf_dsq_move_set_slice( BPF_FOR_EACH_ITER, slice_ns * 2); - scx_bpf_dispatch_from_dsq_set_vtime( + __COMPAT_scx_bpf_dsq_move_set_vtime( BPF_FOR_EACH_ITER, highpri_seq++); - scx_bpf_dispatch_vtime_from_dsq( + __COMPAT_scx_bpf_dsq_move_vtime( BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); } } @@ -340,7 +340,7 @@ static bool dispatch_highpri(bool from_timer) else cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); - if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, + if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu, SCX_ENQ_PREEMPT)) { if (cpu == this_cpu) { @@ -374,7 +374,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (dispatch_highpri(false)) return; - if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ)) + if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ)) return; if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { @@ -385,7 +385,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) */ p = bpf_task_from_pid(2); if (p) { - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); bpf_task_release(p); return; } @@ -431,7 +431,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) update_core_sched_head_seq(p); __sync_fetch_and_add(&nr_dispatched, 1); - scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); + scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0); bpf_task_release(p); batch--; @@ -439,7 +439,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (!batch || !scx_bpf_dispatch_nr_slots()) { if (dispatch_highpri(false)) return; - scx_bpf_consume(SHARED_DSQ); + scx_bpf_dsq_move_to_local(SHARED_DSQ); return; } if (!cpuc->dsp_cnt) diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py index 8bc626ede1c4..b800d4f5f2e9 100644 --- a/tools/sched_ext/scx_show_state.py +++ b/tools/sched_ext/scx_show_state.py @@ -35,6 +35,8 @@ print(f'enabled : {read_static_key("__scx_ops_enabled")}') print(f'switching_all : {read_int("scx_switching_all")}') print(f'switched_all : {read_static_key("__scx_switched_all")}') print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') -print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') +print(f'in_softlockup : {prog["scx_in_softlockup"].value_()}') +print(f'breather_depth: {read_atomic("scx_ops_breather_depth")}') +print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}') print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') print(f'enable_seq : {read_atomic("scx_enable_seq")}') diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index ed7e8d535fc5..31f915b286c6 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -31,10 +31,10 @@ UEI_DEFINE(uei); /* * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues - * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We + * (meaning, cannot be dispatched to with scx_bpf_dsq_insert_vtime()). We * therefore create a separate DSQ with ID 0 that we dispatch to and consume - * from. If scx_simple only supported global FIFO scheduling, then we could - * just use SCX_DSQ_GLOBAL. + * from. 
If scx_simple only supported global FIFO scheduling, then we could just + * use SCX_DSQ_GLOBAL. */ #define SHARED_DSQ 0 @@ -65,7 +65,7 @@ s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 w cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); if (is_idle) { stat_inc(0); /* count local queueing */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } return cpu; @@ -76,7 +76,7 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) stat_inc(1); /* count global queueing */ if (fifo_sched) { - scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); } else { u64 vtime = p->scx.dsq_vtime; @@ -87,14 +87,14 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) vtime = vtime_now - SCX_SLICE_DFL; - scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, - enq_flags); + scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, + enq_flags); } } void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_consume(SHARED_DSQ); + scx_bpf_dsq_move_to_local(SHARED_DSQ); } void BPF_STRUCT_OPS(simple_running, struct task_struct *p) |
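Closing the loop on the iterator-based kfuncs renamed above, a hedged sketch (again not from the patch) of how the __COMPAT_scx_bpf_dsq_move*() wrappers might be used to drain one user DSQ into another, mirroring the pattern in scx_qmap's dispatch_highpri(). The helper name and DSQ arguments are hypothetical; on v6.12 kernels the wrappers fall back to the old scx_bpf_dispatch_from_dsq*() kfuncs, and on kernels with neither the move simply returns false.

static void move_all_tasks(u64 src_dsq, u64 dst_dsq)
{
	struct task_struct *p;

	bpf_for_each(scx_dsq, p, src_dsq, 0) {
		/* optionally refresh the slice before moving the task */
		__COMPAT_scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER,
						    5 * 1000 * 1000);
		__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, dst_dsq, 0);
	}
}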