From b75aaea24c9fc776e5bd14df38147270a3c00450 Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Wed, 18 Feb 2026 09:32:15 +0100
Subject: [PATCH 001/134] sched_ext: Properly mark SCX-internal migrations via
 sticky_cpu

Reposition the setting and clearing of sticky_cpu to better define the
scope of SCX-internal migrations.

This ensures @sticky_cpu is set for the entire duration of an internal
migration (from dequeue through enqueue), making it a reliable indicator
that an SCX-internal migration is in progress. The dequeue and enqueue
paths can then use @sticky_cpu to identify internal migrations and skip
BPF scheduler notifications accordingly.

This prepares for a later commit fixing the ops.dequeue() semantics.
No functional change intended.

Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 62b1f3ac5630..87397781c1bf 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1476,9 +1476,6 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 
 	enq_flags |= rq->scx.extra_enq_flags;
 
-	if (sticky_cpu >= 0)
-		p->scx.sticky_cpu = -1;
-
 	/*
 	 * Restoring a running task will be immediately followed by
 	 * set_next_task_scx() which expects the task to not be on the BPF
@@ -1509,6 +1506,9 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
 		dl_server_start(&rq->ext_server);
 
 	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
+
+	if (sticky_cpu >= 0)
+		p->scx.sticky_cpu = -1;
 out:
 	rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
 
@@ -1670,10 +1670,13 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
 {
 	lockdep_assert_rq_held(src_rq);
 
-	/* the following marks @p MIGRATING which excludes dequeue */
+	/*
+	 * Set sticky_cpu before deactivate_task() to properly mark the
+	 * beginning of an SCX-internal migration.
+	 */
+	p->scx.sticky_cpu = cpu_of(dst_rq);
 	deactivate_task(src_rq, p, 0);
 	set_task_cpu(p, cpu_of(dst_rq));
-	p->scx.sticky_cpu = cpu_of(dst_rq);
 
 	raw_spin_rq_unlock(src_rq);
 	raw_spin_rq_lock(dst_rq);

From 482bb06f83ab26cc055835eb7d94d615520e9de9 Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Wed, 18 Feb 2026 09:32:16 +0100
Subject: [PATCH 002/134] sched_ext: Add rq parameter to dispatch_enqueue()

This prepares for a later commit fixing the ops.dequeue() semantics.
No functional change intended.

Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 87397781c1bf..044bb2168dd0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1010,8 +1010,9 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p
 		resched_curr(rq);
 }
 
-static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
-			     struct task_struct *p, u64 enq_flags)
+static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
+			     struct scx_dispatch_q *dsq, struct task_struct *p,
+			     u64 enq_flags)
 {
 	bool is_local = dsq->id == SCX_DSQ_LOCAL;
 
@@ -1325,7 +1326,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
 		return;
 	}
 
-	dispatch_enqueue(sch, dsq, p,
+	dispatch_enqueue(sch, rq, dsq, p,
 			 p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
 }
 
@@ -1415,7 +1416,7 @@ direct:
 	direct_dispatch(sch, p, enq_flags);
 	return;
 local_norefill:
-	dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
+	dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, enq_flags);
 	return;
 local:
 	dsq = &rq->scx.local_dsq;
@@ -1435,7 +1436,7 @@ enqueue:
 	 */
 	touch_core_sched(rq, p);
 	refill_task_slice_dfl(sch, p);
-	dispatch_enqueue(sch, dsq, p, enq_flags);
+	dispatch_enqueue(sch, rq, dsq, p, enq_flags);
 }
 
 static bool task_runnable(const struct task_struct *p)
@@ -1888,7 +1889,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
 		dispatch_dequeue_locked(p, src_dsq);
 		raw_spin_unlock(&src_dsq->lock);
 
-		dispatch_enqueue(sch, dst_dsq, p, enq_flags);
+		dispatch_enqueue(sch, dst_rq, dst_dsq, p, enq_flags);
 	}
 
 	return dst_rq;
@@ -1978,14 +1979,14 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
 	 * If dispatching to @rq that @p is already on, no lock dancing needed.
 	 */
 	if (rq == src_rq && rq == dst_rq) {
-		dispatch_enqueue(sch, dst_dsq, p,
+		dispatch_enqueue(sch, rq, dst_dsq, p,
 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
 		return;
 	}
 
 	if (src_rq != dst_rq &&
 	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
-		dispatch_enqueue(sch, find_global_dsq(sch, p), p,
+		dispatch_enqueue(sch, rq, find_global_dsq(sch, p), p,
 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
 		return;
 	}
@@ -2023,7 +2024,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
 		 */
 		if (src_rq == dst_rq) {
 			p->scx.holding_cpu = -1;
-			dispatch_enqueue(sch, &dst_rq->scx.local_dsq, p,
+			dispatch_enqueue(sch, dst_rq, &dst_rq->scx.local_dsq, p,
 					 enq_flags);
 		} else {
 			move_remote_task_to_local_dsq(p, enq_flags,
@@ -2122,7 +2123,7 @@ retry:
 	if (dsq->id == SCX_DSQ_LOCAL)
 		dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
 	else
-		dispatch_enqueue(sch, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
+		dispatch_enqueue(sch, rq, dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
 }
 
 static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
@@ -2423,7 +2424,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 		 * DSQ.
 		 */
 		if (p->scx.slice && !scx_rq_bypassing(rq)) {
-			dispatch_enqueue(sch, &rq->scx.local_dsq, p,
+			dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p,
 					 SCX_ENQ_HEAD);
 			goto switch_class;
 		}
@@ -3954,7 +3955,7 @@ resume:
 		 * between bypass DSQs.
 		 */
 		dispatch_dequeue_locked(p, donor_dsq);
-		dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+		dispatch_enqueue(sch, donee_rq, donee_dsq, p, SCX_ENQ_NESTED);
 
 		/*
 		 * $donee might have been idle and need to be woken up. No need

From ebf1ccff79c43f860cbd2f9d6cfab9a462d0cb2d Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Wed, 18 Feb 2026 09:32:17 +0100
Subject: [PATCH 003/134] sched_ext: Fix ops.dequeue() semantics

Currently, ops.dequeue() is only invoked when the sched_ext core knows
that a task resides in BPF-managed data structures, which causes it to
miss scheduling property change events. In addition, ops.dequeue()
callbacks are completely skipped when tasks are dispatched to non-local
DSQs from ops.select_cpu(). As a result, BPF schedulers cannot reliably
track task state.

Fix this by guaranteeing that each task entering the BPF scheduler's
custody triggers exactly one ops.dequeue() call when it leaves that
custody, whether the exit is due to a dispatch (regular or via a core
scheduling pick) or to a scheduling property change (e.g.
sched_setaffinity(), sched_setscheduler(), set_user_nice(), NUMA
balancing, etc.).

BPF scheduler custody concept: a task is considered to be in the BPF
scheduler's custody when the scheduler is responsible for managing its
lifecycle. This includes tasks dispatched to user-created DSQs or stored
in the BPF scheduler's internal data structures from ops.enqueue().
Custody ends when the task is dispatched to a terminal DSQ (such as the
local DSQ or %SCX_DSQ_GLOBAL), selected by core scheduling, or removed
due to a property change.

Tasks directly dispatched to terminal DSQs bypass the BPF scheduler
entirely and are never in its custody. Terminal DSQs include:
 - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON): per-CPU queues
   where tasks go directly to execution.
 - Global DSQ (%SCX_DSQ_GLOBAL): the built-in fallback queue where the
   BPF scheduler is considered "done" with the task.

As a result, ops.dequeue() is not invoked for tasks directly dispatched
to terminal DSQs.

To identify dequeues triggered by scheduling property changes, introduce
the new ops.dequeue() flag %SCX_DEQ_SCHED_CHANGE: when this flag is set,
the dequeue was caused by a scheduling property change.

New ops.dequeue() semantics:
 - ops.dequeue() is invoked exactly once when the task leaves the BPF
   scheduler's custody, in one of the following cases:
   a) regular dispatch: a task dispatched to a user DSQ or stored in
      internal BPF data structures is moved to a terminal DSQ
      (ops.dequeue() called without any special flags set),
   b) core scheduling dispatch: core-sched picks task before dispatch
      (ops.dequeue() called with %SCX_DEQ_CORE_SCHED_EXEC flag set),
   c) property change: task properties modified before dispatch,
      (ops.dequeue() called with %SCX_DEQ_SCHED_CHANGE flag set).

This allows BPF schedulers to:
 - reliably track task ownership and lifecycle,
 - maintain accurate accounting of managed tasks,
 - update internal state when tasks change properties.

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/scheduler/sched-ext.rst         |  78 +++++++++++--
 include/linux/sched/ext.h                     |   1 +
 kernel/sched/ext.c                            | 110 ++++++++++++++++--
 kernel/sched/ext_internal.h                   |   7 ++
 .../sched_ext/include/scx/enum_defs.autogen.h |   1 +
 .../sched_ext/include/scx/enums.autogen.bpf.h |   2 +
 tools/sched_ext/include/scx/enums.autogen.h   |   1 +
 7 files changed, 184 insertions(+), 16 deletions(-)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 9e2882d937b4..7cb77fd2e4d7 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -228,16 +228,23 @@ The following briefly shows how a waking task is scheduled and executed.
    scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper,
    using ``ops.select_cpu()`` judiciously can be simpler and more efficient.
 
-   A task can be immediately inserted into a DSQ from ``ops.select_cpu()``
-   by calling ``scx_bpf_dsq_insert()``. If the task is inserted into
-   ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be inserted into the
-   local DSQ of whichever CPU is returned from ``ops.select_cpu()``.
-   Additionally, inserting directly from ``ops.select_cpu()`` will cause the
-   ``ops.enqueue()`` callback to be skipped.
-
    Note that the scheduler core will ignore an invalid CPU selection, for
    example, if it's outside the allowed cpumask of the task.
 
+   A task can be immediately inserted into a DSQ from ``ops.select_cpu()``
+   by calling ``scx_bpf_dsq_insert()`` or ``scx_bpf_dsq_insert_vtime()``.
+
+   If the task is inserted into ``SCX_DSQ_LOCAL`` from
+   ``ops.select_cpu()``, it will be added to the local DSQ of whichever CPU
+   is returned from ``ops.select_cpu()``. Additionally, inserting directly
+   from ``ops.select_cpu()`` will cause the ``ops.enqueue()`` callback to
+   be skipped.
+
+   Any other attempt to store a task in BPF-internal data structures from
+   ``ops.select_cpu()`` does not prevent ``ops.enqueue()`` from being
+   invoked. This is discouraged, as it can introduce racy behavior or
+   inconsistent state.
+
 2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the
    task was inserted directly from ``ops.select_cpu()``). ``ops.enqueue()``
    can make one of the following decisions:
@@ -251,6 +258,61 @@ The following briefly shows how a waking task is scheduled and executed.
 
    * Queue the task on the BPF side.
 
+   **Task State Tracking and ops.dequeue() Semantics**
+
+   A task is in the "BPF scheduler's custody" when the BPF scheduler is
+   responsible for managing its lifecycle. A task enters custody when it is
+   dispatched to a user DSQ or stored in the BPF scheduler's internal data
+   structures. Custody is entered only from ``ops.enqueue()`` for those
+   operations. The only exception is dispatching to a user DSQ from
+   ``ops.select_cpu()``: although the task is not yet technically in BPF
+   scheduler custody at that point, the dispatch has the same semantic
+   effect as dispatching from ``ops.enqueue()`` for custody-related
+   purposes.
+
+   Once ``ops.enqueue()`` is called, the task may or may not enter custody
+   depending on what the scheduler does:
+
+   * **Directly dispatched to terminal DSQs** (``SCX_DSQ_LOCAL``,
+     ``SCX_DSQ_LOCAL_ON | cpu``, or ``SCX_DSQ_GLOBAL``): the BPF scheduler
+     is done with the task - it either goes straight to a CPU's local run
+     queue or to the global DSQ as a fallback. The task never enters (or
+     exits) BPF custody, and ``ops.dequeue()`` will not be called.
+
+   * **Dispatch to user-created DSQs** (custom DSQs): the task enters the
+     BPF scheduler's custody. When the task later leaves BPF custody
+     (dispatched to a terminal DSQ, picked by core-sched, or dequeued for
+     sleep/property changes), ``ops.dequeue()`` will be called exactly
+     once.
+
+   * **Stored in BPF data structures** (e.g., internal BPF queues): the
+     task is in BPF custody. ``ops.dequeue()`` will be called when it
+     leaves (e.g., when ``ops.dispatch()`` moves it to a terminal DSQ, or
+     on property change / sleep).
+
+   When a task leaves BPF scheduler custody, ``ops.dequeue()`` is invoked.
+   The dequeue can happen for different reasons, distinguished by flags:
+
+   1. **Regular dispatch**: when a task in BPF custody is dispatched to a
+      terminal DSQ from ``ops.dispatch()`` (leaving BPF custody for
+      execution), ``ops.dequeue()`` is triggered without any special flags.
+
+   2. **Core scheduling pick**: when ``CONFIG_SCHED_CORE`` is enabled and
+      core scheduling picks a task for execution while it's still in BPF
+      custody, ``ops.dequeue()`` is called with the
+      ``SCX_DEQ_CORE_SCHED_EXEC`` flag.
+
+   3. **Scheduling property change**: when a task property changes (via
+      operations like ``sched_setaffinity()``, ``sched_setscheduler()``,
+      priority changes, CPU migrations, etc.) while the task is still in
+      BPF custody, ``ops.dequeue()`` is called with the
+      ``SCX_DEQ_SCHED_CHANGE`` flag set in ``deq_flags``.
+
+   **Important**: Once a task has left BPF custody (e.g., after being
+   dispatched to a terminal DSQ), property changes will not trigger
+   ``ops.dequeue()``, since the task is no longer managed by the BPF
+   scheduler.
+
 3. When a CPU is ready to schedule, it first looks at its local DSQ. If
    empty, it then looks at the global DSQ. If there still isn't a task to
    run, ``ops.dispatch()`` is invoked which can use the following two
@@ -318,6 +380,8 @@ by a sched_ext scheduler:
                 /* Any usable CPU becomes available */
 
                 ops.dispatch(); /* Task is moved to a local DSQ */
+
+                ops.dequeue(); /* Exiting BPF scheduler */
             }
             ops.running();      /* Task starts running on its assigned CPU */
             while (task->scx.slice > 0 && task is runnable)
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index bcb962d5ee7d..4601e5ecb43c 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -84,6 +84,7 @@ struct scx_dispatch_q {
 /* scx_entity.flags */
 enum scx_ent_flags {
 	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
+	SCX_TASK_IN_CUSTODY	= 1 << 1, /* in custody, needs ops.dequeue() when leaving */
 	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
 	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 044bb2168dd0..d5e688b9acc0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -986,12 +986,45 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
 	__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
 }
 
+/*
+ * Return true if @p is moving due to an internal SCX migration, false
+ * otherwise.
+ */
+static inline bool task_scx_migrating(struct task_struct *p)
+{
+	/*
+	 * We only need to check sticky_cpu: it is set to the destination
+	 * CPU in move_remote_task_to_local_dsq() before deactivate_task()
+	 * and cleared when the task is enqueued on the destination, so it
+	 * is only non-negative during an internal SCX migration.
+	 */
+	return p->scx.sticky_cpu >= 0;
+}
+
+/*
+ * Call ops.dequeue() if the task is in BPF custody and not migrating.
+ * Clears %SCX_TASK_IN_CUSTODY when the callback is invoked.
+ */
+static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
+			      struct task_struct *p, u64 deq_flags)
+{
+	if (!(p->scx.flags & SCX_TASK_IN_CUSTODY) || task_scx_migrating(p))
+		return;
+
+	if (SCX_HAS_OP(sch, dequeue))
+		SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, p, deq_flags);
+
+	p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
+}
+
 static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p,
 			       u64 enq_flags)
 {
 	struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
 	bool preempt = false;
 
+	call_task_dequeue(scx_root, rq, p, 0);
+
 	/*
 	 * If @rq is in balance, the CPU is already vacant and looking for the
 	 * next task to run. No need to preempt or trigger resched after moving
@@ -1115,17 +1148,34 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
 	p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
 	p->scx.ddsp_enq_flags = 0;
 
+	/*
+	 * Update custody and call ops.dequeue() before clearing ops_state:
+	 * once ops_state is cleared, waiters in ops_dequeue() can proceed
+	 * and dequeue_task_scx() will RMW p->scx.flags. If we clear
+	 * ops_state first, both sides would modify p->scx.flags
+	 * concurrently in a non-atomic way.
+	 */
+	if (is_local) {
+		local_dsq_post_enq(dsq, p, enq_flags);
+	} else {
+		/*
+		 * Task on global/bypass DSQ: leave custody, task on
+		 * non-terminal DSQ: enter custody.
+		 */
+		if (dsq->id == SCX_DSQ_GLOBAL || dsq->id == SCX_DSQ_BYPASS)
+			call_task_dequeue(sch, rq, p, 0);
+		else
+			p->scx.flags |= SCX_TASK_IN_CUSTODY;
+
+		raw_spin_unlock(&dsq->lock);
+	}
+
 	/*
 	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
 	 * match waiters' load_acquire.
 	 */
 	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
 		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
-
-	if (is_local)
-		local_dsq_post_enq(dsq, p, enq_flags);
-	else
-		raw_spin_unlock(&dsq->lock);
 }
 
 static void task_unlink_from_dsq(struct task_struct *p,
@@ -1405,6 +1455,12 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
 		goto direct;
 
+	/*
+	 * Task is now in BPF scheduler's custody. Set %SCX_TASK_IN_CUSTODY
+	 * so ops.dequeue() is called when it leaves custody.
+	 */
+	p->scx.flags |= SCX_TASK_IN_CUSTODY;
+
 	/*
 	 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
 	 * dequeue may be waiting. The store_release matches their load_acquire.
@@ -1522,6 +1578,14 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 {
 	struct scx_sched *sch = scx_root;
 	unsigned long opss;
+	u64 op_deq_flags = deq_flags;
+
+	/*
+	 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property
+	 * change (not sleep or core-sched pick).
+	 */
+	if (!(op_deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC)))
+		op_deq_flags |= SCX_DEQ_SCHED_CHANGE;
 
 	/* dequeue is always temporary, don't reset runnable_at */
 	clr_task_runnable(p, false);
@@ -1539,10 +1603,8 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 		 */
 		BUG();
 	case SCX_OPSS_QUEUED:
-		if (SCX_HAS_OP(sch, dequeue))
-			SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq,
-					 p, deq_flags);
-
+		/* A queued task must always be in BPF scheduler's custody */
+		WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_IN_CUSTODY));
 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
 					    SCX_OPSS_NONE))
 			break;
@@ -1565,6 +1627,22 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 		BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
 		break;
 	}
+
+	/*
+	 * Call ops.dequeue() if the task is still in BPF custody.
+	 *
+	 * The code that clears ops_state to %SCX_OPSS_NONE does not always
+	 * clear %SCX_TASK_IN_CUSTODY: in dispatch_to_local_dsq(), when
+	 * we're moving a task that was in %SCX_OPSS_DISPATCHING to a
+	 * remote CPU's local DSQ, we only set ops_state to %SCX_OPSS_NONE
+	 * so that a concurrent dequeue can proceed, but we clear
+	 * %SCX_TASK_IN_CUSTODY only when we later enqueue or move the
+	 * task. So we can see NONE + IN_CUSTODY here and we must handle
+	 * it. Similarly, after waiting on %SCX_OPSS_DISPATCHING we see
+	 * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until
+	 * it is enqueued on the destination.
+	 */
+	call_task_dequeue(sch, rq, p, op_deq_flags);
 }
 
 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
@@ -2935,6 +3013,13 @@ static void scx_enable_task(struct task_struct *p)
 
 	lockdep_assert_rq_held(rq);
 
+	/*
+	 * Verify the task is not in BPF scheduler's custody. If flag
+	 * transitions are consistent, the flag should always be clear
+	 * here.
+	 */
+	WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
+
 	/*
 	 * Set the weight before calling ops.enable() so that the scheduler
 	 * doesn't see a stale value if they inspect the task struct.
@@ -2966,6 +3051,13 @@ static void scx_disable_task(struct task_struct *p)
 	if (SCX_HAS_OP(sch, disable))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
 	scx_set_task_state(p, SCX_TASK_READY);
+
+	/*
+	 * Verify the task is not in BPF scheduler's custody. If flag
+	 * transitions are consistent, the flag should always be clear
+	 * here.
+	 */
+	WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
 }
 
 static void scx_exit_task(struct task_struct *p)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 386c677e4c9a..befa9a5d6e53 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -982,6 +982,13 @@ enum scx_deq_flags {
 	 * it hasn't been dispatched yet. Dequeue from the BPF side.
 	 */
 	SCX_DEQ_CORE_SCHED_EXEC	= 1LLU << 32,
+
+	/*
+	 * The task is being dequeued due to a property change (e.g.,
+	 * sched_setaffinity(), sched_setscheduler(), set_user_nice(),
+	 * etc.).
+	 */
+	SCX_DEQ_SCHED_CHANGE	= 1LLU << 33,
 };
 
 enum scx_pick_idle_cpu_flags {
diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h
index c2c33df9292c..dcc945304760 100644
--- a/tools/sched_ext/include/scx/enum_defs.autogen.h
+++ b/tools/sched_ext/include/scx/enum_defs.autogen.h
@@ -21,6 +21,7 @@
 #define HAVE_SCX_CPU_PREEMPT_UNKNOWN
 #define HAVE_SCX_DEQ_SLEEP
 #define HAVE_SCX_DEQ_CORE_SCHED_EXEC
+#define HAVE_SCX_DEQ_SCHED_CHANGE
 #define HAVE_SCX_DSQ_FLAG_BUILTIN
 #define HAVE_SCX_DSQ_FLAG_LOCAL_ON
 #define HAVE_SCX_DSQ_INVALID
diff --git a/tools/sched_ext/include/scx/enums.autogen.bpf.h b/tools/sched_ext/include/scx/enums.autogen.bpf.h
index 2f8002bcc19a..5da50f937684 100644
--- a/tools/sched_ext/include/scx/enums.autogen.bpf.h
+++ b/tools/sched_ext/include/scx/enums.autogen.bpf.h
@@ -127,3 +127,5 @@ const volatile u64 __SCX_ENQ_CLEAR_OPSS __weak;
 const volatile u64 __SCX_ENQ_DSQ_PRIQ __weak;
 #define SCX_ENQ_DSQ_PRIQ __SCX_ENQ_DSQ_PRIQ
 
+const volatile u64 __SCX_DEQ_SCHED_CHANGE __weak;
+#define SCX_DEQ_SCHED_CHANGE __SCX_DEQ_SCHED_CHANGE
diff --git a/tools/sched_ext/include/scx/enums.autogen.h b/tools/sched_ext/include/scx/enums.autogen.h
index fedec938584b..fc9a7a4d9dea 100644
--- a/tools/sched_ext/include/scx/enums.autogen.h
+++ b/tools/sched_ext/include/scx/enums.autogen.h
@@ -46,4 +46,5 @@
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_LAST); \
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_CLEAR_OPSS); \
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_DSQ_PRIQ); \
+	SCX_ENUM_SET(skel, scx_deq_flags, SCX_DEQ_SCHED_CHANGE); \
 } while (0)

From 658ad2259b3e95aea21e548f7ca3440f620bf95f Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Sat, 21 Feb 2026 21:02:20 +0100
Subject: [PATCH 004/134] selftests/sched_ext: Add test to validate
 ops.dequeue() semantics

Add a new kselftest to validate that the new ops.dequeue() semantics
work correctly for all task lifecycle scenarios, including the
distinction between terminal DSQs (where BPF scheduler is done with the
task), user DSQs (where BPF scheduler manages the task lifecycle) and
BPF data structures, regardless of which event performs the dispatch.

The test validates the following scenarios:

 - From ops.select_cpu():
     - scenario 0 (local DSQ): tasks dispatched to the local DSQ bypass
       the BPF scheduler entirely; they never enter BPF custody, so
       ops.dequeue() is not called,
     - scenario 1 (global DSQ): tasks dispatched to SCX_DSQ_GLOBAL also
       bypass the BPF scheduler, like the local DSQ; ops.dequeue() is
       not called,
     - scenario 2 (user DSQ): tasks dispatched to user DSQs from
       ops.select_cpu(): tasks enter BPF scheduler's custody with full
       enqueue/dequeue lifecycle tracking and state machine validation,
       expects 1:1 enqueue/dequeue pairing,

   - From ops.enqueue():
     - scenario 3 (local DSQ): same behavior as scenario 0,
     - scenario 4 (global DSQ): same behavior as scenario 1,
     - scenario 5 (user DSQ): same behavior as scenario 2,
     - scenario 6 (BPF internal queue): tasks are stored in a BPF queue
       from ops.enqueue() and consumed from ops.dispatch(); similarly to
       scenario 5, tasks enter BPF scheduler's custody with full
       lifecycle tracking and 1:1 enqueue/dequeue validation.

This verifies that:
 - terminal DSQ dispatch (local, global) don't trigger ops.dequeue(),
 - tasks dispatched to user DSQs, either from ops.select_cpu() or
   ops.enqueue(), enter BPF scheduler's custody and have exact 1:1
   enqueue/dequeue pairing,
 - tasks stored to internal BPF data structures from ops.enqueue() enter
   BPF scheduler's custody and have exact 1:1 enqueue/dequeue pairing,
 - dispatch dequeues have no flags (normal workflow),
 - property change dequeues have the %SCX_DEQ_SCHED_CHANGE flag set,
 - no duplicate enqueues or invalid state transitions are happening.

Cc: Tejun Heo <tj@kernel.org>
Cc: Emil Tsalapatis <emil@etsalapatis.com>
Cc: Kuba Piecuch <jpiecuch@google.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/Makefile    |   1 +
 .../testing/selftests/sched_ext/dequeue.bpf.c | 389 ++++++++++++++++++
 tools/testing/selftests/sched_ext/dequeue.c   | 274 ++++++++++++
 3 files changed, 664 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dequeue.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 2c601a7eaff5..2815a875bde2 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -161,6 +161,7 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
 
 auto-test-targets :=			\
 	create_dsq			\
+	dequeue				\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
new file mode 100644
index 000000000000..597b88563d7d
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates ops.dequeue() is called correctly:
+ * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
+ *   scheduler entirely: no ops.dequeue() should be called
+ * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody:
+ *   ops.dequeue() must be called when they leave custody
+ * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by
+ *   exactly one ops.dequeue() (validate 1:1 pairing and state machine)
+ *
+ * Copyright (c) 2026 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+#define SHARED_DSQ	0
+
+/*
+ * BPF internal queue.
+ *
+ * Tasks are stored here and consumed from ops.dispatch(), validating that
+ * tasks on BPF internal structures still get ops.dequeue() when they
+ * leave.
+ */
+struct {
+	__uint(type, BPF_MAP_TYPE_QUEUE);
+	__uint(max_entries, 32768);
+	__type(value, s32);
+} global_queue SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/*
+ * Counters to track the lifecycle of tasks:
+ * - enqueue_cnt: Number of times ops.enqueue() was called
+ * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
+ * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
+ * - change_dequeue_cnt: Number of property change dequeues
+ * - bpf_queue_full: Number of times the BPF internal queue was full
+ */
+u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full;
+
+/*
+ * Test scenarios:
+ * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
+ *    dequeue callbacks expected)
+ * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ *    scheduler, no dequeue callbacks)
+ * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
+ *    dequeue callbacks expected)
+ * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(),
+ *    consume in ops.dispatch() and dispatch to local DSQ (validates dequeue
+ *    for tasks stored in internal BPF data structures)
+ */
+u32 test_scenario;
+
+/*
+ * Per-task state to track lifecycle and validate workflow semantics.
+ * State transitions:
+ *   NONE -> ENQUEUED (on enqueue)
+ *   NONE -> DISPATCHED (on direct dispatch to terminal DSQ)
+ *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
+ *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
+ *   ENQUEUED -> NONE (on property change dequeue before dispatch)
+ */
+enum task_state {
+	TASK_NONE = 0,
+	TASK_ENQUEUED,
+	TASK_DISPATCHED,
+};
+
+struct task_ctx {
+	enum task_state state; /* Current state in the workflow */
+	u64 enqueue_seq;       /* Sequence number for debugging */
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+	__uint(map_flags, BPF_F_NO_PREALLOC);
+	__type(key, int);
+	__type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
+{
+	return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
+		   s32 prev_cpu, u64 wake_flags)
+{
+	struct task_ctx *tctx;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return prev_cpu;
+
+	switch (test_scenario) {
+	case 0:
+		/*
+		 * Direct dispatch to the local DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no ops.dequeue() callbacks.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+		tctx->state = TASK_DISPATCHED;
+		break;
+	case 1:
+		/*
+		 * Direct dispatch to the global DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no ops.dequeue() callbacks.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+		tctx->state = TASK_DISPATCHED;
+		break;
+	case 2:
+		/*
+		 * Dispatch to a shared user DSQ.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
+
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+		break;
+	}
+
+	return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	struct task_ctx *tctx;
+	s32 pid = p->pid;
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	switch (test_scenario) {
+	case 3:
+		/*
+		 * Direct dispatch to the local DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no ops.dequeue() callbacks.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+		tctx->state = TASK_DISPATCHED;
+		break;
+	case 4:
+		/*
+		 * Direct dispatch to the global DSQ.
+		 *
+		 * Task bypasses BPF scheduler entirely: no enqueue
+		 * tracking, no ops.dequeue() callbacks.
+		 */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+		tctx->state = TASK_DISPATCHED;
+		break;
+	case 5:
+		/*
+		 * Dispatch to shared user DSQ.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+
+		__sync_fetch_and_add(&enqueue_cnt, 1);
+
+		tctx->state = TASK_ENQUEUED;
+		tctx->enqueue_seq++;
+		break;
+	case 6:
+		/*
+		 * Store task in BPF internal queue.
+		 *
+		 * Task enters BPF scheduler management: track
+		 * enqueue/dequeue lifecycle and validate state
+		 * transitions.
+		 */
+		if (tctx->state == TASK_ENQUEUED)
+			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+				      p->pid, p->comm, tctx->enqueue_seq);
+
+		if (bpf_map_push_elem(&global_queue, &pid, 0)) {
+			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+			__sync_fetch_and_add(&bpf_queue_full, 1);
+
+			tctx->state = TASK_DISPATCHED;
+		} else {
+			__sync_fetch_and_add(&enqueue_cnt, 1);
+
+			tctx->state = TASK_ENQUEUED;
+			tctx->enqueue_seq++;
+		}
+		break;
+	default:
+		/* For all other scenarios, dispatch to the global DSQ */
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+		tctx->state = TASK_DISPATCHED;
+		break;
+	}
+
+	scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
+}
+
+void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
+{
+	struct task_ctx *tctx;
+
+	__sync_fetch_and_add(&dequeue_cnt, 1);
+
+	tctx = try_lookup_task_ctx(p);
+	if (!tctx)
+		return;
+
+	/*
+	 * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
+	 * ops.dequeue() should never be called because tasks bypass the
+	 * BPF scheduler entirely. If we get here, it's a kernel bug.
+	 */
+	if (test_scenario == 0 || test_scenario == 3) {
+		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario",
+			      p->pid, p->comm);
+		return;
+	}
+
+	if (test_scenario == 1 || test_scenario == 4) {
+		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario",
+			      p->pid, p->comm);
+		return;
+	}
+
+	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
+		/*
+		 * Property change interrupting the workflow. Valid from
+		 * both ENQUEUED and DISPATCHED states. Transitions task
+		 * back to NONE state.
+		 */
+		__sync_fetch_and_add(&change_dequeue_cnt, 1);
+
+		/* Validate state transition */
+		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
+			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		/*
+		 * Transition back to NONE: task outside scheduler control.
+		 *
+		 * Scenario 6: dispatch() checks tctx->state after popping a
+		 * PID, if the task is in state NONE, it was dequeued by
+		 * property change and must not be dispatched (this
+		 * prevents "target CPU not allowed").
+		 */
+		tctx->state = TASK_NONE;
+	} else {
+		/*
+		 * Regular dispatch dequeue: kernel is moving the task from
+		 * BPF custody to a terminal DSQ. Normally we come from
+		 * ENQUEUED state. We can also see TASK_NONE if the task
+		 * was dequeued by property change (SCX_DEQ_SCHED_CHANGE)
+		 * while it was already on a DSQ (dispatched but not yet
+		 * consumed); in that case we just leave state as NONE.
+		 */
+		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
+
+		/*
+		 * Must be ENQUEUED (normal path) or NONE (already dequeued
+		 * by property change while on a DSQ).
+		 */
+		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_NONE)
+			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
+				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+		if (tctx->state == TASK_ENQUEUED)
+			tctx->state = TASK_DISPATCHED;
+
+		/* NONE: leave as-is, task was already property-change dequeued */
+	}
+}
+
+void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
+{
+	if (test_scenario == 6) {
+		struct task_ctx *tctx;
+		struct task_struct *p;
+		s32 pid;
+
+		if (bpf_map_pop_elem(&global_queue, &pid))
+			return;
+
+		p = bpf_task_from_pid(pid);
+		if (!p)
+			return;
+
+		/*
+		 * If the task was dequeued by property change
+		 * (ops.dequeue() set tctx->state = TASK_NONE), skip
+		 * dispatch.
+		 */
+		tctx = try_lookup_task_ctx(p);
+		if (!tctx || tctx->state == TASK_NONE) {
+			bpf_task_release(p);
+			return;
+		}
+
+		/*
+		 * Dispatch to this CPU's local DSQ if allowed, otherwise
+		 * fallback to the global DSQ.
+		 */
+		if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
+			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
+		else
+			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+
+		bpf_task_release(p);
+	} else {
+		scx_bpf_dsq_move_to_local(SHARED_DSQ);
+	}
+}
+
+s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
+		   struct scx_init_task_args *args)
+{
+	struct task_ctx *tctx;
+
+	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+				   BPF_LOCAL_STORAGE_GET_F_CREATE);
+	if (!tctx)
+		return -ENOMEM;
+
+	return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
+{
+	s32 ret;
+
+	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dequeue_ops = {
+	.select_cpu		= (void *)dequeue_select_cpu,
+	.enqueue		= (void *)dequeue_enqueue,
+	.dequeue		= (void *)dequeue_dequeue,
+	.dispatch		= (void *)dequeue_dispatch,
+	.init_task		= (void *)dequeue_init_task,
+	.init			= (void *)dequeue_init,
+	.exit			= (void *)dequeue_exit,
+	.flags			= SCX_OPS_ENQ_LAST,
+	.name			= "dequeue_test",
+};
diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
new file mode 100644
index 000000000000..4e93262703ca
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <time.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <pthread.h>
+#include "scx_test.h"
+#include "dequeue.bpf.skel.h"
+
+#define NUM_WORKERS 8
+#define AFFINITY_HAMMER_MS 500
+
+/*
+ * Worker function that creates enqueue/dequeue events via CPU work and
+ * sleep.
+ */
+static void worker_fn(int id)
+{
+	int i;
+	volatile int sum = 0;
+
+	for (i = 0; i < 1000; i++) {
+		volatile int j;
+
+		/* Do some work to trigger scheduling events */
+		for (j = 0; j < 10000; j++)
+			sum += j;
+
+		/* Sleep to trigger dequeue */
+		usleep(1000 + (id * 100));
+	}
+
+	exit(0);
+}
+
+/*
+ * This thread changes workers' affinity from outside so that some changes
+ * hit tasks while they are still in the scheduler's queue and trigger
+ * property-change dequeues.
+ */
+static void *affinity_hammer_fn(void *arg)
+{
+	pid_t *pids = arg;
+	cpu_set_t cpuset;
+	int i = 0, n = NUM_WORKERS;
+	struct timespec start, now;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	while (1) {
+		int w = i % n;
+		int cpu = (i / n) % 4;
+
+		CPU_ZERO(&cpuset);
+		CPU_SET(cpu, &cpuset);
+		sched_setaffinity(pids[w], sizeof(cpuset), &cpuset);
+		i++;
+
+		/* Check elapsed time every 256 iterations to limit gettime cost */
+		if ((i & 255) == 0) {
+			long long elapsed_ms;
+
+			clock_gettime(CLOCK_MONOTONIC, &now);
+			elapsed_ms = (now.tv_sec - start.tv_sec) * 1000LL +
+				     (now.tv_nsec - start.tv_nsec) / 1000000;
+			if (elapsed_ms >= AFFINITY_HAMMER_MS)
+				break;
+		}
+	}
+	return NULL;
+}
+
+static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
+					 const char *scenario_name)
+{
+	struct bpf_link *link;
+	pid_t pids[NUM_WORKERS];
+	pthread_t hammer;
+
+	int i, status;
+	u64 enq_start, deq_start,
+	    dispatch_deq_start, change_deq_start, bpf_queue_full_start;
+	u64 enq_delta, deq_delta,
+	    dispatch_deq_delta, change_deq_delta, bpf_queue_full_delta;
+
+	/* Set the test scenario */
+	skel->bss->test_scenario = scenario;
+
+	/* Record starting counts */
+	enq_start = skel->bss->enqueue_cnt;
+	deq_start = skel->bss->dequeue_cnt;
+	dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
+	change_deq_start = skel->bss->change_dequeue_cnt;
+	bpf_queue_full_start = skel->bss->bpf_queue_full;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
+	SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
+
+	/* Fork worker processes to generate enqueue/dequeue events */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		pids[i] = fork();
+		SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
+
+		if (pids[i] == 0) {
+			worker_fn(i);
+			/* Should not reach here */
+			exit(1);
+		}
+	}
+
+	/*
+	 * Run an "affinity hammer" so that some property changes hit tasks
+	 * while they are still in BPF custody (e.g., in user DSQ or BPF
+	 * queue), triggering SCX_DEQ_SCHED_CHANGE dequeues.
+	 */
+	SCX_FAIL_IF(pthread_create(&hammer, NULL, affinity_hammer_fn, pids) != 0,
+		    "Failed to create affinity hammer thread");
+	pthread_join(hammer, NULL);
+
+	/* Wait for all workers to complete */
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+			    "Failed to wait for worker %d", i);
+		SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
+	}
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
+	/* Calculate deltas */
+	enq_delta = skel->bss->enqueue_cnt - enq_start;
+	deq_delta = skel->bss->dequeue_cnt - deq_start;
+	dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
+	change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
+	bpf_queue_full_delta = skel->bss->bpf_queue_full - bpf_queue_full_start;
+
+	printf("%s:\n", scenario_name);
+	printf("  enqueues: %lu\n", (unsigned long)enq_delta);
+	printf("  dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
+	       (unsigned long)deq_delta,
+	       (unsigned long)dispatch_deq_delta,
+	       (unsigned long)change_deq_delta);
+	printf("  BPF queue full: %lu\n", (unsigned long)bpf_queue_full_delta);
+
+	/*
+	 * Validate enqueue/dequeue lifecycle tracking.
+	 *
+	 * For scenarios 0, 1, 3, 4 (local and global DSQs from
+	 * ops.select_cpu() and ops.enqueue()), both enqueues and dequeues
+	 * should be 0 because tasks bypass the BPF scheduler entirely:
+	 * tasks never enter BPF scheduler's custody.
+	 *
+	 * For scenarios 2, 5, 6 (user DSQ or BPF internal queue) we expect
+	 * both enqueues and dequeues.
+	 *
+	 * The BPF code does strict state machine validation with
+	 * scx_bpf_error() to ensure the workflow semantics are correct.
+	 *
+	 * If we reach this point without errors, the semantics are
+	 * validated correctly.
+	 */
+	if (scenario == 0 || scenario == 1 ||
+	    scenario == 3 || scenario == 4) {
+		/* Tasks bypass BPF scheduler completely */
+		SCX_EQ(enq_delta, 0);
+		SCX_EQ(deq_delta, 0);
+		SCX_EQ(dispatch_deq_delta, 0);
+		SCX_EQ(change_deq_delta, 0);
+	} else {
+		/*
+		 * User DSQ from ops.enqueue() or ops.select_cpu(): tasks
+		 * enter BPF scheduler's custody.
+		 *
+		 * Also validate 1:1 enqueue/dequeue pairing.
+		 */
+		SCX_GT(enq_delta, 0);
+		SCX_GT(deq_delta, 0);
+		SCX_EQ(enq_delta, deq_delta);
+	}
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dequeue *skel;
+
+	skel = dequeue__open();
+	SCX_FAIL_IF(!skel, "Failed to open skel");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dequeue *skel = ctx;
+	enum scx_test_status status;
+
+	status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.select_cpu()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	status = run_scenario(skel, 6, "Scenario 6: BPF queue from ops.enqueue()");
+	if (status != SCX_TEST_PASS)
+		return status;
+
+	printf("\n=== Summary ===\n");
+	printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
+	printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
+	printf("  Dispatch dequeues: %lu (no flag, normal workflow)\n",
+	       (unsigned long)skel->bss->dispatch_dequeue_cnt);
+	printf("  Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
+	       (unsigned long)skel->bss->change_dequeue_cnt);
+	printf("  BPF queue full: %lu\n",
+	       (unsigned long)skel->bss->bpf_queue_full);
+	printf("\nAll scenarios passed - no state machine violations detected\n");
+	printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler\n");
+	printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler\n");
+	printf("-> Validated: User DSQ dispatch triggers ops.dequeue() callbacks\n");
+	printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
+	printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n");
+	printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dequeue *skel = ctx;
+
+	dequeue__destroy(skel);
+}
+
+struct scx_test dequeue_test = {
+	.name = "dequeue",
+	.description = "Verify ops.dequeue() semantics",
+	.setup = setup,
+	.run = run,
+	.cleanup = cleanup,
+};
+
+REGISTER_SCX_TEST(&dequeue_test)

From 477174ac35c510d0ed3043f5bd4fba25546a21ce Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Tue, 24 Feb 2026 05:56:37 +0000
Subject: [PATCH 005/134] sched_ext: Optimize sched_ext_entity layout for cache
 locality

Reorder struct sched_ext_entity to place ops_state, ddsp_dsq_id, and
ddsp_enq_flags immediately after dsq. These fields are accessed together
in the do_enqueue_task() and finish_dispatch() hot paths but were
previously spread across three different cache lines. Grouping them on
the same cache line reduces cache misses on every enqueue and dispatch
operation.

Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/sched/ext.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 4601e5ecb43c..0150b3fe6230 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -163,6 +163,9 @@ struct scx_dsq_list_node {
  */
 struct sched_ext_entity {
 	struct scx_dispatch_q	*dsq;
+	atomic_long_t		ops_state;
+	u64			ddsp_dsq_id;
+	u64			ddsp_enq_flags;
 	struct scx_dsq_list_node dsq_list;	/* dispatch order */
 	struct rb_node		dsq_priq;	/* p->scx.dsq_vtime order */
 	u32			dsq_seq;
@@ -174,7 +177,6 @@ struct sched_ext_entity {
 	s32			selected_cpu;
 	u32			kf_mask;	/* see scx_kf_mask above */
 	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
-	atomic_long_t		ops_state;
 
 	struct list_head	runnable_node;	/* rq->scx.runnable_list */
 	unsigned long		runnable_at;
@@ -182,8 +184,6 @@ struct sched_ext_entity {
 #ifdef CONFIG_SCHED_CORE
 	u64			core_sched_at;	/* see scx_prio_less() */
 #endif
-	u64			ddsp_dsq_id;
-	u64			ddsp_enq_flags;
 
 	/* BPF scheduler modifiable fields */
 

From b0e4c2f8a0f0a60d10c427db4080181060014cac Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:02 -1000
Subject: [PATCH 006/134] sched_ext: Implement cgroup subtree iteration for
 scx_task_iter

For the planned cgroup sub-scheduler support, enable/disable operations are
going to be subtree specific and iterating all tasks in the system for those
operations can be unnecessarily expensive and disruptive.

cgroup already has mechanisms to perform subtree task iterations. Implement
cgroup subtree iteration for scx_task_iter:

- Add optional @cgrp to scx_task_iter_start() which enables cgroup subtree
  iteration.

- Make scx_task_iter use css_next_descendant_pre() and css_task_iter to
  iterate all tasks in the cgroup subtree.

- Update all existing callers to pass NULL to maintain current behavior.

The two iteration mechanisms are independent and duplicate. It's likely that
scx_tasks can be removed in favor of always using cgroup iteration if
CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 64 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8d48a1385835..1ff4cf93413b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -514,14 +514,31 @@ struct scx_task_iter {
 	struct rq_flags			rf;
 	u32				cnt;
 	bool				list_locked;
+#ifdef CONFIG_CGROUPS
+	struct cgroup			*cgrp;
+	struct cgroup_subsys_state	*css_pos;
+	struct css_task_iter		css_iter;
+#endif
 };
 
 /**
  * scx_task_iter_start - Lock scx_tasks_lock and start a task iteration
  * @iter: iterator to init
+ * @cgrp: Optional root of cgroup subhierarchy to iterate
  *
- * Initialize @iter and return with scx_tasks_lock held. Once initialized, @iter
- * must eventually be stopped with scx_task_iter_stop().
+ * Initialize @iter. Once initialized, @iter must eventually be stopped with
+ * scx_task_iter_stop().
+ *
+ * If @cgrp is %NULL, scx_tasks is used for iteration and this function returns
+ * with scx_tasks_lock held and @iter->cursor inserted into scx_tasks.
+ *
+ * If @cgrp is not %NULL, @cgrp and its descendants' tasks are walked using
+ * @iter->css_iter. The caller must be holding cgroup_lock() to prevent cgroup
+ * task migrations.
+ *
+ * The two modes of iterations are largely independent and it's likely that
+ * scx_tasks can be removed in favor of always using cgroup iteration if
+ * CONFIG_SCHED_CLASS_EXT depends on CONFIG_CGROUPS.
  *
  * scx_tasks_lock and the rq lock may be released using scx_task_iter_unlock()
  * between this and the first next() call or between any two next() calls. If
@@ -532,10 +549,19 @@ struct scx_task_iter {
  * All tasks which existed when the iteration started are guaranteed to be
  * visited as long as they are not dead.
  */
-static void scx_task_iter_start(struct scx_task_iter *iter)
+static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
 {
 	memset(iter, 0, sizeof(*iter));
 
+#ifdef CONFIG_CGROUPS
+	if (cgrp) {
+		lockdep_assert_held(&cgroup_mutex);
+		iter->cgrp = cgrp;
+		iter->css_pos = css_next_descendant_pre(NULL, &iter->cgrp->self);
+		css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+		return;
+	}
+#endif
 	raw_spin_lock_irq(&scx_tasks_lock);
 
 	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
@@ -588,6 +614,14 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
  */
 static void scx_task_iter_stop(struct scx_task_iter *iter)
 {
+#ifdef CONFIG_CGROUPS
+	if (iter->cgrp) {
+		if (iter->css_pos)
+			css_task_iter_end(&iter->css_iter);
+		__scx_task_iter_rq_unlock(iter);
+		return;
+	}
+#endif
 	__scx_task_iter_maybe_relock(iter);
 	list_del_init(&iter->cursor.tasks_node);
 	scx_task_iter_unlock(iter);
@@ -611,6 +645,24 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
 		cond_resched();
 	}
 
+#ifdef CONFIG_CGROUPS
+	if (iter->cgrp) {
+		while (iter->css_pos) {
+			struct task_struct *p;
+
+			p = css_task_iter_next(&iter->css_iter);
+			if (p)
+				return p;
+
+			css_task_iter_end(&iter->css_iter);
+			iter->css_pos = css_next_descendant_pre(iter->css_pos,
+								&iter->cgrp->self);
+			if (iter->css_pos)
+				css_task_iter_start(iter->css_pos, 0, &iter->css_iter);
+		}
+		return NULL;
+	}
+#endif
 	__scx_task_iter_maybe_relock(iter);
 
 	list_for_each_entry(pos, cursor, tasks_node) {
@@ -4440,7 +4492,7 @@ static void scx_disable_workfn(struct kthread_work *work)
 
 	scx_init_task_enabled = false;
 
-	scx_task_iter_start(&sti);
+	scx_task_iter_start(&sti, NULL);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
@@ -5230,7 +5282,7 @@ static void scx_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_disable_unlock_all;
 
-	scx_task_iter_start(&sti);
+	scx_task_iter_start(&sti, NULL);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		/*
 		 * @p may already be dead, have lost all its usages counts and
@@ -5272,7 +5324,7 @@ static void scx_enable_workfn(struct kthread_work *work)
 	 * scx_tasks_lock.
 	 */
 	percpu_down_write(&scx_fork_rwsem);
-	scx_task_iter_start(&sti);
+	scx_task_iter_start(&sti, NULL);
 	while ((p = scx_task_iter_next_locked(&sti))) {
 		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
 		const struct sched_class *old_class = p->sched_class;

From e3715e397720c9f6068aff4f7babccd3cb6ef618 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:02 -1000
Subject: [PATCH 007/134] sched_ext: Add @kargs to scx_fork()

Make sched_cgroup_fork() pass @kargs to scx_fork(). This will be used to
determine @p's cgroup for cgroup sub-sched support.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/core.c | 2 +-
 kernel/sched/ext.c  | 2 +-
 kernel/sched/ext.h  | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7f77c165a6e..dfe680e78be3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4721,7 +4721,7 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 		p->sched_class->task_fork(p);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
-	return scx_fork(p);
+	return scx_fork(p, kargs);
 }
 
 void sched_cancel_fork(struct task_struct *p)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1ff4cf93413b..6f98a979391f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3171,7 +3171,7 @@ void scx_pre_fork(struct task_struct *p)
 	percpu_down_read(&scx_fork_rwsem);
 }
 
-int scx_fork(struct task_struct *p)
+int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 {
 	percpu_rwsem_assert_held(&scx_fork_rwsem);
 
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 43429b33e52c..0b7fc46aee08 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -11,7 +11,7 @@
 void scx_tick(struct rq *rq);
 void init_scx_entity(struct sched_ext_entity *scx);
 void scx_pre_fork(struct task_struct *p);
-int scx_fork(struct task_struct *p);
+int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs);
 void scx_post_fork(struct task_struct *p);
 void scx_cancel_fork(struct task_struct *p);
 bool scx_can_stop_tick(struct rq *rq);
@@ -44,7 +44,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 
 static inline void scx_tick(struct rq *rq) {}
 static inline void scx_pre_fork(struct task_struct *p) {}
-static inline int scx_fork(struct task_struct *p) { return 0; }
+static inline int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; }
 static inline void scx_post_fork(struct task_struct *p) {}
 static inline void scx_cancel_fork(struct task_struct *p) {}
 static inline u32 scx_cpuperf_target(s32 cpu) { return 0; }

From 19d0e98c20f079352f7c9098338520e09086e5ab Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:02 -1000
Subject: [PATCH 008/134] sched/core: Swap the order between sched_post_fork()
 and cgroup_post_fork()

The planned sched_ext cgroup sub-scheduler support needs the newly forked
task to be associated with its cgroup in its post_fork() hook. There is no
existing ordering requirement between the two now. Swap them and note the
new ordering requirement.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Cc: Ingo Molnar <mingo@redhat.com>
---
 kernel/fork.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..4759b6214df4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2463,8 +2463,12 @@ __latent_entropy struct task_struct *copy_process(
 		fd_install(pidfd, pidfile);
 
 	proc_fork_connector(p);
-	sched_post_fork(p);
+	/*
+	 * sched_ext needs @p to be associated with its cgroup in its post_fork
+	 * hook. cgroup_post_fork() should come before sched_post_fork().
+	 */
 	cgroup_post_fork(p, args);
+	sched_post_fork(p);
 	perf_event_fork(p);
 
 	trace_task_newtask(p, clone_flags);

From 0454a604b98a9bf301e82860cd216ec4ac563668 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:02 -1000
Subject: [PATCH 009/134] sched_ext: Update p->scx.disallow warning in
 scx_init_task()

- Always trigger the warning if p->scx.disallow is set for fork inits. There
  is no reason to set it during forks.

- Flip the positions of if/else arms to ease adding error conditions.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6f98a979391f..d42a22f5a098 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3032,7 +3032,10 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
 	scx_set_task_state(p, SCX_TASK_INIT);
 
 	if (p->scx.disallow) {
-		if (!fork) {
+		if (unlikely(fork)) {
+			scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
+				  p->comm, p->pid);
+		} else {
 			struct rq *rq;
 			struct rq_flags rf;
 
@@ -3051,9 +3054,6 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
 			}
 
 			task_rq_unlock(rq, p, &rf);
-		} else if (p->policy == SCHED_EXT) {
-			scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
-				  p->comm, p->pid);
 		}
 	}
 

From dbd542a8fac7bcfba91e353f2a522e1bf2fbee27 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:02 -1000
Subject: [PATCH 010/134] sched_ext: Reorganize enable/disable path for
 multi-scheduler support

In preparation for multiple scheduler support, reorganize the enable and
disable paths to make scheduler instances explicit. Extract
scx_root_disable() from scx_disable_workfn(). Rename scx_enable_workfn()
to scx_root_enable_workfn(). Change scx_disable() to take @sch parameter
and only queue disable_work if scx_claim_exit() succeeds for consistency.
Move exit_kind validation into scx_claim_exit(). The sysrq handler now
prints a message when no scheduler is loaded.

These changes don't materially affect user-visible behavior.

v2: Keep scx_enable() name as-is and only rename the workfn to
    scx_root_enable_workfn(). Change scx_enable() return type to s32.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 80 +++++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 36 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d42a22f5a098..142845bcddaa 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3267,8 +3267,8 @@ void sched_ext_dead(struct task_struct *p)
 	raw_spin_unlock_irqrestore(&scx_tasks_lock, flags);
 
 	/*
-	 * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED
-	 * transitions can't race us. Disable ops for @p.
+	 * @p is off scx_tasks and wholly ours. scx_root_enable()'s READY ->
+	 * ENABLED transitions can't race us. Disable ops for @p.
 	 */
 	if (scx_get_task_state(p) != SCX_TASK_NONE) {
 		struct rq_flags rf;
@@ -4430,24 +4430,12 @@ static void free_kick_syncs(void)
 	}
 }
 
-static void scx_disable_workfn(struct kthread_work *work)
+static void scx_root_disable(struct scx_sched *sch)
 {
-	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
 	struct scx_exit_info *ei = sch->exit_info;
 	struct scx_task_iter sti;
 	struct task_struct *p;
-	int kind, cpu;
-
-	kind = atomic_read(&sch->exit_kind);
-	while (true) {
-		if (kind == SCX_EXIT_DONE)	/* already disabled? */
-			return;
-		WARN_ON_ONCE(kind == SCX_EXIT_NONE);
-		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
-			break;
-	}
-	ei->kind = kind;
-	ei->reason = scx_exit_reason(ei->kind);
+	int cpu;
 
 	/* guarantee forward progress by bypassing scx_ops */
 	scx_bypass(true);
@@ -4591,6 +4579,9 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
 
 	lockdep_assert_preemption_disabled();
 
+	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
+		kind = SCX_EXIT_ERROR;
+
 	if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind))
 		return false;
 
@@ -4603,21 +4594,31 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
 	return true;
 }
 
-static void scx_disable(enum scx_exit_kind kind)
+static void scx_disable_workfn(struct kthread_work *work)
 {
-	struct scx_sched *sch;
+	struct scx_sched *sch = container_of(work, struct scx_sched, disable_work);
+	struct scx_exit_info *ei = sch->exit_info;
+	int kind;
 
-	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
-		kind = SCX_EXIT_ERROR;
-
-	rcu_read_lock();
-	sch = rcu_dereference(scx_root);
-	if (sch) {
-		guard(preempt)();
-		scx_claim_exit(sch, kind);
-		kthread_queue_work(sch->helper, &sch->disable_work);
+	kind = atomic_read(&sch->exit_kind);
+	while (true) {
+		if (kind == SCX_EXIT_DONE)	/* already disabled? */
+			return;
+		WARN_ON_ONCE(kind == SCX_EXIT_NONE);
+		if (atomic_try_cmpxchg(&sch->exit_kind, &kind, SCX_EXIT_DONE))
+			break;
 	}
-	rcu_read_unlock();
+	ei->kind = kind;
+	ei->reason = scx_exit_reason(ei->kind);
+
+	scx_root_disable(sch);
+}
+
+static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
+{
+	guard(preempt)();
+	if (scx_claim_exit(sch, kind))
+		kthread_queue_work(sch->helper, &sch->disable_work);
 }
 
 static void dump_newline(struct seq_buf *s)
@@ -5135,10 +5136,9 @@ struct scx_enable_cmd {
 	int			ret;
 };
 
-static void scx_enable_workfn(struct kthread_work *work)
+static void scx_root_enable_workfn(struct kthread_work *work)
 {
-	struct scx_enable_cmd *cmd =
-		container_of(work, struct scx_enable_cmd, work);
+	struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
 	struct sched_ext_ops *ops = cmd->ops;
 	struct scx_sched *sch;
 	struct scx_task_iter sti;
@@ -5387,12 +5387,12 @@ err_disable:
 	 * Flush scx_disable_work to ensure that error is reported before init
 	 * completion. sch's base reference will be put by bpf_scx_unreg().
 	 */
-	scx_error(sch, "scx_enable() failed (%d)", ret);
+	scx_error(sch, "scx_root_enable() failed (%d)", ret);
 	kthread_flush_work(&sch->disable_work);
 	cmd->ret = 0;
 }
 
-static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
+static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 {
 	static struct kthread_worker *helper;
 	static DEFINE_MUTEX(helper_mutex);
@@ -5418,7 +5418,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		mutex_unlock(&helper_mutex);
 	}
 
-	kthread_init_work(&cmd.work, scx_enable_workfn);
+	kthread_init_work(&cmd.work, scx_root_enable_workfn);
 	cmd.ops = ops;
 
 	kthread_queue_work(READ_ONCE(helper), &cmd.work);
@@ -5561,7 +5561,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
 	struct sched_ext_ops *ops = kdata;
 	struct scx_sched *sch = ops->priv;
 
-	scx_disable(SCX_EXIT_UNREG);
+	scx_disable(sch, SCX_EXIT_UNREG);
 	kthread_flush_work(&sch->disable_work);
 	kobject_put(&sch->kobj);
 }
@@ -5689,7 +5689,15 @@ static struct bpf_struct_ops bpf_sched_ext_ops = {
 
 static void sysrq_handle_sched_ext_reset(u8 key)
 {
-	scx_disable(SCX_EXIT_SYSRQ);
+	struct scx_sched *sch;
+
+	rcu_read_lock();
+	sch = rcu_dereference(scx_root);
+	if (likely(sch))
+		scx_disable(sch, SCX_EXIT_SYSRQ);
+	else
+		pr_info("sched_ext: BPF schedulers not loaded\n");
+	rcu_read_unlock();
 }
 
 static const struct sysrq_key_op sysrq_sched_ext_reset_op = {

From ebeca1f930eac8f11f815d58eb38fa5d07e7c16e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 011/134] sched_ext: Introduce cgroup sub-sched support

A system often runs multiple workloads especially in multi-tenant server
environments where a system is split into partitions servicing separate
more-or-less independent workloads each requiring an application-specific
scheduler. To support such and other use cases, sched_ext is in the process
of growing multiple scheduler support.

When partitioning a system in terms of CPUs for such use cases, an
oft-taken approach is hard partitioning the system using cpuset. While it
would be possible to tie sched_ext multiple scheduler support to cpuset
partitions, such an approach would have fundamental limitations stemming
from the lack of dynamism and flexibility.

Users often don't care which specific CPUs are assigned to which workload
and want to take advantage of optimizations which are enabled by running
workloads on a larger machine - e.g. opportunistic over-commit, improving
latency critical workload characteristics while maintaining bandwidth
fairness, employing control mechanisms based on different criteria than
on-CPU time for e.g. flexible memory bandwidth isolation, packing similar
parts from different workloads on same L3s to improve cache efficiency,
and so on.

As this sort of dynamic behaviors are impossible or difficult to implement
with hard partitioning, sched_ext is implementing cgroup sub-sched support
where schedulers can be attached to the cgroup hierarchy and a parent
scheduler is responsible for controlling the CPUs that each child can use
at any given moment. This makes CPU distribution dynamically controlled by
BPF allowing high flexibility.

This patch adds the skeletal sched_ext cgroup sub-sched support:

- sched_ext_ops.sub_cgroup_id and .sub_attach/detach() are added. Non-zero
  sub_cgroup_id indicates that the scheduler is to be attached to the
  identified cgroup. A sub-sched is attached to the cgroup iff the nearest
  ancestor scheduler implements .sub_attach() and grants the attachment. Max
  nesting depth is limited by SCX_SUB_MAX_DEPTH.

- When a scheduler exits, all its descendant schedulers are exited
  together. Also, cgroup.scx_sched added which points to the effective
  scheduler instance for the cgroup. This is updated on scheduler
  init/exit and inherited on cgroup online. When a cgroup is offlined, the
  attached scheduler is automatically exited.

- Sub-sched support is gated on CONFIG_EXT_SUB_SCHED which is
  automatically enabled if both SCX and cgroups are enabled. Sub-sched
  support is not tied to the CPU controller but rather the cgroup
  hierarchy itself. This is intentional as the support for cpu.weight and
  cpu.max based resource control is orthogonal to sub-sched support. Note
  that CONFIG_CGROUPS around cgroup subtree iteration support for
  scx_task_iter is replaced with CONFIG_EXT_SUB_SCHED for consistency.

- This allows loading sub-scheds and most framework operations such as
  propagating disable down the hierarchy work. However, sub-scheds are not
  operational yet and all tasks stay with the root sched. This will serve
  as the basis for building up full sub-sched support.

- DSQs point to the scx_sched they belong to.

- scx_qmap is updated to allow attachment of sub-scheds and also serving
  as sub-scheds.

- scx_is_descendant() is added but not yet used in this patch. It is used by
  later changes in the series and placed here as this is where the function
  belongs.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/cgroup-defs.h    |   4 +
 include/linux/sched/ext.h      |   3 +
 init/Kconfig                   |   4 +
 kernel/sched/ext.c             | 532 +++++++++++++++++++++++++++++++--
 kernel/sched/ext_internal.h    |  67 ++++-
 tools/sched_ext/scx_qmap.bpf.c |   9 +-
 tools/sched_ext/scx_qmap.c     |  13 +-
 7 files changed, 596 insertions(+), 36 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index bb92f5c169ca..dd61767cf9bb 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -17,6 +17,7 @@
 #include <linux/refcount.h>
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
+#include <linux/sched.h>
 #include <linux/u64_stats_sync.h>
 #include <linux/workqueue.h>
 #include <linux/bpf-cgroup-defs.h>
@@ -624,6 +625,9 @@ struct cgroup {
 #ifdef CONFIG_BPF_SYSCALL
 	struct bpf_local_storage __rcu  *bpf_cgrp_storage;
 #endif
+#ifdef CONFIG_EXT_SUB_SCHED
+	struct scx_sched __rcu *scx_sched;
+#endif
 
 	/* All ancestors including self */
 	union {
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 0150b3fe6230..fa4349b319e6 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -78,6 +78,7 @@ struct scx_dispatch_q {
 	u64			id;
 	struct rhash_head	hash_node;
 	struct llist_node	free_node;
+	struct scx_sched	*sched;
 	struct rcu_head		rcu;
 };
 
@@ -157,6 +158,8 @@ struct scx_dsq_list_node {
 		.priv = (__priv),						\
 	}
 
+struct scx_sched;
+
 /*
  * The following is embedded in task_struct and contains all fields necessary
  * for a task to be scheduled by SCX.
diff --git a/init/Kconfig b/init/Kconfig
index b55deae9256c..06abd8e272cb 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1176,6 +1176,10 @@ config EXT_GROUP_SCHED
 
 endif #CGROUP_SCHED
 
+config EXT_SUB_SCHED
+        def_bool y
+        depends on SCHED_CLASS_EXT
+
 config SCHED_MM_CID
 	def_bool y
 	depends on SMP && RSEQ
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 142845bcddaa..bb3e33b660da 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9,6 +9,8 @@
 #include <linux/btf_ids.h>
 #include "ext_idle.h"
 
+static DEFINE_RAW_SPINLOCK(scx_sched_lock);
+
 /*
  * NOTE: sched_ext is in the process of growing multiple scheduler support and
  * scx_root usage is in a transitional state. Naked dereferences are safe if the
@@ -19,6 +21,12 @@
  */
 static struct scx_sched __rcu *scx_root;
 
+/*
+ * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock.
+ * Readers can hold either or rcu_read_lock().
+ */
+static LIST_HEAD(scx_sched_all);
+
 /*
  * During exit, a task may schedule after losing its PIDs. When disabling the
  * BPF scheduler, we need to be able to iterate tasks in every state to
@@ -197,6 +205,7 @@ static void process_ddsp_deferred_locals(struct rq *rq);
 static bool task_dead_and_done(struct task_struct *p);
 static u32 reenq_local(struct rq *rq);
 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
+static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
 		      s64 exit_code, const char *fmt, va_list args);
 
@@ -245,6 +254,88 @@ static bool u32_before(u32 a, u32 b)
 	return (s32)(a - b) < 0;
 }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_parent - Find the parent sched
+ * @sch: sched to find the parent of
+ *
+ * Returns the parent scheduler or %NULL if @sch is root.
+ */
+static struct scx_sched *scx_parent(struct scx_sched *sch)
+{
+	if (sch->level)
+		return sch->ancestors[sch->level - 1];
+	else
+		return NULL;
+}
+
+/**
+ * scx_next_descendant_pre - find the next descendant for pre-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: sched whose descendants to walk
+ *
+ * To be used by scx_for_each_descendant_pre(). Find the next descendant to
+ * visit for pre-order traversal of @root's descendants. @root is included in
+ * the iteration and the first node to be visited.
+ */
+static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
+						 struct scx_sched *root)
+{
+	struct scx_sched *next;
+
+	lockdep_assert(lockdep_is_held(&scx_enable_mutex) ||
+		       lockdep_is_held(&scx_sched_lock));
+
+	/* if first iteration, visit @root */
+	if (!pos)
+		return root;
+
+	/* visit the first child if exists */
+	next = list_first_entry_or_null(&pos->children, struct scx_sched, sibling);
+	if (next)
+		return next;
+
+	/* no child, visit my or the closest ancestor's next sibling */
+	while (pos != root) {
+		if (!list_is_last(&pos->sibling, &scx_parent(pos)->children))
+			return list_next_entry(pos, sibling);
+		pos = scx_parent(pos);
+	}
+
+	return NULL;
+}
+#else	/* CONFIG_EXT_SUB_SCHED */
+static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
+static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
+/**
+ * scx_is_descendant - Test whether sched is a descendant
+ * @sch: sched to test
+ * @ancestor: ancestor sched to test against
+ *
+ * Test whether @sch is a descendant of @ancestor.
+ */
+static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor)
+{
+	if (sch->level < ancestor->level)
+		return false;
+	return sch->ancestors[ancestor->level] == ancestor;
+}
+
+/**
+ * scx_for_each_descendant_pre - pre-order walk of a sched's descendants
+ * @pos: iteration cursor
+ * @root: sched to walk the descendants of
+ *
+ * Walk @root's descendants. @root is included in the iteration and the first
+ * node to be visited. Must be called with either scx_enable_mutex or
+ * scx_sched_lock held.
+ */
+#define scx_for_each_descendant_pre(pos, root)					\
+	for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos);		\
+	     (pos) = scx_next_descendant_pre((pos), (root)))
+
 static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
 					      struct task_struct *p)
 {
@@ -514,7 +605,7 @@ struct scx_task_iter {
 	struct rq_flags			rf;
 	u32				cnt;
 	bool				list_locked;
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
 	struct cgroup			*cgrp;
 	struct cgroup_subsys_state	*css_pos;
 	struct css_task_iter		css_iter;
@@ -553,7 +644,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter, struct cgroup *cgrp)
 {
 	memset(iter, 0, sizeof(*iter));
 
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
 	if (cgrp) {
 		lockdep_assert_held(&cgroup_mutex);
 		iter->cgrp = cgrp;
@@ -614,7 +705,7 @@ static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter)
  */
 static void scx_task_iter_stop(struct scx_task_iter *iter)
 {
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
 	if (iter->cgrp) {
 		if (iter->css_pos)
 			css_task_iter_end(&iter->css_iter);
@@ -645,7 +736,7 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
 		cond_resched();
 	}
 
-#ifdef CONFIG_CGROUPS
+#ifdef CONFIG_EXT_SUB_SCHED
 	if (iter->cgrp) {
 		while (iter->css_pos) {
 			struct task_struct *p;
@@ -3032,7 +3123,10 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
 	scx_set_task_state(p, SCX_TASK_INIT);
 
 	if (p->scx.disallow) {
-		if (unlikely(fork)) {
+		if (unlikely(scx_parent(sch))) {
+			scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]",
+				  p->comm, p->pid);
+		} else if (unlikely(fork)) {
 			scx_error(sch, "ops.init_task() set task->scx.disallow for %s[%d] during fork",
 				  p->comm, p->pid);
 		} else {
@@ -3555,25 +3649,51 @@ void scx_group_set_bandwidth(struct task_group *tg,
 
 	percpu_up_read(&scx_cgroup_ops_rwsem);
 }
+#endif	/* CONFIG_EXT_GROUP_SCHED */
+
+#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
+static struct cgroup *root_cgroup(void)
+{
+	return &cgrp_dfl_root.cgrp;
+}
+
+static struct cgroup *sch_cgroup(struct scx_sched *sch)
+{
+	return sch->cgrp;
+}
+
+/* for each descendant of @cgrp including self, set ->scx_sched to @sch */
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch)
+{
+	struct cgroup *pos;
+	struct cgroup_subsys_state *css;
+
+	cgroup_for_each_live_descendant_pre(pos, css, cgrp)
+		rcu_assign_pointer(pos->scx_sched, sch);
+}
 
 static void scx_cgroup_lock(void)
 {
+#ifdef CONFIG_EXT_GROUP_SCHED
 	percpu_down_write(&scx_cgroup_ops_rwsem);
+#endif
 	cgroup_lock();
 }
 
 static void scx_cgroup_unlock(void)
 {
 	cgroup_unlock();
+#ifdef CONFIG_EXT_GROUP_SCHED
 	percpu_up_write(&scx_cgroup_ops_rwsem);
+#endif
 }
-
-#else	/* CONFIG_EXT_GROUP_SCHED */
-
+#else	/* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
+static struct cgroup *root_cgroup(void) { return NULL; }
+static struct cgroup *sch_cgroup(struct scx_sched *sch) { return NULL; }
+static void set_cgroup_sched(struct cgroup *cgrp, struct scx_sched *sch) {}
 static void scx_cgroup_lock(void) {}
 static void scx_cgroup_unlock(void) {}
-
-#endif	/* CONFIG_EXT_GROUP_SCHED */
+#endif	/* CONFIG_EXT_GROUP_SCHED || CONFIG_EXT_SUB_SCHED */
 
 /*
  * Omitted operations:
@@ -3622,13 +3742,15 @@ DEFINE_SCHED_CLASS(ext) = {
 #endif
 };
 
-static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
+		     struct scx_sched *sch)
 {
 	memset(dsq, 0, sizeof(*dsq));
 
 	raw_spin_lock_init(&dsq->lock);
 	INIT_LIST_HEAD(&dsq->list);
 	dsq->id = dsq_id;
+	dsq->sched = sch;
 }
 
 static void free_dsq_irq_workfn(struct irq_work *irq_work)
@@ -3826,6 +3948,12 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	irq_work_sync(&sch->error_irq_work);
 	kthread_destroy_worker(sch->helper);
 
+#ifdef CONFIG_EXT_SUB_SCHED
+	kfree(sch->cgrp_path);
+	if (sch_cgroup(sch))
+		cgroup_put(sch_cgroup(sch));
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 	free_percpu(sch->pcpu);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -4405,6 +4533,8 @@ static const char *scx_exit_reason(enum scx_exit_kind kind)
 		return "unregistered from the main kernel";
 	case SCX_EXIT_SYSRQ:
 		return "disabled by sysrq-S";
+	case SCX_EXIT_PARENT:
+		return "parent exiting";
 	case SCX_EXIT_ERROR:
 		return "runtime error";
 	case SCX_EXIT_ERROR_BPF:
@@ -4430,6 +4560,69 @@ static void free_kick_syncs(void)
 	}
 }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
+
+static void drain_descendants(struct scx_sched *sch)
+{
+	/*
+	 * Child scheds that finished the critical part of disabling will take
+	 * themselves off @sch->children. Wait for it to drain. As propagation
+	 * is recursive, empty @sch->children means that all proper descendant
+	 * scheds reached unlinking stage.
+	 */
+	wait_event(scx_unlink_waitq, list_empty(&sch->children));
+}
+
+static void scx_sub_disable(struct scx_sched *sch)
+{
+	struct scx_sched *parent = scx_parent(sch);
+
+	drain_descendants(sch);
+
+	mutex_lock(&scx_enable_mutex);
+	percpu_down_write(&scx_fork_rwsem);
+	scx_cgroup_lock();
+
+	set_cgroup_sched(sch_cgroup(sch), parent);
+
+	/* TODO - perform actual disabling here */
+
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+
+	raw_spin_lock_irq(&scx_sched_lock);
+	list_del_init(&sch->sibling);
+	list_del_rcu(&sch->all);
+	raw_spin_unlock_irq(&scx_sched_lock);
+
+	mutex_unlock(&scx_enable_mutex);
+
+	/*
+	 * @sch is now unlinked from the parent's children list. Notify and call
+	 * ops.sub_detach/exit(). Note that ops.sub_detach/exit() must be called
+	 * after unlinking and releasing all locks. See scx_claim_exit().
+	 */
+	wake_up_all(&scx_unlink_waitq);
+
+	if (sch->ops.sub_detach && sch->sub_attached) {
+		struct scx_sub_detach_args sub_detach_args = {
+			.ops = &sch->ops,
+			.cgroup_path = sch->cgrp_path,
+		};
+		SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
+			    &sub_detach_args);
+	}
+
+	if (sch->ops.exit)
+		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
+	kobject_del(&sch->kobj);
+}
+#else	/* CONFIG_EXT_SUB_SCHED */
+static void drain_descendants(struct scx_sched *sch) { }
+static void scx_sub_disable(struct scx_sched *sch) { }
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 static void scx_root_disable(struct scx_sched *sch)
 {
 	struct scx_exit_info *ei = sch->exit_info;
@@ -4437,9 +4630,10 @@ static void scx_root_disable(struct scx_sched *sch)
 	struct task_struct *p;
 	int cpu;
 
-	/* guarantee forward progress by bypassing scx_ops */
+	/* guarantee forward progress and wait for descendants to be disabled */
 	scx_bypass(true);
 	WRITE_ONCE(scx_aborting, false);
+	drain_descendants(sch);
 
 	switch (scx_set_enable_state(SCX_DISABLING)) {
 	case SCX_DISABLING:
@@ -4498,6 +4692,11 @@ static void scx_root_disable(struct scx_sched *sch)
 		scx_exit_task(p);
 	}
 	scx_task_iter_stop(&sti);
+
+	scx_cgroup_lock();
+	set_cgroup_sched(sch_cgroup(sch), NULL);
+	scx_cgroup_unlock();
+
 	percpu_up_write(&scx_fork_rwsem);
 
 	/*
@@ -4534,6 +4733,10 @@ static void scx_root_disable(struct scx_sched *sch)
 
 	cancel_delayed_work_sync(&scx_watchdog_work);
 
+	raw_spin_lock_irq(&scx_sched_lock);
+	list_del_rcu(&sch->all);
+	raw_spin_unlock_irq(&scx_sched_lock);
+
 	/*
 	 * scx_root clearing must be inside cpus_read_lock(). See
 	 * handle_hotplug().
@@ -4591,6 +4794,24 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
 	 * successfully reach scx_bypass().
 	 */
 	WRITE_ONCE(scx_aborting, true);
+
+	/*
+	 * Propagate exits to descendants immediately. Each has a dedicated
+	 * helper kthread and can run in parallel. While most of disabling is
+	 * serialized, running them in separate threads allows parallelizing
+	 * ops.exit(), which can take arbitrarily long prolonging bypass mode.
+	 *
+	 * This doesn't cause recursions as propagation only takes place for
+	 * non-propagation exits.
+	 */
+	if (kind != SCX_EXIT_PARENT) {
+		scoped_guard (raw_spinlock_irqsave, &scx_sched_lock) {
+			struct scx_sched *pos;
+			scx_for_each_descendant_pre(pos, sch)
+				scx_disable(pos, SCX_EXIT_PARENT);
+		}
+	}
+
 	return true;
 }
 
@@ -4611,7 +4832,10 @@ static void scx_disable_workfn(struct kthread_work *work)
 	ei->kind = kind;
 	ei->reason = scx_exit_reason(ei->kind);
 
-	scx_root_disable(sch);
+	if (scx_parent(sch))
+		scx_sub_disable(sch);
+	else
+		scx_root_disable(sch);
 }
 
 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
@@ -4987,12 +5211,15 @@ static int alloc_kick_syncs(void)
 	return 0;
 }
 
-static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
+static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
+						 struct cgroup *cgrp,
+						 struct scx_sched *parent)
 {
 	struct scx_sched *sch;
+	s32 level = parent ? parent->level + 1 : 0;
 	int node, ret;
 
-	sch = kzalloc_obj(*sch);
+	sch = kzalloc_flex(*sch, ancestors, level);
 	if (!sch)
 		return ERR_PTR(-ENOMEM);
 
@@ -5021,7 +5248,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 			goto err_free_gdsqs;
 		}
 
-		init_dsq(dsq, SCX_DSQ_GLOBAL);
+		init_dsq(dsq, SCX_DSQ_GLOBAL, sch);
 		sch->global_dsqs[node] = dsq;
 	}
 
@@ -5039,6 +5266,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 
 	sched_set_fifo(sch->helper->task);
 
+	if (parent)
+		memcpy(sch->ancestors, parent->ancestors,
+		       level * sizeof(parent->ancestors[0]));
+	sch->ancestors[level] = sch;
+	sch->level = level;
+
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
 	init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
 	kthread_init_work(&sch->disable_work, scx_disable_workfn);
@@ -5046,10 +5279,46 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops)
 	ops->priv = sch;
 
 	sch->kobj.kset = scx_kset;
+
+#ifdef CONFIG_EXT_SUB_SCHED
+	char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
+	if (!buf)
+		goto err_stop_helper;
+	cgroup_path(cgrp, buf, PATH_MAX);
+	sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
+	kfree(buf);
+	if (!sch->cgrp_path)
+		goto err_stop_helper;
+
+	sch->cgrp = cgrp;
+	INIT_LIST_HEAD(&sch->children);
+	INIT_LIST_HEAD(&sch->sibling);
+
+	if (parent)
+		ret = kobject_init_and_add(&sch->kobj, &scx_ktype,
+					   &parent->sub_kset->kobj,
+					   "sub-%llu", cgroup_id(cgrp));
+	else
+		ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
+
+	if (ret < 0) {
+		kfree(sch->cgrp_path);
+		goto err_stop_helper;
+	}
+
+	if (ops->sub_attach) {
+		sch->sub_kset = kset_create_and_add("sub", NULL, &sch->kobj);
+		if (!sch->sub_kset) {
+			kobject_put(&sch->kobj);
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+#else	/* CONFIG_EXT_SUB_SCHED */
 	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
 	if (ret < 0)
 		goto err_stop_helper;
-
+#endif	/* CONFIG_EXT_SUB_SCHED */
 	return sch;
 
 err_stop_helper:
@@ -5157,7 +5426,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_unlock;
 
-	sch = scx_alloc_and_add_sched(ops);
+	sch = scx_alloc_and_add_sched(ops, root_cgroup(), NULL);
 	if (IS_ERR(sch)) {
 		ret = PTR_ERR(sch);
 		goto err_free_ksyncs;
@@ -5174,8 +5443,13 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 
 	atomic_long_set(&scx_nr_rejected, 0);
 
-	for_each_possible_cpu(cpu)
-		cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE;
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+
+		rq->scx.local_dsq.sched = sch;
+		rq->scx.bypass_dsq.sched = sch;
+		rq->scx.cpuperf_target = SCX_CPUPERF_ONE;
+	}
 
 	/*
 	 * Keep CPUs stable during enable so that the BPF scheduler can track
@@ -5189,6 +5463,10 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	 */
 	rcu_assign_pointer(scx_root, sch);
 
+	raw_spin_lock_irq(&scx_sched_lock);
+	list_add_tail_rcu(&sch->all, &scx_sched_all);
+	raw_spin_unlock_irq(&scx_sched_lock);
+
 	scx_idle_enable(ops);
 
 	if (sch->ops.init) {
@@ -5278,6 +5556,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	 * never sees uninitialized tasks.
 	 */
 	scx_cgroup_lock();
+	set_cgroup_sched(sch_cgroup(sch), sch);
 	ret = scx_cgroup_init(sch);
 	if (ret)
 		goto err_disable_unlock_all;
@@ -5392,6 +5671,185 @@ err_disable:
 	cmd->ret = 0;
 }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+/* verify that a scheduler can be attached to @cgrp and return the parent */
+static struct scx_sched *find_parent_sched(struct cgroup *cgrp)
+{
+	struct scx_sched *parent = cgrp->scx_sched;
+	struct scx_sched *pos;
+
+	lockdep_assert_held(&scx_sched_lock);
+
+	/* can't attach twice to the same cgroup */
+	if (parent->cgrp == cgrp)
+		return ERR_PTR(-EBUSY);
+
+	/* does $parent allow sub-scheds? */
+	if (!parent->ops.sub_attach)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	/* can't insert between $parent and its exiting children */
+	list_for_each_entry(pos, &parent->children, sibling)
+		if (cgroup_is_descendant(pos->cgrp, cgrp))
+			return ERR_PTR(-EBUSY);
+
+	return parent;
+}
+
+static void scx_sub_enable_workfn(struct kthread_work *work)
+{
+	struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
+	struct sched_ext_ops *ops = cmd->ops;
+	struct cgroup *cgrp;
+	struct scx_sched *parent, *sch;
+	s32 ret;
+
+	mutex_lock(&scx_enable_mutex);
+
+	if (!scx_enabled()) {
+		ret = -ENODEV;
+		goto out_unlock;
+	}
+
+	cgrp = cgroup_get_from_id(ops->sub_cgroup_id);
+	if (IS_ERR(cgrp)) {
+		ret = PTR_ERR(cgrp);
+		goto out_unlock;
+	}
+
+	raw_spin_lock_irq(&scx_sched_lock);
+	parent = find_parent_sched(cgrp);
+	if (IS_ERR(parent)) {
+		raw_spin_unlock_irq(&scx_sched_lock);
+		ret = PTR_ERR(parent);
+		goto out_put_cgrp;
+	}
+	kobject_get(&parent->kobj);
+	raw_spin_unlock_irq(&scx_sched_lock);
+
+	sch = scx_alloc_and_add_sched(ops, cgrp, parent);
+	kobject_put(&parent->kobj);
+	if (IS_ERR(sch)) {
+		ret = PTR_ERR(sch);
+		goto out_put_cgrp;
+	}
+
+	raw_spin_lock_irq(&scx_sched_lock);
+	list_add_tail(&sch->sibling, &parent->children);
+	list_add_tail_rcu(&sch->all, &scx_sched_all);
+	raw_spin_unlock_irq(&scx_sched_lock);
+
+	if (sch->level >= SCX_SUB_MAX_DEPTH) {
+		scx_error(sch, "max nesting depth %d violated",
+			  SCX_SUB_MAX_DEPTH);
+		goto err_disable;
+	}
+
+	if (sch->ops.init) {
+		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
+		if (ret) {
+			ret = ops_sanitize_err(sch, "init", ret);
+			scx_error(sch, "ops.init() failed (%d)", ret);
+			goto err_disable;
+		}
+		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
+	}
+
+	if (validate_ops(sch, ops))
+		goto err_disable;
+
+	struct scx_sub_attach_args sub_attach_args = {
+		.ops = &sch->ops,
+		.cgroup_path = sch->cgrp_path,
+	};
+
+	ret = SCX_CALL_OP_RET(parent, SCX_KF_UNLOCKED, sub_attach, NULL,
+			      &sub_attach_args);
+	if (ret) {
+		ret = ops_sanitize_err(sch, "sub_attach", ret);
+		scx_error(sch, "parent rejected (%d)", ret);
+		goto err_disable;
+	}
+	sch->sub_attached = true;
+
+	percpu_down_write(&scx_fork_rwsem);
+	scx_cgroup_lock();
+
+	/*
+	 * Set cgroup->scx_sched's and check CSS_ONLINE. Either we see
+	 * !CSS_ONLINE or scx_cgroup_lifetime_notify() sees and shoots us down.
+	 */
+	set_cgroup_sched(sch_cgroup(sch), sch);
+	if (!(cgrp->self.flags & CSS_ONLINE)) {
+		scx_error(sch, "cgroup is not online");
+		goto err_unlock_and_disable;
+	}
+
+	/* TODO - perform actual enabling here */
+
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+
+	pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
+	kobject_uevent(&sch->kobj, KOBJ_ADD);
+	ret = 0;
+	goto out_unlock;
+
+out_put_cgrp:
+	cgroup_put(cgrp);
+out_unlock:
+	mutex_unlock(&scx_enable_mutex);
+	cmd->ret = ret;
+	return;
+
+err_unlock_and_disable:
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
+err_disable:
+	mutex_unlock(&scx_enable_mutex);
+	kthread_flush_work(&sch->disable_work);
+	cmd->ret = 0;
+}
+
+static s32 scx_cgroup_lifetime_notify(struct notifier_block *nb,
+				      unsigned long action, void *data)
+{
+	struct cgroup *cgrp = data;
+	struct cgroup *parent = cgroup_parent(cgrp);
+
+	if (!cgroup_on_dfl(cgrp))
+		return NOTIFY_OK;
+
+	switch (action) {
+	case CGROUP_LIFETIME_ONLINE:
+		/* inherit ->scx_sched from $parent */
+		if (parent)
+			rcu_assign_pointer(cgrp->scx_sched, parent->scx_sched);
+		break;
+	case CGROUP_LIFETIME_OFFLINE:
+		/* if there is a sched attached, shoot it down */
+		if (cgrp->scx_sched && cgrp->scx_sched->cgrp == cgrp)
+			scx_exit(cgrp->scx_sched, SCX_EXIT_UNREG_KERN,
+				 SCX_ECODE_RSN_CGROUP_OFFLINE,
+				 "cgroup %llu going offline", cgroup_id(cgrp));
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block scx_cgroup_lifetime_nb = {
+	.notifier_call = scx_cgroup_lifetime_notify,
+};
+
+static s32 __init scx_cgroup_lifetime_notifier_init(void)
+{
+	return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
+						&scx_cgroup_lifetime_nb);
+}
+core_initcall(scx_cgroup_lifetime_notifier_init);
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 {
 	static struct kthread_worker *helper;
@@ -5418,7 +5876,12 @@ static s32 scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 		mutex_unlock(&helper_mutex);
 	}
 
-	kthread_init_work(&cmd.work, scx_root_enable_workfn);
+#ifdef CONFIG_EXT_SUB_SCHED
+	if (ops->sub_cgroup_id > 1)
+		kthread_init_work(&cmd.work, scx_sub_enable_workfn);
+	else
+#endif	/* CONFIG_EXT_SUB_SCHED */
+		kthread_init_work(&cmd.work, scx_root_enable_workfn);
 	cmd.ops = ops;
 
 	kthread_queue_work(READ_ONCE(helper), &cmd.work);
@@ -5520,6 +5983,11 @@ static int bpf_scx_init_member(const struct btf_type *t,
 	case offsetof(struct sched_ext_ops, hotplug_seq):
 		ops->hotplug_seq = *(u64 *)(udata + moff);
 		return 1;
+#ifdef CONFIG_EXT_SUB_SCHED
+	case offsetof(struct sched_ext_ops, sub_cgroup_id):
+		ops->sub_cgroup_id = *(u64 *)(udata + moff);
+		return 1;
+#endif	/* CONFIG_EXT_SUB_SCHED */
 	}
 
 	return 0;
@@ -5542,6 +6010,8 @@ static int bpf_scx_check_member(const struct btf_type *t,
 	case offsetof(struct sched_ext_ops, cpu_offline):
 	case offsetof(struct sched_ext_ops, init):
 	case offsetof(struct sched_ext_ops, exit):
+	case offsetof(struct sched_ext_ops, sub_attach):
+	case offsetof(struct sched_ext_ops, sub_detach):
 		break;
 	default:
 		if (prog->sleepable)
@@ -5619,7 +6089,9 @@ static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgro
 static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {}
 static void sched_ext_ops__cgroup_set_bandwidth(struct cgroup *cgrp, u64 period_us, u64 quota_us, u64 burst_us) {}
 static void sched_ext_ops__cgroup_set_idle(struct cgroup *cgrp, bool idle) {}
-#endif
+#endif	/* CONFIG_EXT_GROUP_SCHED */
+static s32 sched_ext_ops__sub_attach(struct scx_sub_attach_args *args) { return -EINVAL; }
+static void sched_ext_ops__sub_detach(struct scx_sub_detach_args *args) {}
 static void sched_ext_ops__cpu_online(s32 cpu) {}
 static void sched_ext_ops__cpu_offline(s32 cpu) {}
 static s32 sched_ext_ops__init(void) { return -EINVAL; }
@@ -5659,6 +6131,8 @@ static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
 	.cgroup_set_bandwidth	= sched_ext_ops__cgroup_set_bandwidth,
 	.cgroup_set_idle	= sched_ext_ops__cgroup_set_idle,
 #endif
+	.sub_attach		= sched_ext_ops__sub_attach,
+	.sub_detach		= sched_ext_ops__sub_detach,
 	.cpu_online		= sched_ext_ops__cpu_online,
 	.cpu_offline		= sched_ext_ops__cpu_offline,
 	.init			= sched_ext_ops__init,
@@ -5941,8 +6415,10 @@ void __init init_sched_ext_class(void)
 		struct rq *rq = cpu_rq(cpu);
 		int  n = cpu_to_node(cpu);
 
-		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
-		init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS);
+		/* local/bypass dsq's sch will be set during scx_root_enable() */
+		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL);
+		init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS, NULL);
+
 		INIT_LIST_HEAD(&rq->scx.runnable_list);
 		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
 
@@ -6598,16 +7074,16 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
 	if (!dsq)
 		return -ENOMEM;
 
-	init_dsq(dsq, dsq_id);
-
 	rcu_read_lock();
 
 	sch = rcu_dereference(scx_root);
-	if (sch)
+	if (sch) {
+		init_dsq(dsq, dsq_id, sch);
 		ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
 						    dsq_hash_params);
-	else
+	} else {
 		ret = -ENODEV;
+	}
 
 	rcu_read_unlock();
 	if (ret)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 417d3c6f02fe..75b7f57e20ab 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -28,6 +28,8 @@ enum scx_consts {
 	SCX_BYPASS_LB_DONOR_PCT		= 125,
 	SCX_BYPASS_LB_MIN_DELTA_DIV	= 4,
 	SCX_BYPASS_LB_BATCH		= 256,
+
+	SCX_SUB_MAX_DEPTH		= 4,
 };
 
 enum scx_exit_kind {
@@ -38,6 +40,7 @@ enum scx_exit_kind {
 	SCX_EXIT_UNREG_BPF,	/* BPF-initiated unregistration */
 	SCX_EXIT_UNREG_KERN,	/* kernel-initiated unregistration */
 	SCX_EXIT_SYSRQ,		/* requested by 'S' sysrq */
+	SCX_EXIT_PARENT,	/* parent exiting */
 
 	SCX_EXIT_ERROR = 1024,	/* runtime error, error msg contains details */
 	SCX_EXIT_ERROR_BPF,	/* ERROR but triggered through scx_bpf_error() */
@@ -62,6 +65,7 @@ enum scx_exit_kind {
 enum scx_exit_code {
 	/* Reasons */
 	SCX_ECODE_RSN_HOTPLUG	= 1LLU << 32,
+	SCX_ECODE_RSN_CGROUP_OFFLINE = 2LLU << 32,
 
 	/* Actions */
 	SCX_ECODE_ACT_RESTART	= 1LLU << 48,
@@ -213,7 +217,7 @@ struct scx_exit_task_args {
 	bool cancelled;
 };
 
-/* argument container for ops->cgroup_init() */
+/* argument container for ops.cgroup_init() */
 struct scx_cgroup_init_args {
 	/* the weight of the cgroup [1..10000] */
 	u32			weight;
@@ -236,12 +240,12 @@ enum scx_cpu_preempt_reason {
 };
 
 /*
- * Argument container for ops->cpu_acquire(). Currently empty, but may be
+ * Argument container for ops.cpu_acquire(). Currently empty, but may be
  * expanded in the future.
  */
 struct scx_cpu_acquire_args {};
 
-/* argument container for ops->cpu_release() */
+/* argument container for ops.cpu_release() */
 struct scx_cpu_release_args {
 	/* the reason the CPU was preempted */
 	enum scx_cpu_preempt_reason reason;
@@ -250,9 +254,7 @@ struct scx_cpu_release_args {
 	struct task_struct	*task;
 };
 
-/*
- * Informational context provided to dump operations.
- */
+/* informational context provided to dump operations */
 struct scx_dump_ctx {
 	enum scx_exit_kind	kind;
 	s64			exit_code;
@@ -261,6 +263,18 @@ struct scx_dump_ctx {
 	u64			at_jiffies;
 };
 
+/* argument container for ops.sub_attach() */
+struct scx_sub_attach_args {
+	struct sched_ext_ops	*ops;
+	char			*cgroup_path;
+};
+
+/* argument container for ops.sub_detach() */
+struct scx_sub_detach_args {
+	struct sched_ext_ops	*ops;
+	char			*cgroup_path;
+};
+
 /**
  * struct sched_ext_ops - Operation table for BPF scheduler implementation
  *
@@ -721,6 +735,20 @@ struct sched_ext_ops {
 
 #endif	/* CONFIG_EXT_GROUP_SCHED */
 
+	/**
+	 * @sub_attach: Attach a sub-scheduler
+	 * @args: argument container, see the struct definition
+	 *
+	 * Return 0 to accept the sub-scheduler. -errno to reject.
+	 */
+	s32 (*sub_attach)(struct scx_sub_attach_args *args);
+
+	/**
+	 * @sub_detach: Detach a sub-scheduler
+	 * @args: argument container, see the struct definition
+	 */
+	void (*sub_detach)(struct scx_sub_detach_args *args);
+
 	/*
 	 * All online ops must come before ops.cpu_online().
 	 */
@@ -762,6 +790,10 @@ struct sched_ext_ops {
 	 */
 	void (*exit)(struct scx_exit_info *info);
 
+	/*
+	 * Data fields must comes after all ops fields.
+	 */
+
 	/**
 	 * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
 	 */
@@ -796,6 +828,12 @@ struct sched_ext_ops {
 	 */
 	u64 hotplug_seq;
 
+	/**
+	 * @cgroup_id: When >1, attach the scheduler as a sub-scheduler on the
+	 * specified cgroup.
+	 */
+	u64 sub_cgroup_id;
+
 	/**
 	 * @name: BPF scheduler's name
 	 *
@@ -900,6 +938,8 @@ struct scx_sched {
 	struct scx_dispatch_q	**global_dsqs;
 	struct scx_sched_pcpu __percpu *pcpu;
 
+	s32			level;
+
 	/*
 	 * Updates to the following warned bitfields can race causing RMW issues
 	 * but it doesn't really matter.
@@ -907,6 +947,18 @@ struct scx_sched {
 	bool			warned_zero_slice:1;
 	bool			warned_deprecated_rq:1;
 
+	struct list_head	all;
+
+#ifdef CONFIG_EXT_SUB_SCHED
+	struct list_head	children;
+	struct list_head	sibling;
+	struct cgroup		*cgrp;
+	char			*cgrp_path;
+	struct kset		*sub_kset;
+
+	bool			sub_attached;
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 	atomic_t		exit_kind;
 	struct scx_exit_info	*exit_info;
 
@@ -916,6 +968,9 @@ struct scx_sched {
 	struct irq_work		error_irq_work;
 	struct kthread_work	disable_work;
 	struct rcu_work		rcu_work;
+
+	/* all ancestors including self */
+	struct scx_sched	*ancestors[];
 };
 
 enum scx_wake_flags {
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index d51d8c38f1cf..ff6ff34177ab 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -41,6 +41,7 @@ const volatile u32 dsp_batch;
 const volatile bool highpri_boosting;
 const volatile bool print_dsqs_and_events;
 const volatile bool print_msgs;
+const volatile u64 sub_cgroup_id;
 const volatile s32 disallow_tgid;
 const volatile bool suppress_dump;
 
@@ -862,7 +863,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 	struct bpf_timer *timer;
 	s32 ret;
 
-	if (print_msgs)
+	if (print_msgs && !sub_cgroup_id)
 		print_cpus();
 
 	ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
@@ -892,6 +893,11 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
 	UEI_RECORD(uei, ei);
 }
 
+s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
+{
+	return 0;
+}
+
 SCX_OPS_DEFINE(qmap_ops,
 	       .select_cpu		= (void *)qmap_select_cpu,
 	       .enqueue			= (void *)qmap_enqueue,
@@ -907,6 +913,7 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .cgroup_init		= (void *)qmap_cgroup_init,
 	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
 	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
+	       .sub_attach		= (void *)qmap_sub_attach,
 	       .cpu_online		= (void *)qmap_cpu_online,
 	       .cpu_offline		= (void *)qmap_cpu_offline,
 	       .init			= (void *)qmap_init,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index ef701d45ba43..5d762d10f4db 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -10,6 +10,7 @@
 #include <inttypes.h>
 #include <signal.h>
 #include <libgen.h>
+#include <sys/stat.h>
 #include <bpf/bpf.h>
 #include <scx/common.h>
 #include "scx_qmap.bpf.skel.h"
@@ -67,7 +68,7 @@ int main(int argc, char **argv)
 
 	skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
 
-	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHd:D:Spvh")) != -1) {
+	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:Spvh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -96,6 +97,16 @@ int main(int argc, char **argv)
 		case 'H':
 			skel->rodata->highpri_boosting = true;
 			break;
+		case 'c': {
+			struct stat st;
+			if (stat(optarg, &st) < 0) {
+				perror("stat");
+				return 1;
+			}
+			skel->struct_ops.qmap_ops->sub_cgroup_id = st.st_ino;
+			skel->rodata->sub_cgroup_id = st.st_ino;
+			break;
+		}
 		case 'd':
 			skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
 			if (skel->rodata->disallow_tgid < 0)

From 88234b075c3fc23d57406e1867523b6aba783ebf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 012/134] sched_ext: Introduce scx_task_sched[_rcu]()

In preparation of multiple scheduler support, add p->scx.sched which points
to the scx_sched instance that the task is scheduled by, which is currently
always scx_root. Add scx_task_sched[_rcu]() accessors which return the
associated scx_sched of the specified task and replace the raw scx_root
dereferences with it where applicable. scx_task_on_sched() is also added to
test whether a given task is on the specified sched.

As scx_root is still the only scheduler, this shouldn't introduce
user-visible behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/ext.h   |  7 +++++
 kernel/sched/ext.c          | 63 +++++++++++++++++++++++--------------
 kernel/sched/ext_internal.h | 59 ++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 24 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index fa4349b319e6..3213e31c7979 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -165,6 +165,13 @@ struct scx_sched;
  * for a task to be scheduled by SCX.
  */
 struct sched_ext_entity {
+#ifdef CONFIG_CGROUPS
+	/*
+	 * Associated scx_sched. Updated either during fork or while holding
+	 * both p->pi_lock and rq lock.
+	 */
+	struct scx_sched __rcu	*sched;
+#endif
 	struct scx_dispatch_q	*dsq;
 	atomic_long_t		ops_state;
 	u64			ddsp_dsq_id;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bb3e33b660da..d56539449f26 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -19,7 +19,7 @@ static DEFINE_RAW_SPINLOCK(scx_sched_lock);
  * are used as temporary markers to indicate that the dereferences need to be
  * updated to point to the associated scheduler instances rather than scx_root.
  */
-static struct scx_sched __rcu *scx_root;
+struct scx_sched __rcu *scx_root;
 
 /*
  * All scheds, writers must hold both scx_enable_mutex and scx_sched_lock.
@@ -304,9 +304,15 @@ static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
 
 	return NULL;
 }
+
+static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
+{
+	rcu_assign_pointer(p->scx.sched, sch);
+}
 #else	/* CONFIG_EXT_SUB_SCHED */
 static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
 static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
+static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
 /**
@@ -1542,7 +1548,7 @@ static bool scx_rq_online(struct rq *rq)
 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 			    int sticky_cpu)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 	struct task_struct **ddsp_taskp;
 	struct scx_dispatch_q *dsq;
 	unsigned long qseq;
@@ -1672,7 +1678,7 @@ static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
 
 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 	int sticky_cpu = p->scx.sticky_cpu;
 
 	if (enq_flags & ENQUEUE_WAKEUP)
@@ -1723,7 +1729,7 @@ out:
 
 static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 	unsigned long opss;
 	u64 op_deq_flags = deq_flags;
 
@@ -1794,7 +1800,7 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 
 static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 
 	if (!(p->scx.flags & SCX_TASK_QUEUED)) {
 		WARN_ON_ONCE(task_runnable(p));
@@ -1838,8 +1844,8 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 
 static void yield_task_scx(struct rq *rq)
 {
-	struct scx_sched *sch = scx_root;
 	struct task_struct *p = rq->donor;
+	struct scx_sched *sch = scx_task_sched(p);
 
 	if (SCX_HAS_OP(sch, yield))
 		SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
@@ -1849,10 +1855,10 @@ static void yield_task_scx(struct rq *rq)
 
 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
 {
-	struct scx_sched *sch = scx_root;
 	struct task_struct *from = rq->donor;
+	struct scx_sched *sch = scx_task_sched(from);
 
-	if (SCX_HAS_OP(sch, yield))
+	if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to))
 		return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
 					      from, to);
 	else
@@ -2517,7 +2523,7 @@ static void process_ddsp_deferred_locals(struct rq *rq)
 	 */
 	while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
 				struct task_struct, scx.dsq_list.node))) {
-		struct scx_sched *sch = scx_root;
+		struct scx_sched *sch = scx_task_sched(p);
 		struct scx_dispatch_q *dsq;
 
 		list_del_init(&p->scx.dsq_list.node);
@@ -2531,7 +2537,7 @@ static void process_ddsp_deferred_locals(struct rq *rq)
 
 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 
 	if (p->scx.flags & SCX_TASK_QUEUED) {
 		/*
@@ -2628,7 +2634,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
 static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 			      struct task_struct *next)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 
 	/* see kick_cpus_irq_workfn() */
 	smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
@@ -2722,14 +2728,14 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
 	if (keep_prev) {
 		p = prev;
 		if (!p->scx.slice)
-			refill_task_slice_dfl(rcu_dereference_sched(scx_root), p);
+			refill_task_slice_dfl(scx_task_sched(p), p);
 	} else {
 		p = first_local_task(rq);
 		if (!p)
 			return NULL;
 
 		if (unlikely(!p->scx.slice)) {
-			struct scx_sched *sch = rcu_dereference_sched(scx_root);
+			struct scx_sched *sch = scx_task_sched(p);
 
 			if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
 				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
@@ -2817,7 +2823,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 
 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 	bool rq_bypass;
 
 	/*
@@ -2878,7 +2884,7 @@ static void task_woken_scx(struct rq *rq, struct task_struct *p)
 static void set_cpus_allowed_scx(struct task_struct *p,
 				 struct affinity_context *ac)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 
 	set_cpus_allowed_common(p, ac);
 
@@ -3022,7 +3028,7 @@ void scx_tick(struct rq *rq)
 
 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(curr);
 
 	update_curr_scx(rq);
 
@@ -3212,11 +3218,12 @@ static void scx_disable_task(struct task_struct *p)
 
 static void scx_exit_task(struct task_struct *p)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 	struct scx_exit_task_args args = {
 		.cancelled = false,
 	};
 
+	lockdep_assert_held(&p->pi_lock);
 	lockdep_assert_rq_held(task_rq(p));
 
 	switch (scx_get_task_state(p)) {
@@ -3238,6 +3245,7 @@ static void scx_exit_task(struct task_struct *p)
 	if (SCX_HAS_OP(sch, exit_task))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
 				 p, &args);
+	scx_set_task_sched(p, NULL);
 	scx_set_task_state(p, SCX_TASK_NONE);
 }
 
@@ -3267,12 +3275,18 @@ void scx_pre_fork(struct task_struct *p)
 
 int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 {
+	s32 ret;
+
 	percpu_rwsem_assert_held(&scx_fork_rwsem);
 
-	if (scx_init_task_enabled)
-		return scx_init_task(p, task_group(p), true);
-	else
-		return 0;
+	if (scx_init_task_enabled) {
+		ret = scx_init_task(p, task_group(p), true);
+		if (!ret)
+			scx_set_task_sched(p, scx_root);
+		return ret;
+	}
+
+	return 0;
 }
 
 void scx_post_fork(struct task_struct *p)
@@ -3377,7 +3391,7 @@ void sched_ext_dead(struct task_struct *p)
 static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 			      const struct load_weight *lw)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 
 	lockdep_assert_rq_held(task_rq(p));
 
@@ -3396,7 +3410,7 @@ static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
 
 static void switching_to_scx(struct rq *rq, struct task_struct *p)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_task_sched(p);
 
 	if (task_dead_and_done(p))
 		return;
@@ -4062,7 +4076,7 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
 	if (!scx_enabled())
 		return true;
 
-	sch = rcu_dereference_sched(scx_root);
+	sch = scx_task_sched(p);
 	if (unlikely(!sch))
 		return true;
 
@@ -5582,6 +5596,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 			goto err_disable_unlock_all;
 		}
 
+		scx_set_task_sched(p, sch);
 		scx_set_task_state(p, SCX_TASK_READY);
 
 		put_task_struct(p);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 75b7f57e20ab..0612006019da 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1223,6 +1223,7 @@ enum scx_ops_state {
 #define SCX_OPSS_STATE_MASK	((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
 #define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)
 
+extern struct scx_sched __rcu *scx_root;
 DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
 
 /*
@@ -1243,3 +1244,61 @@ static inline bool scx_rq_bypassing(struct rq *rq)
 {
 	return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
 }
+
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_task_sched - Find scx_sched scheduling a task
+ * @p: task of interest
+ *
+ * Return @p's scheduler instance. Must be called with @p's pi_lock or rq lock
+ * held.
+ */
+static inline struct scx_sched *scx_task_sched(const struct task_struct *p)
+{
+	return rcu_dereference_protected(p->scx.sched,
+					 lockdep_is_held(&p->pi_lock) ||
+					 lockdep_is_held(__rq_lockp(task_rq(p))));
+}
+
+/**
+ * scx_task_sched_rcu - Find scx_sched scheduling a task
+ * @p: task of interest
+ *
+ * Return @p's scheduler instance. The returned scx_sched is RCU protected.
+ */
+static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p)
+{
+	return rcu_dereference_all(p->scx.sched);
+}
+
+/**
+ * scx_task_on_sched - Is a task on the specified sched?
+ * @sch: sched to test against
+ * @p: task of interest
+ *
+ * Returns %true if @p is on @sch, %false otherwise.
+ */
+static inline bool scx_task_on_sched(struct scx_sched *sch,
+				     const struct task_struct *p)
+{
+	return rcu_access_pointer(p->scx.sched) == sch;
+}
+#else	/* CONFIG_EXT_SUB_SCHED */
+static inline struct scx_sched *scx_task_sched(const struct task_struct *p)
+{
+	return rcu_dereference_protected(scx_root,
+					 lockdep_is_held(&p->pi_lock) ||
+					 lockdep_is_held(__rq_lockp(task_rq(p))));
+}
+
+static inline struct scx_sched *scx_task_sched_rcu(const struct task_struct *p)
+{
+	return rcu_dereference_all(scx_root);
+}
+
+static inline bool scx_task_on_sched(struct scx_sched *sch,
+				     const struct task_struct *p)
+{
+	return true;
+}
+#endif	/* CONFIG_EXT_SUB_SCHED */

From 105dcd005be2ac1d5541921db8feb1d0f98d59d5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 013/134] sched_ext: Introduce scx_prog_sched()

In preparation for multiple scheduler support, introduce scx_prog_sched()
accessor which returns the scx_sched instance associated with a BPF program.
The association is determined via the special KF_IMPLICIT_ARGS kfunc
parameter, which provides access to bpf_prog_aux. This aux can be used to
retrieve the struct_ops (sched_ext_ops) that the program is associated with,
and from there, the corresponding scx_sched instance.

For compatibility, when ops.sub_attach is not implemented (older schedulers
without sub-scheduler support), unassociated programs fall back to scx_root.
A warning is logged once per scheduler for such programs.

As scx_root is still the only scheduler, this shouldn't introduce
user-visible behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c                   | 157 ++++++++++++++++-----------
 kernel/sched/ext_idle.c              |  90 +++++++++------
 kernel/sched/ext_internal.h          |  44 +++++++-
 tools/sched_ext/include/scx/compat.h |  10 ++
 4 files changed, 199 insertions(+), 102 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d56539449f26..98b927aa20d1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5290,7 +5290,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
 	kthread_init_work(&sch->disable_work, scx_disable_workfn);
 	sch->ops = *ops;
-	ops->priv = sch;
+	rcu_assign_pointer(ops->priv, sch);
 
 	sch->kobj.kset = scx_kset;
 
@@ -6044,10 +6044,11 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
 {
 	struct sched_ext_ops *ops = kdata;
-	struct scx_sched *sch = ops->priv;
+	struct scx_sched *sch = rcu_dereference_protected(ops->priv, true);
 
 	scx_disable(sch, SCX_EXIT_UNREG);
 	kthread_flush_work(&sch->disable_work);
+	RCU_INIT_POINTER(ops->priv, NULL);
 	kobject_put(&sch->kobj);
 }
 
@@ -6511,6 +6512,7 @@ __bpf_kfunc_start_defs();
  * @dsq_id: DSQ to insert into
  * @slice: duration @p can run for in nsecs, 0 to keep the current value
  * @enq_flags: SCX_ENQ_*
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to
  * call this function spuriously. Can be called from ops.enqueue(),
@@ -6545,12 +6547,13 @@ __bpf_kfunc_start_defs();
  * to check the return value.
  */
 __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
-					 u64 slice, u64 enq_flags)
+					 u64 slice, u64 enq_flags,
+					 const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return false;
 
@@ -6571,9 +6574,10 @@ __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
  * COMPAT: Will be removed in v6.23 along with the ___v2 suffix.
  */
 __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id,
-					     u64 slice, u64 enq_flags)
+				    u64 slice, u64 enq_flags,
+				    const struct bpf_prog_aux *aux)
 {
-	scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags);
+	scx_bpf_dsq_insert___v2(p, dsq_id, slice, enq_flags, aux);
 }
 
 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
@@ -6610,6 +6614,7 @@ struct scx_bpf_dsq_insert_vtime_args {
  *       @args->slice: duration @p can run for in nsecs, 0 to keep the current value
  *       @args->vtime: @p's ordering inside the vtime-sorted queue of the target DSQ
  *       @args->enq_flags: SCX_ENQ_*
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
  * limit. BPF programs should use scx_bpf_dsq_insert_vtime() which is provided
@@ -6634,13 +6639,14 @@ struct scx_bpf_dsq_insert_vtime_args {
  */
 __bpf_kfunc bool
 __scx_bpf_dsq_insert_vtime(struct task_struct *p,
-			   struct scx_bpf_dsq_insert_vtime_args *args)
+			   struct scx_bpf_dsq_insert_vtime_args *args,
+			   const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return false;
 
@@ -6668,9 +6674,9 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
-BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_RCU)
-BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_dsq_insert___v2, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_dsq_insert_vtime, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
 
@@ -6770,16 +6776,17 @@ __bpf_kfunc_start_defs();
 
 /**
  * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Can only be called from ops.dispatch().
  */
-__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
+__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return 0;
 
@@ -6791,18 +6798,19 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
 
 /**
  * scx_bpf_dispatch_cancel - Cancel the latest dispatch
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Cancel the latest dispatch. Can be called multiple times to cancel further
  * dispatches. Can only be called from ops.dispatch().
  */
-__bpf_kfunc void scx_bpf_dispatch_cancel(void)
+__bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
 {
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return;
 
@@ -6818,6 +6826,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
 /**
  * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
  * @dsq_id: DSQ to move task from
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
  * local DSQ for execution. Can only be called from ops.dispatch().
@@ -6829,7 +6838,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
  * Returns %true if a task has been moved, %false if there isn't any task to
  * move.
  */
-__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
+__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux)
 {
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct scx_dispatch_q *dsq;
@@ -6837,7 +6846,7 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id)
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return false;
 
@@ -6964,9 +6973,9 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
-BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
-BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
@@ -7024,6 +7033,7 @@ __bpf_kfunc_start_defs();
 
 /**
  * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Iterate over all of the tasks currently enqueued on the local DSQ of the
  * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
@@ -7032,13 +7042,13 @@ __bpf_kfunc_start_defs();
  * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void
  * returning variant that can be called from anywhere.
  */
-__bpf_kfunc u32 scx_bpf_reenqueue_local(void)
+__bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 	struct rq *rq;
 
 	guard(rcu)();
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return 0;
 
@@ -7054,7 +7064,7 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_cpu_release)
-BTF_ID_FLAGS(func, scx_bpf_reenqueue_local)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local, KF_IMPLICIT_ARGS)
 BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
@@ -7068,11 +7078,12 @@ __bpf_kfunc_start_defs();
  * scx_bpf_create_dsq - Create a custom DSQ
  * @dsq_id: DSQ to create
  * @node: NUMA node to allocate from
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Create a custom DSQ identified by @dsq_id. Can be called from any sleepable
  * scx callback, and any BPF_PROG_TYPE_SYSCALL prog.
  */
-__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_aux *aux)
 {
 	struct scx_dispatch_q *dsq;
 	struct scx_sched *sch;
@@ -7091,7 +7102,7 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
 
 	rcu_read_lock();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (sch) {
 		init_dsq(dsq, dsq_id, sch);
 		ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
@@ -7109,7 +7120,7 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
-BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
@@ -7208,18 +7219,19 @@ out:
  * scx_bpf_kick_cpu - Trigger reschedule on a CPU
  * @cpu: cpu to kick
  * @flags: %SCX_KICK_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
  * trigger rescheduling on a busy CPU. This can be called from any online
  * scx_ops operation and the actual kicking is performed asynchronously through
  * an irq work.
  */
-__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (likely(sch))
 		scx_kick_cpu(sch, cpu, flags);
 }
@@ -7293,13 +7305,14 @@ __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
  * @it: iterator to initialize
  * @dsq_id: DSQ to iterate
  * @flags: %SCX_DSQ_ITER_*
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Initialize BPF iterator @it which can be used with bpf_for_each() to walk
  * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes
  * tasks which are already queued when this function is invoked.
  */
 __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
-				     u64 flags)
+				     u64 flags, const struct bpf_prog_aux *aux)
 {
 	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
 	struct scx_sched *sch;
@@ -7317,7 +7330,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
 	 */
 	kit->dsq = NULL;
 
-	sch = rcu_dereference_check(scx_root, rcu_read_lock_bh_held());
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return -ENODEV;
 
@@ -7406,6 +7419,7 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
 /**
  * scx_bpf_dsq_peek - Lockless peek at the first element.
  * @dsq_id: DSQ to examine.
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Read the first element in the DSQ. This is semantically equivalent to using
  * the DSQ iterator, but is lockfree. Of course, like any lockless operation,
@@ -7414,12 +7428,13 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it)
  *
  * Returns the pointer, or NULL indicates an empty queue OR internal error.
  */
-__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id)
+__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id,
+						 const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 	struct scx_dispatch_q *dsq;
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return NULL;
 
@@ -7491,18 +7506,20 @@ __bpf_kfunc_start_defs();
  * @fmt: error message format string
  * @data: format string parameters packaged using ___bpf_fill() macro
  * @data__sz: @data len, must end in '__sz' for the verifier
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
  * disabling.
  */
 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
-				   unsigned long long *data, u32 data__sz)
+				   unsigned long long *data, u32 data__sz,
+				   const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
-	sch = rcu_dereference_bh(scx_root);
+	sch = scx_prog_sched(aux);
 	if (likely(sch) &&
 	    bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
 		scx_exit(sch, SCX_EXIT_UNREG_BPF, exit_code, "%s", scx_exit_bstr_buf.line);
@@ -7514,18 +7531,19 @@ __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
  * @fmt: error message format string
  * @data: format string parameters packaged using ___bpf_fill() macro
  * @data__sz: @data len, must end in '__sz' for the verifier
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Indicate that the BPF scheduler encountered a fatal error and initiate ops
  * disabling.
  */
 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
-				    u32 data__sz)
+				    u32 data__sz, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
-	sch = rcu_dereference_bh(scx_root);
+	sch = scx_prog_sched(aux);
 	if (likely(sch) &&
 	    bstr_format(sch, &scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
 		scx_exit(sch, SCX_EXIT_ERROR_BPF, 0, "%s", scx_exit_bstr_buf.line);
@@ -7537,6 +7555,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
  * @fmt: format string
  * @data: format string parameters packaged using ___bpf_fill() macro
  * @data__sz: @data len, must end in '__sz' for the verifier
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
  * dump_task() to generate extra debug dump specific to the BPF scheduler.
@@ -7545,7 +7564,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
  * multiple calls. The last line is automatically terminated.
  */
 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
-				   u32 data__sz)
+				   u32 data__sz, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 	struct scx_dump_data *dd = &scx_dump_data;
@@ -7554,7 +7573,7 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return;
 
@@ -7611,18 +7630,19 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
 /**
  * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
  * @cpu: CPU of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Return the maximum relative capacity of @cpu in relation to the most
  * performant CPU in the system. The return value is in the range [1,
  * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur().
  */
-__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
 		return arch_scale_cpu_capacity(cpu);
 	else
@@ -7632,6 +7652,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
 /**
  * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU
  * @cpu: CPU of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Return the current relative performance of @cpu in relation to its maximum.
  * The return value is in the range [1, %SCX_CPUPERF_ONE].
@@ -7643,13 +7664,13 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu)
  *
  * The result is in the range [1, %SCX_CPUPERF_ONE].
  */
-__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (likely(sch) && ops_cpu_valid(sch, cpu, NULL))
 		return arch_scale_freq_capacity(cpu);
 	else
@@ -7660,6 +7681,7 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
  * scx_bpf_cpuperf_set - Set the relative performance target of a CPU
  * @cpu: CPU of interest
  * @perf: target performance level [0, %SCX_CPUPERF_ONE]
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Set the target performance level of @cpu to @perf. @perf is in linear
  * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
@@ -7670,13 +7692,13 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
  * use. Consult hardware and cpufreq documentation for more information. The
  * current performance level can be monitored using scx_bpf_cpuperf_cur().
  */
-__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
+__bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return;
 
@@ -7786,14 +7808,15 @@ __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
 /**
  * scx_bpf_cpu_rq - Fetch the rq of a CPU
  * @cpu: CPU of the rq
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  */
-__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
+__bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return NULL;
 
@@ -7812,18 +7835,19 @@ __bpf_kfunc struct rq *scx_bpf_cpu_rq(s32 cpu)
 
 /**
  * scx_bpf_locked_rq - Return the rq currently locked by SCX
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Returns the rq if a rq lock is currently held by SCX.
  * Otherwise emits an error and returns NULL.
  */
-__bpf_kfunc struct rq *scx_bpf_locked_rq(void)
+__bpf_kfunc struct rq *scx_bpf_locked_rq(const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 	struct rq *rq;
 
 	guard(preempt)();
 
-	sch = rcu_dereference_sched(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return NULL;
 
@@ -7839,16 +7863,17 @@ __bpf_kfunc struct rq *scx_bpf_locked_rq(void)
 /**
  * scx_bpf_cpu_curr - Return remote CPU's curr task
  * @cpu: CPU of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Callers must hold RCU read lock (KF_RCU).
  */
-__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
+__bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return NULL;
 
@@ -7861,6 +7886,7 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
 /**
  * scx_bpf_task_cgroup - Return the sched cgroup of a task
  * @p: task of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
  * from the scheduler's POV. SCX operations should use this function to
@@ -7870,7 +7896,8 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu)
  * operations. The restriction guarantees that @p's rq is locked by the caller.
  */
 #ifdef CONFIG_CGROUP_SCHED
-__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p,
+					       const struct bpf_prog_aux *aux)
 {
 	struct task_group *tg = p->sched_task_group;
 	struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
@@ -7878,7 +7905,7 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p)
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		goto out;
 
@@ -8011,20 +8038,20 @@ __bpf_kfunc_end_defs();
 BTF_KFUNCS_START(scx_kfunc_ids_any)
 BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU);
 BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU);
-BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
+BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
-BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL)
-BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
-BTF_ID_FLAGS(func, scx_bpf_exit_bstr)
-BTF_ID_FLAGS(func, scx_bpf_error_bstr)
-BTF_ID_FLAGS(func, scx_bpf_dump_bstr)
+BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
-BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
-BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
-BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
@@ -8032,11 +8059,11 @@ BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
-BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_RET_NULL)
-BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
 #ifdef CONFIG_CGROUP_SCHED
-BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE)
 #endif
 BTF_ID_FLAGS(func, scx_bpf_now)
 BTF_ID_FLAGS(func, scx_bpf_events)
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index ba298ac3ce6c..cc72146ee898 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -945,14 +945,15 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
  * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
  *		      trigger an error if @cpu is invalid
  * @cpu: target CPU
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  */
-__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
+__bpf_kfunc s32 scx_bpf_cpu_node(s32 cpu, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch) || !ops_cpu_valid(sch, cpu, NULL))
 		return NUMA_NO_NODE;
 	return cpu_to_node(cpu);
@@ -964,6 +965,7 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
  * @prev_cpu: CPU @p was on previously
  * @wake_flags: %SCX_WAKE_* flags
  * @is_idle: out parameter indicating whether the returned CPU is idle
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
  * context such as a BPF test_run() call, as long as built-in CPU selection
@@ -974,14 +976,15 @@ __bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
  * currently idle and thus a good candidate for direct dispatching.
  */
 __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
-				       u64 wake_flags, bool *is_idle)
+				       u64 wake_flags, bool *is_idle,
+				       const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 	s32 cpu;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return -ENODEV;
 
@@ -1009,6 +1012,7 @@ struct scx_bpf_select_cpu_and_args {
  *       @args->prev_cpu: CPU @p was on previously
  *       @args->wake_flags: %SCX_WAKE_* flags
  *       @args->flags: %SCX_PICK_IDLE* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
  * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
@@ -1027,13 +1031,14 @@ struct scx_bpf_select_cpu_and_args {
  */
 __bpf_kfunc s32
 __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
-			 struct scx_bpf_select_cpu_and_args *args)
+			 struct scx_bpf_select_cpu_and_args *args,
+			 const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return -ENODEV;
 
@@ -1063,18 +1068,20 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64
  * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
  * idle-tracking per-CPU cpumask of a target NUMA node.
  * @node: target NUMA node
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Returns an empty cpumask if idle tracking is not enabled, if @node is
  * not valid, or running on a UP kernel. In this case the actual error will
  * be reported to the BPF scheduler via scx_error().
  */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
+__bpf_kfunc const struct cpumask *
+scx_bpf_get_idle_cpumask_node(s32 node, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return cpu_none_mask;
 
@@ -1088,17 +1095,18 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
 /**
  * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
  * per-CPU cpumask.
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Returns an empty mask if idle tracking is not enabled, or running on a
  * UP kernel.
  */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return cpu_none_mask;
 
@@ -1118,18 +1126,20 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
  * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
  * used to determine if an entire physical core is free.
  * @node: target NUMA node
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Returns an empty cpumask if idle tracking is not enabled, if @node is
  * not valid, or running on a UP kernel. In this case the actual error will
  * be reported to the BPF scheduler via scx_error().
  */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
+__bpf_kfunc const struct cpumask *
+scx_bpf_get_idle_smtmask_node(s32 node, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return cpu_none_mask;
 
@@ -1147,17 +1157,18 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
  * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
  * per-physical-core cpumask. Can be used to determine if an entire physical
  * core is free.
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Returns an empty mask if idle tracking is not enabled, or running on a
  * UP kernel.
  */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return cpu_none_mask;
 
@@ -1193,6 +1204,7 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
 /**
  * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
  * @cpu: cpu to test and clear idle for
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Returns %true if @cpu was idle and its idle state was successfully cleared.
  * %false otherwise.
@@ -1200,13 +1212,13 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
  * Unavailable if ops.update_idle() is implemented and
  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
  */
-__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return false;
 
@@ -1224,6 +1236,7 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
  * @cpus_allowed: Allowed cpumask
  * @node: target NUMA node
  * @flags: %SCX_PICK_IDLE_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
  *
@@ -1239,13 +1252,14 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
  * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set.
  */
 __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
-					   int node, u64 flags)
+					   s32 node, u64 flags,
+					   const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return -ENODEV;
 
@@ -1260,6 +1274,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
  * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
  * @cpus_allowed: Allowed cpumask
  * @flags: %SCX_PICK_IDLE_CPU_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
  * number on success. -%EBUSY if no matching cpu was found.
@@ -1279,13 +1294,13 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
  * scx_bpf_pick_idle_cpu_node() instead.
  */
 __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
-				      u64 flags)
+				      u64 flags, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return -ENODEV;
 
@@ -1306,6 +1321,7 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
  * @cpus_allowed: Allowed cpumask
  * @node: target NUMA node
  * @flags: %SCX_PICK_IDLE_CPU_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
  * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
@@ -1322,14 +1338,15 @@ __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
  * CPU.
  */
 __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
-					  int node, u64 flags)
+					  s32 node, u64 flags,
+					  const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 	s32 cpu;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return -ENODEV;
 
@@ -1355,6 +1372,7 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
  * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
  * @cpus_allowed: Allowed cpumask
  * @flags: %SCX_PICK_IDLE_CPU_* flags
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
  * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
@@ -1369,14 +1387,14 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
  * scx_bpf_pick_any_cpu_node() instead.
  */
 __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
-				     u64 flags)
+				     u64 flags, const struct bpf_prog_aux *aux)
 {
 	struct scx_sched *sch;
 	s32 cpu;
 
 	guard(rcu)();
 
-	sch = rcu_dereference(scx_root);
+	sch = scx_prog_sched(aux);
 	if (unlikely(!sch))
 		return -ENODEV;
 
@@ -1401,20 +1419,20 @@ __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_idle)
-BTF_ID_FLAGS(func, scx_bpf_cpu_node)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE)
-BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
 BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
-BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
-BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
-BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_idle)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 0612006019da..4ee7c427948a 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -844,7 +844,7 @@ struct sched_ext_ops {
 	char name[SCX_OPS_NAME_LEN];
 
 	/* internal use only, must be NULL */
-	void *priv;
+	void __rcu *priv;
 };
 
 enum scx_opi {
@@ -946,6 +946,7 @@ struct scx_sched {
 	 */
 	bool			warned_zero_slice:1;
 	bool			warned_deprecated_rq:1;
+	bool			warned_unassoc_progs:1;
 
 	struct list_head	all;
 
@@ -1283,6 +1284,42 @@ static inline bool scx_task_on_sched(struct scx_sched *sch,
 {
 	return rcu_access_pointer(p->scx.sched) == sch;
 }
+
+/**
+ * scx_prog_sched - Find scx_sched associated with a BPF prog
+ * @aux: aux passed in from BPF to a kfunc
+ *
+ * To be called from kfuncs. Return the scheduler instance associated with the
+ * BPF program given the implicit kfunc argument aux. The returned scx_sched is
+ * RCU protected.
+ */
+static inline struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux)
+{
+	struct sched_ext_ops *ops;
+	struct scx_sched *root;
+
+	ops = bpf_prog_get_assoc_struct_ops(aux);
+	if (likely(ops))
+		return rcu_dereference_all(ops->priv);
+
+	root = rcu_dereference_all(scx_root);
+	if (root) {
+		/*
+		 * COMPAT-v6.19: Schedulers built before sub-sched support was
+		 * introduced may have unassociated non-struct_ops programs.
+		 */
+		if (!root->ops.sub_attach)
+			return root;
+
+		if (!root->warned_unassoc_progs) {
+			printk_deferred(KERN_WARNING "sched_ext: Unassociated program %s (id %d)\n",
+					aux->name, aux->id);
+			root->warned_unassoc_progs = true;
+		}
+	}
+
+	return NULL;
+}
 #else	/* CONFIG_EXT_SUB_SCHED */
 static inline struct scx_sched *scx_task_sched(const struct task_struct *p)
 {
@@ -1301,4 +1338,9 @@ static inline bool scx_task_on_sched(struct scx_sched *sch,
 {
 	return true;
 }
+
+static struct scx_sched *scx_prog_sched(const struct bpf_prog_aux *aux)
+{
+	return rcu_dereference_all(scx_root);
+}
 #endif	/* CONFIG_EXT_SUB_SCHED */
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index edccc99c7294..9b6df13b187b 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -183,8 +183,18 @@ static inline long scx_hotplug_seq(void)
 })
 
 #define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({		\
+	struct bpf_program *__prog;						\
 	UEI_SET_SIZE(__skel, __ops_name, __uei_name);				\
 	SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel");	\
+	bpf_object__for_each_program(__prog, (__skel)->obj) {			\
+		if (bpf_program__type(__prog) == BPF_PROG_TYPE_STRUCT_OPS)	\
+			continue;						\
+		s32 err = bpf_program__assoc_struct_ops(__prog,			\
+					(__skel)->maps.__ops_name, NULL);	\
+		if (err)							\
+			fprintf(stderr, "ERROR: Failed to associate %s with %s: %d\n", \
+				bpf_program__name(__prog), #__ops_name, err);	\
+	}									\
 })
 
 /*

From a5fa0708cbfda4d3c2c6a447de7c4b0b23595527 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 014/134] sched_ext: Enforce scheduling authority in dispatch
 and select_cpu operations

Add checks to enforce scheduling authority boundaries when multiple
schedulers are present:

1. In scx_dsq_insert_preamble() and the dispatch retry path, ignore attempts
   to insert tasks that the scheduler doesn't own, counting them via
   SCX_EV_INSERT_NOT_OWNED. As BPF schedulers are allowed to ignore
   dequeues, such attempts can occur legitimately during sub-scheduler
   enabling when tasks move between schedulers. The counter helps distinguish
   normal cases from scheduler bugs.

2. For scx_bpf_dsq_insert_vtime() and scx_bpf_select_cpu_and(), error out
   when sub-schedulers are attached. These functions lack the aux__prog
   parameter needed to identify the calling scheduler, so they cannot be used
   safely with multiple schedulers. BPF programs should use the arg-wrapped
   versions (__scx_bpf_dsq_insert_vtime() and __scx_bpf_select_cpu_and())
   instead.

These checks ensure that with multiple concurrent schedulers, scheduler
identity can be properly determined and unauthorized task operations are
prevented or tracked.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 26 ++++++++++++++++++++++++++
 kernel/sched/ext_idle.c     | 11 +++++++++++
 kernel/sched/ext_internal.h | 12 ++++++++++++
 3 files changed, 49 insertions(+)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 98b927aa20d1..bfe0f0c38ef7 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2325,6 +2325,12 @@ retry:
 		if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
 			return;
 
+		/* see SCX_EV_INSERT_NOT_OWNED definition */
+		if (unlikely(!scx_task_on_sched(sch, p))) {
+			__scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1);
+			return;
+		}
+
 		/*
 		 * While we know @p is accessible, we don't yet have a claim on
 		 * it - the BPF scheduler is allowed to dispatch tasks
@@ -4028,6 +4034,7 @@ static ssize_t scx_attr_events_show(struct kobject *kobj,
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED);
 	return at;
 }
 SCX_ATTR(events);
@@ -5150,6 +5157,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 	scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
+	scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED);
 
 	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
 		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
@@ -6476,6 +6484,12 @@ static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p
 		return false;
 	}
 
+	/* see SCX_EV_INSERT_NOT_OWNED definition */
+	if (unlikely(!scx_task_on_sched(sch, p))) {
+		__scx_add_event(sch, SCX_EV_INSERT_NOT_OWNED, 1);
+		return false;
+	}
+
 	return true;
 }
 
@@ -6668,6 +6682,17 @@ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id,
 	if (unlikely(!sch))
 		return;
 
+#ifdef CONFIG_EXT_SUB_SCHED
+	/*
+	 * Disallow if any sub-scheds are attached. There is no way to tell
+	 * which scheduler called us, just error out @p's scheduler.
+	 */
+	if (unlikely(!list_empty(&sch->children))) {
+		scx_error(scx_task_sched(p), "__scx_bpf_dsq_insert_vtime() must be used");
+		return;
+	}
+#endif
+
 	scx_dsq_insert_vtime(sch, p, dsq_id, slice, vtime, enq_flags);
 }
 
@@ -8000,6 +8025,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+		scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED);
 	}
 }
 
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index cc72146ee898..9f6abee1e234 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -1060,6 +1060,17 @@ __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64
 	if (unlikely(!sch))
 		return -ENODEV;
 
+#ifdef CONFIG_EXT_SUB_SCHED
+	/*
+	 * Disallow if any sub-scheds are attached. There is no way to tell
+	 * which scheduler called us, just error out @p's scheduler.
+	 */
+	if (unlikely(!list_empty(&sch->children))) {
+		scx_error(scx_task_sched(p), "__scx_bpf_select_cpu_and() must be used");
+		return -EINVAL;
+	}
+#endif
+
 	return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags,
 				     cpus_allowed, flags);
 }
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 4ee7c427948a..026bfdd0e11d 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -911,6 +911,18 @@ struct scx_event_stats {
 	 * The number of times the bypassing mode has been activated.
 	 */
 	s64		SCX_EV_BYPASS_ACTIVATE;
+
+	/*
+	 * The number of times the scheduler attempted to insert a task that it
+	 * doesn't own into a DSQ. Such attempts are ignored.
+	 *
+	 * As BPF schedulers are allowed to ignore dequeues, it's difficult to
+	 * tell whether such an attempt is from a scheduler malfunction or an
+	 * ignored dequeue around sub-sched enabling. If this count keeps going
+	 * up regardless of sub-sched enabling, it likely indicates a bug in the
+	 * scheduler.
+	 */
+	s64		SCX_EV_INSERT_NOT_OWNED;
 };
 
 struct scx_sched_pcpu {

From 245d09c594ea40dbd4b8f989f2422dbc8e65a61c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 015/134] sched_ext: Enforce scheduler ownership when updating
 slice and dsq_vtime

scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() now verify that
the calling scheduler has authority over the task before allowing updates.
This prevents schedulers from modifying tasks that don't belong to them in
hierarchical scheduling configurations.

Direct writes to p->scx.slice and p->scx.dsq_vtime are deprecated and now
trigger warnings. They will be disallowed in a future release.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bfe0f0c38ef7..beb0e3443209 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5945,12 +5945,17 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
 
 	t = btf_type_by_id(reg->btf, reg->btf_id);
 	if (t == task_struct_type) {
-		if (off >= offsetof(struct task_struct, scx.slice) &&
-		    off + size <= offsetofend(struct task_struct, scx.slice))
-			return SCALAR_VALUE;
-		if (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
-		    off + size <= offsetofend(struct task_struct, scx.dsq_vtime))
+		/*
+		 * COMPAT: Will be removed in v6.23.
+		 */
+		if ((off >= offsetof(struct task_struct, scx.slice) &&
+		     off + size <= offsetofend(struct task_struct, scx.slice)) ||
+		    (off >= offsetof(struct task_struct, scx.dsq_vtime) &&
+		     off + size <= offsetofend(struct task_struct, scx.dsq_vtime))) {
+			pr_warn("sched_ext: Writing directly to p->scx.slice/dsq_vtime is deprecated, use scx_bpf_task_set_slice/dsq_vtime()");
 			return SCALAR_VALUE;
+		}
+
 		if (off >= offsetof(struct task_struct, scx.disallow) &&
 		    off + size <= offsetofend(struct task_struct, scx.disallow))
 			return SCALAR_VALUE;
@@ -7163,12 +7168,21 @@ __bpf_kfunc_start_defs();
  * scx_bpf_task_set_slice - Set task's time slice
  * @p: task of interest
  * @slice: time slice to set in nsecs
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Set @p's time slice to @slice. Returns %true on success, %false if the
  * calling scheduler doesn't have authority over @p.
  */
-__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
+__bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice,
+					const struct bpf_prog_aux *aux)
 {
+	struct scx_sched *sch;
+
+	guard(rcu)();
+	sch = scx_prog_sched(aux);
+	if (unlikely(!scx_task_on_sched(sch, p)))
+		return false;
+
 	p->scx.slice = slice;
 	return true;
 }
@@ -7177,12 +7191,21 @@ __bpf_kfunc bool scx_bpf_task_set_slice(struct task_struct *p, u64 slice)
  * scx_bpf_task_set_dsq_vtime - Set task's virtual time for DSQ ordering
  * @p: task of interest
  * @vtime: virtual time to set
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Set @p's virtual time to @vtime. Returns %true on success, %false if the
  * calling scheduler doesn't have authority over @p.
  */
-__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime)
+__bpf_kfunc bool scx_bpf_task_set_dsq_vtime(struct task_struct *p, u64 vtime,
+					    const struct bpf_prog_aux *aux)
 {
+	struct scx_sched *sch;
+
+	guard(rcu)();
+	sch = scx_prog_sched(aux);
+	if (unlikely(!scx_task_on_sched(sch, p)))
+		return false;
+
 	p->scx.dsq_vtime = vtime;
 	return true;
 }
@@ -8062,8 +8085,8 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_any)
-BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_RCU);
-BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_RCU);
+BTF_ID_FLAGS(func, scx_bpf_task_set_slice, KF_IMPLICIT_ARGS | KF_RCU);
+BTF_ID_FLAGS(func, scx_bpf_task_set_dsq_vtime, KF_IMPLICIT_ARGS | KF_RCU);
 BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)

From bb4d9fd551588165dc918aa2f2108b939e3367db Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 016/134] sched_ext: scx_dsq_move() should validate the task
 belongs to the right scheduler

scx_bpf_dsq_move[_vtime]() calls scx_dsq_move() to move task from a DSQ to
another. However, @p doesn't necessarily have to come form the containing
iteration and can thus be a task which belongs to another scx_sched. Verify
that @p is on the same scx_sched as the DSQ being iterated.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index beb0e3443209..7d74f7b119ca 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6718,8 +6718,8 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 			 struct task_struct *p, u64 dsq_id, u64 enq_flags)
 {
-	struct scx_sched *sch = scx_root;
 	struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq;
+	struct scx_sched *sch = src_dsq->sched;
 	struct rq *this_rq, *src_rq, *locked_rq;
 	bool dispatched = false;
 	bool in_balance;
@@ -6736,6 +6736,11 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	if (unlikely(READ_ONCE(scx_aborting)))
 		return false;
 
+	if (unlikely(!scx_task_on_sched(sch, p))) {
+		scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler",
+			  p->comm, p->pid);
+	}
+
 	/*
 	 * Can be called from either ops.dispatch() locking this_rq() or any
 	 * context where no rq lock is held. If latter, lock @p's task_rq which

From 073d4f0667b064ed05c19e44d840b0d4cd49a251 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 017/134] sched_ext: Refactor task init/exit helpers

- Add the @sch parameter to scx_init_task() and drop @tg as it can be
  obtained from @p. Separate out __scx_init_task() which does everything
  except for the task state transition.

- Add the @sch parameter to scx_enable_task(). Separate out
  __scx_enable_task() which does everything except for the task state
  transition.

- Add the @sch parameter to scx_disable_task().

- Rename scx_exit_task() to scx_disable_and_exit_task() and separate out
  __scx_disable_and_exit_task() which does everything except for the task
  state transition.

While some task state transitions are relocated, no meaningful behavior
changes are expected.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 68 ++++++++++++++++++++++++++++++----------------
 1 file changed, 45 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7d74f7b119ca..7741456e3cb0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3111,16 +3111,15 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
 	p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
 }
 
-static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork)
+static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
 {
-	struct scx_sched *sch = scx_root;
 	int ret;
 
 	p->scx.disallow = false;
 
 	if (SCX_HAS_OP(sch, init_task)) {
 		struct scx_init_task_args args = {
-			SCX_INIT_TASK_ARGS_CGROUP(tg)
+			SCX_INIT_TASK_ARGS_CGROUP(task_group(p))
 			.fork = fork,
 		};
 
@@ -3132,8 +3131,6 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
 		}
 	}
 
-	scx_set_task_state(p, SCX_TASK_INIT);
-
 	if (p->scx.disallow) {
 		if (unlikely(scx_parent(sch))) {
 			scx_error(sch, "non-root ops.init_task() set task->scx.disallow for %s[%d]",
@@ -3163,13 +3160,27 @@ static int scx_init_task(struct task_struct *p, struct task_group *tg, bool fork
 		}
 	}
 
-	p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
 	return 0;
 }
 
-static void scx_enable_task(struct task_struct *p)
+static int scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
+{
+	int ret;
+
+	ret = __scx_init_task(sch, p, fork);
+	if (!ret) {
+		/*
+		 * While @p's rq is not locked. @p is not visible to the rest of
+		 * SCX yet and it's safe to update the flags and state.
+		 */
+		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
+		scx_set_task_state(p, SCX_TASK_INIT);
+	}
+	return ret;
+}
+
+static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p)
 {
-	struct scx_sched *sch = scx_root;
 	struct rq *rq = task_rq(p);
 	u32 weight;
 
@@ -3195,16 +3206,20 @@ static void scx_enable_task(struct task_struct *p)
 
 	if (SCX_HAS_OP(sch, enable))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
-	scx_set_task_state(p, SCX_TASK_ENABLED);
 
 	if (SCX_HAS_OP(sch, set_weight))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
 				 p, p->scx.weight);
 }
 
-static void scx_disable_task(struct task_struct *p)
+static void scx_enable_task(struct scx_sched *sch, struct task_struct *p)
+{
+	__scx_enable_task(sch, p);
+	scx_set_task_state(p, SCX_TASK_ENABLED);
+}
+
+static void scx_disable_task(struct scx_sched *sch, struct task_struct *p)
 {
-	struct scx_sched *sch = scx_root;
 	struct rq *rq = task_rq(p);
 
 	lockdep_assert_rq_held(rq);
@@ -3222,9 +3237,9 @@ static void scx_disable_task(struct task_struct *p)
 	WARN_ON_ONCE(p->scx.flags & SCX_TASK_IN_CUSTODY);
 }
 
-static void scx_exit_task(struct task_struct *p)
+static void __scx_disable_and_exit_task(struct scx_sched *sch,
+					struct task_struct *p)
 {
-	struct scx_sched *sch = scx_task_sched(p);
 	struct scx_exit_task_args args = {
 		.cancelled = false,
 	};
@@ -3241,7 +3256,7 @@ static void scx_exit_task(struct task_struct *p)
 	case SCX_TASK_READY:
 		break;
 	case SCX_TASK_ENABLED:
-		scx_disable_task(p);
+		scx_disable_task(sch, p);
 		break;
 	default:
 		WARN_ON_ONCE(true);
@@ -3251,6 +3266,13 @@ static void scx_exit_task(struct task_struct *p)
 	if (SCX_HAS_OP(sch, exit_task))
 		SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
 				 p, &args);
+}
+
+static void scx_disable_and_exit_task(struct scx_sched *sch,
+				      struct task_struct *p)
+{
+	__scx_disable_and_exit_task(sch, p);
+
 	scx_set_task_sched(p, NULL);
 	scx_set_task_state(p, SCX_TASK_NONE);
 }
@@ -3286,7 +3308,7 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 	percpu_rwsem_assert_held(&scx_fork_rwsem);
 
 	if (scx_init_task_enabled) {
-		ret = scx_init_task(p, task_group(p), true);
+		ret = scx_init_task(scx_root, p, true);
 		if (!ret)
 			scx_set_task_sched(p, scx_root);
 		return ret;
@@ -3310,7 +3332,7 @@ void scx_post_fork(struct task_struct *p)
 			struct rq *rq;
 
 			rq = task_rq_lock(p, &rf);
-			scx_enable_task(p);
+			scx_enable_task(scx_task_sched(p), p);
 			task_rq_unlock(rq, p, &rf);
 		}
 	}
@@ -3330,7 +3352,7 @@ void scx_cancel_fork(struct task_struct *p)
 
 		rq = task_rq_lock(p, &rf);
 		WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
-		scx_exit_task(p);
+		scx_disable_and_exit_task(scx_task_sched(p), p);
 		task_rq_unlock(rq, p, &rf);
 	}
 
@@ -3389,7 +3411,7 @@ void sched_ext_dead(struct task_struct *p)
 		struct rq *rq;
 
 		rq = task_rq_lock(p, &rf);
-		scx_exit_task(p);
+		scx_disable_and_exit_task(scx_task_sched(p), p);
 		task_rq_unlock(rq, p, &rf);
 	}
 }
@@ -3421,7 +3443,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 	if (task_dead_and_done(p))
 		return;
 
-	scx_enable_task(p);
+	scx_enable_task(sch, p);
 
 	/*
 	 * set_cpus_allowed_scx() is not called while @p is associated with a
@@ -3437,7 +3459,7 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
 	if (task_dead_and_done(p))
 		return;
 
-	scx_disable_task(p);
+	scx_disable_task(scx_task_sched(p), p);
 }
 
 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
@@ -4681,7 +4703,7 @@ static void scx_root_disable(struct scx_sched *sch)
 
 	/*
 	 * Shut down cgroup support before tasks so that the cgroup attach path
-	 * doesn't race against scx_exit_task().
+	 * doesn't race against scx_disable_and_exit_task().
 	 */
 	scx_cgroup_lock();
 	scx_cgroup_exit(sch);
@@ -4710,7 +4732,7 @@ static void scx_root_disable(struct scx_sched *sch)
 			p->sched_class = new_class;
 		}
 
-		scx_exit_task(p);
+		scx_disable_and_exit_task(scx_task_sched(p), p);
 	}
 	scx_task_iter_stop(&sti);
 
@@ -5595,7 +5617,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 
 		scx_task_iter_unlock(&sti);
 
-		ret = scx_init_task(p, task_group(p), false);
+		ret = scx_init_task(sch, p, false);
 		if (ret) {
 			put_task_struct(p);
 			scx_task_iter_stop(&sti);

From 41346d68d0aa79a86374c57164c92ce136b6b723 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 018/134] sched_ext: Make scx_prio_less() handle multiple
 schedulers

Call ops.core_sched_before() iff both tasks belong to the same scx_sched.
Otherwise, use timestamp based ordering.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7741456e3cb0..e1fc2b1fc779 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2809,16 +2809,17 @@ void ext_server_init(struct rq *rq)
 bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 		   bool in_fi)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch_a = scx_task_sched(a);
+	struct scx_sched *sch_b = scx_task_sched(b);
 
 	/*
 	 * The const qualifiers are dropped from task_struct pointers when
 	 * calling ops.core_sched_before(). Accesses are controlled by the
 	 * verifier.
 	 */
-	if (SCX_HAS_OP(sch, core_sched_before) &&
+	if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) &&
 	    !scx_rq_bypassing(task_rq(a)))
-		return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, core_sched_before,
+		return SCX_CALL_OP_2TASKS_RET(sch_a, SCX_KF_REST, core_sched_before,
 					      NULL,
 					      (struct task_struct *)a,
 					      (struct task_struct *)b);

From e1cccf365ef4b8927d002e424e95fd4e04e2d966 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 019/134] sched_ext: Move default slice to per-scheduler field

The default time slice was stored in the global scx_slice_dfl variable which
was dynamically modified when entering and exiting bypass mode. With
hierarchical scheduling, each scheduler instance needs its own default slice
configuration so that bypass operations on one scheduler don't affect others.

Move slice_dfl into struct scx_sched and update all access sites. The bypass
logic now modifies the root scheduler's slice_dfl. At task initialization in
init_scx_entity(), use the SCX_SLICE_DFL constant directly since the task may
not yet be associated with a specific scheduler.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 14 ++++++++------
 kernel/sched/ext_internal.h |  1 +
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index e1fc2b1fc779..a73a5957e9d9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -164,7 +164,6 @@ static struct kset *scx_kset;
  * There usually is no reason to modify these as normal scheduler operation
  * shouldn't be affected by them. The knobs are primarily for debugging.
  */
-static u64 scx_slice_dfl = SCX_SLICE_DFL;
 static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
 static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
 
@@ -1135,7 +1134,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
 
 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
 {
-	p->scx.slice = READ_ONCE(scx_slice_dfl);
+	p->scx.slice = READ_ONCE(sch->slice_dfl);
 	__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
 }
 
@@ -3288,7 +3287,7 @@ void init_scx_entity(struct sched_ext_entity *scx)
 	INIT_LIST_HEAD(&scx->runnable_node);
 	scx->runnable_at = jiffies;
 	scx->ddsp_dsq_id = SCX_DSQ_INVALID;
-	scx->slice = READ_ONCE(scx_slice_dfl);
+	scx->slice = SCX_SLICE_DFL;
 }
 
 void scx_pre_fork(struct task_struct *p)
@@ -4449,6 +4448,8 @@ static void scx_bypass(bool bypass)
 
 	raw_spin_lock_irqsave(&bypass_lock, flags);
 	sch = rcu_dereference_bh(scx_root);
+	if (!sch)
+		goto unlock;
 
 	if (bypass) {
 		u32 intv_us;
@@ -4457,7 +4458,7 @@ static void scx_bypass(bool bypass)
 		WARN_ON_ONCE(scx_bypass_depth <= 0);
 		if (scx_bypass_depth != 1)
 			goto unlock;
-		WRITE_ONCE(scx_slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
+		WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
 		bypass_timestamp = ktime_get_ns();
 		if (sch)
 			scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
@@ -4473,7 +4474,7 @@ static void scx_bypass(bool bypass)
 		WARN_ON_ONCE(scx_bypass_depth < 0);
 		if (scx_bypass_depth != 0)
 			goto unlock;
-		WRITE_ONCE(scx_slice_dfl, SCX_SLICE_DFL);
+		WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
 		if (sch)
 			scx_add_event(sch, SCX_EV_BYPASS_DURATION,
 				      ktime_get_ns() - bypass_timestamp);
@@ -5317,6 +5318,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	sch->ancestors[level] = sch;
 	sch->level = level;
 
+	sch->slice_dfl = SCX_SLICE_DFL;
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
 	init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
 	kthread_init_work(&sch->disable_work, scx_disable_workfn);
@@ -5662,7 +5664,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 			queue_flags |= DEQUEUE_CLASS;
 
 		scoped_guard (sched_change, p, queue_flags) {
-			p->scx.slice = READ_ONCE(scx_slice_dfl);
+			p->scx.slice = READ_ONCE(sch->slice_dfl);
 			p->sched_class = new_class;
 		}
 	}
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 026bfdd0e11d..6c1eeaaa41db 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -950,6 +950,7 @@ struct scx_sched {
 	struct scx_dispatch_q	**global_dsqs;
 	struct scx_sched_pcpu __percpu *pcpu;
 
+	u64			slice_dfl;
 	s32			level;
 
 	/*

From c1743da43cf52caa412413af06eb56a547086c7a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 020/134] sched_ext: Move aborting flag to per-scheduler field

The abort state was tracked in the global scx_aborting flag which was used to
break out of potential live-lock scenarios when an error occurs. With
hierarchical scheduling, each scheduler instance must track its own abort
state independently so that an aborting scheduler doesn't interfere with
others.

Move the aborting flag into struct scx_sched and update all access sites. The
early initialization check in scx_root_enable() that warned about residual
aborting state is no longer needed as each scheduler instance now starts with
a clean state.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 10 +++-------
 kernel/sched/ext_internal.h |  1 +
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a73a5957e9d9..47b51bd31a30 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -44,7 +44,6 @@ static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
 static int scx_bypass_depth;
 static cpumask_var_t scx_bypass_lb_donee_cpumask;
 static cpumask_var_t scx_bypass_lb_resched_cpumask;
-static bool scx_aborting;
 static bool scx_init_task_enabled;
 static bool scx_switching_all;
 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -2151,7 +2150,7 @@ retry:
 		 * the system into the bypass mode. This can easily live-lock the
 		 * machine. If aborting, exit from all non-bypass DSQs.
 		 */
-		if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS)
+		if (unlikely(READ_ONCE(sch->aborting)) && dsq->id != SCX_DSQ_BYPASS)
 			break;
 
 		if (rq == task_rq) {
@@ -4677,7 +4676,6 @@ static void scx_root_disable(struct scx_sched *sch)
 
 	/* guarantee forward progress and wait for descendants to be disabled */
 	scx_bypass(true);
-	WRITE_ONCE(scx_aborting, false);
 	drain_descendants(sch);
 
 	switch (scx_set_enable_state(SCX_DISABLING)) {
@@ -4838,7 +4836,7 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
 	 * flag to break potential live-lock scenarios, ensuring we can
 	 * successfully reach scx_bypass().
 	 */
-	WRITE_ONCE(scx_aborting, true);
+	WRITE_ONCE(sch->aborting, true);
 
 	/*
 	 * Propagate exits to descendants immediately. Each has a dedicated
@@ -5485,8 +5483,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	 */
 	WARN_ON_ONCE(scx_set_enable_state(SCX_ENABLING) != SCX_DISABLED);
 	WARN_ON_ONCE(scx_root);
-	if (WARN_ON_ONCE(READ_ONCE(scx_aborting)))
-		WRITE_ONCE(scx_aborting, false);
 
 	atomic_long_set(&scx_nr_rejected, 0);
 
@@ -6758,7 +6754,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	 * If the BPF scheduler keeps calling this function repeatedly, it can
 	 * cause similar live-lock conditions as consume_dispatch_q().
 	 */
-	if (unlikely(READ_ONCE(scx_aborting)))
+	if (unlikely(READ_ONCE(sch->aborting)))
 		return false;
 
 	if (unlikely(!scx_task_on_sched(sch, p))) {
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 6c1eeaaa41db..87f19aed47fb 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -951,6 +951,7 @@ struct scx_sched {
 	struct scx_sched_pcpu __percpu *pcpu;
 
 	u64			slice_dfl;
+	bool			aborting;
 	s32			level;
 
 	/*

From ff06f727a9412b3c9f2f13f1441a5a0d2a31366b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 021/134] sched_ext: Move bypass_dsq into scx_sched_pcpu

To support bypass mode for sub-schedulers, move bypass_dsq from struct scx_rq
to struct scx_sched_pcpu. Add bypass_dsq() helper. Move bypass_dsq
initialization from init_sched_ext_class() to scx_alloc_and_attach_sched().
bypass_lb_cpu() now takes a CPU number instead of rq pointer. All callers
updated. No behavior change as all tasks use the root scheduler.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 52 +++++++++++++++++++------------------
 kernel/sched/ext_internal.h |  2 ++
 kernel/sched/sched.h        |  1 -
 3 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 47b51bd31a30..a371877c40ba 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -359,6 +359,11 @@ static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
 	return __setscheduler_class(p->policy, p->prio);
 }
 
+static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
+{
+	return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
+}
+
 /*
  * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
  * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -1632,7 +1637,7 @@ global:
 	dsq = find_global_dsq(sch, p);
 	goto enqueue;
 bypass:
-	dsq = &task_rq(p)->scx.bypass_dsq;
+	dsq = bypass_dsq(sch, task_cpu(p));
 	goto enqueue;
 
 enqueue:
@@ -2443,7 +2448,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		goto has_tasks;
 
 	if (scx_rq_bypassing(rq)) {
-		if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+		if (consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu_of(rq))))
 			goto has_tasks;
 		else
 			goto no_tasks;
@@ -4210,11 +4215,12 @@ bool scx_hardlockup(int cpu)
 	return true;
 }
 
-static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor,
 			 struct cpumask *donee_mask, struct cpumask *resched_mask,
 			 u32 nr_donor_target, u32 nr_donee_target)
 {
-	struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+	struct rq *donor_rq = cpu_rq(donor);
+	struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor);
 	struct task_struct *p, *n;
 	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
 	s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
@@ -4230,7 +4236,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
 	if (delta < DIV_ROUND_UP(min_delta_us, READ_ONCE(scx_slice_bypass_us)))
 		return 0;
 
-	raw_spin_rq_lock_irq(rq);
+	raw_spin_rq_lock_irq(donor_rq);
 	raw_spin_lock(&donor_dsq->lock);
 	list_add(&cursor.node, &donor_dsq->list);
 resume:
@@ -4238,7 +4244,6 @@ resume:
 	n = nldsq_next_task(donor_dsq, n, false);
 
 	while ((p = n)) {
-		struct rq *donee_rq;
 		struct scx_dispatch_q *donee_dsq;
 		int donee;
 
@@ -4254,14 +4259,13 @@ resume:
 		if (donee >= nr_cpu_ids)
 			continue;
 
-		donee_rq = cpu_rq(donee);
-		donee_dsq = &donee_rq->scx.bypass_dsq;
+		donee_dsq = bypass_dsq(sch, donee);
 
 		/*
 		 * $p's rq is not locked but $p's DSQ lock protects its
 		 * scheduling properties making this test safe.
 		 */
-		if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+		if (!task_can_run_on_remote_rq(sch, p, cpu_rq(donee), false))
 			continue;
 
 		/*
@@ -4276,7 +4280,7 @@ resume:
 		 * between bypass DSQs.
 		 */
 		dispatch_dequeue_locked(p, donor_dsq);
-		dispatch_enqueue(sch, donee_rq, donee_dsq, p, SCX_ENQ_NESTED);
+		dispatch_enqueue(sch, cpu_rq(donee), donee_dsq, p, SCX_ENQ_NESTED);
 
 		/*
 		 * $donee might have been idle and need to be woken up. No need
@@ -4291,9 +4295,9 @@ resume:
 		if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
 			list_move_tail(&cursor.node, &n->scx.dsq_list.node);
 			raw_spin_unlock(&donor_dsq->lock);
-			raw_spin_rq_unlock_irq(rq);
+			raw_spin_rq_unlock_irq(donor_rq);
 			cpu_relax();
-			raw_spin_rq_lock_irq(rq);
+			raw_spin_rq_lock_irq(donor_rq);
 			raw_spin_lock(&donor_dsq->lock);
 			goto resume;
 		}
@@ -4301,7 +4305,7 @@ resume:
 
 	list_del_init(&cursor.node);
 	raw_spin_unlock(&donor_dsq->lock);
-	raw_spin_rq_unlock_irq(rq);
+	raw_spin_rq_unlock_irq(donor_rq);
 
 	return nr_balanced;
 }
@@ -4319,7 +4323,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
 
 	/* count the target tasks and CPUs */
 	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
-		u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+		u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
 
 		nr_tasks += nr;
 		nr_cpus++;
@@ -4341,24 +4345,21 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
 
 	cpumask_clear(donee_mask);
 	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
-		if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+		if (READ_ONCE(bypass_dsq(sch, cpu)->nr) < nr_target)
 			cpumask_set_cpu(cpu, donee_mask);
 	}
 
 	/* iterate !donee CPUs and see if they should be offloaded */
 	cpumask_clear(resched_mask);
 	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
-		struct rq *rq = cpu_rq(cpu);
-		struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
-
 		if (cpumask_empty(donee_mask))
 			break;
 		if (cpumask_test_cpu(cpu, donee_mask))
 			continue;
-		if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+		if (READ_ONCE(bypass_dsq(sch, cpu)->nr) <= nr_donor_target)
 			continue;
 
-		nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+		nr_balanced += bypass_lb_cpu(sch, cpu, donee_mask, resched_mask,
 					     nr_donor_target, nr_target);
 	}
 
@@ -4366,7 +4367,7 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
 		resched_cpu(cpu);
 
 	for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
-		u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+		u32 nr = READ_ONCE(bypass_dsq(sch, cpu)->nr);
 
 		after_min = min(nr, after_min);
 		after_max = max(nr, after_max);
@@ -5261,7 +5262,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 {
 	struct scx_sched *sch;
 	s32 level = parent ? parent->level + 1 : 0;
-	int node, ret;
+	s32 node, cpu, ret;
 
 	sch = kzalloc_flex(*sch, ancestors, level);
 	if (!sch)
@@ -5302,6 +5303,9 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 		goto err_free_gdsqs;
 	}
 
+	for_each_possible_cpu(cpu)
+		init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
+
 	sch->helper = kthread_run_worker(0, "sched_ext_helper");
 	if (IS_ERR(sch->helper)) {
 		ret = PTR_ERR(sch->helper);
@@ -5490,7 +5494,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 		struct rq *rq = cpu_rq(cpu);
 
 		rq->scx.local_dsq.sched = sch;
-		rq->scx.bypass_dsq.sched = sch;
 		rq->scx.cpuperf_target = SCX_CPUPERF_ONE;
 	}
 
@@ -6465,9 +6468,8 @@ void __init init_sched_ext_class(void)
 		struct rq *rq = cpu_rq(cpu);
 		int  n = cpu_to_node(cpu);
 
-		/* local/bypass dsq's sch will be set during scx_root_enable() */
+		/* local_dsq's sch will be set during scx_root_enable() */
 		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL);
-		init_dsq(&rq->scx.bypass_dsq, SCX_DSQ_BYPASS, NULL);
 
 		INIT_LIST_HEAD(&rq->scx.runnable_list);
 		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 87f19aed47fb..1901efd40517 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -932,6 +932,8 @@ struct scx_sched_pcpu {
 	 * constructed when requested by scx_bpf_events().
 	 */
 	struct scx_event_stats	event_stats;
+
+	struct scx_dispatch_q	bypass_dsq;
 };
 
 struct scx_sched {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 43bbf0693cca..9e142c2f50f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -810,7 +810,6 @@ struct scx_rq {
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
-	struct scx_dispatch_q	bypass_dsq;
 };
 #endif /* CONFIG_SCHED_CLASS_EXT */
 

From 5c8d98a1b4de444709f7d2b7cee3d0ea00c581a2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 022/134] sched_ext: Move bypass state into scx_sched

In preparation of multiple scheduler support, make bypass state
per-scx_sched. Move scx_bypass_depth, bypass_timestamp and bypass_lb_timer
from globals into scx_sched. Move SCX_RQ_BYPASSING from rq to scx_sched_pcpu
as SCX_SCHED_PCPU_BYPASSING.

scx_bypass() now takes @sch and scx_rq_bypassing(rq) is replaced with
scx_bypassing(sch, cpu). All callers updated.

scx_bypassed_for_enable existed to balance the global scx_bypass_depth when
enable failed. Now that bypass_depth is per-scheduler, the counter is
destroyed along with the scheduler on enable failure. Remove
scx_bypassed_for_enable.

As all tasks currently use the root scheduler, there's no observable behavior
change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 143 +++++++++++++++++-------------------
 kernel/sched/ext_idle.c     |   3 +-
 kernel/sched/ext_internal.h |  14 +++-
 kernel/sched/sched.h        |   1 -
 4 files changed, 80 insertions(+), 81 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a371877c40ba..56c05b01f088 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -41,20 +41,12 @@ static DEFINE_MUTEX(scx_enable_mutex);
 DEFINE_STATIC_KEY_FALSE(__scx_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
-static int scx_bypass_depth;
 static cpumask_var_t scx_bypass_lb_donee_cpumask;
 static cpumask_var_t scx_bypass_lb_resched_cpumask;
 static bool scx_init_task_enabled;
 static bool scx_switching_all;
 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
 
-/*
- * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass
- * depth on enable failure. Will be removed when bypass depth is moved into the
- * sched instance.
- */
-static bool scx_bypassed_for_enable;
-
 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 
@@ -1570,7 +1562,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	if (!scx_rq_online(rq))
 		goto local;
 
-	if (scx_rq_bypassing(rq)) {
+	if (scx_bypassing(sch, cpu_of(rq))) {
 		__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
 		goto bypass;
 	}
@@ -1951,7 +1943,7 @@ static bool task_can_run_on_remote_rq(struct scx_sched *sch,
 				      struct task_struct *p, struct rq *rq,
 				      bool enforce)
 {
-	int cpu = cpu_of(rq);
+	s32 cpu = cpu_of(rq);
 
 	WARN_ON_ONCE(task_cpu(p) == cpu);
 
@@ -2402,6 +2394,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	bool prev_on_scx = prev->sched_class == &ext_sched_class;
 	bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
 	int nr_loops = SCX_DSP_MAX_LOOPS;
+	s32 cpu = cpu_of(rq);
 
 	lockdep_assert_rq_held(rq);
 	rq->scx.flags |= SCX_RQ_IN_BALANCE;
@@ -2416,8 +2409,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * emitted in switch_class().
 		 */
 		if (SCX_HAS_OP(sch, cpu_acquire))
-			SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq,
-				    cpu_of(rq), NULL);
+			SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, cpu, NULL);
 		rq->scx.cpu_released = false;
 	}
 
@@ -2434,7 +2426,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * See scx_disable_workfn() for the explanation on the bypassing
 		 * test.
 		 */
-		if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
+		if (prev_on_rq && prev->scx.slice && !scx_bypassing(sch, cpu)) {
 			rq->scx.flags |= SCX_RQ_BAL_KEEP;
 			goto has_tasks;
 		}
@@ -2447,8 +2439,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (consume_global_dsq(sch, rq))
 		goto has_tasks;
 
-	if (scx_rq_bypassing(rq)) {
-		if (consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu_of(rq))))
+	if (scx_bypassing(sch, cpu)) {
+		if (consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu)))
 			goto has_tasks;
 		else
 			goto no_tasks;
@@ -2469,8 +2461,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	do {
 		dspc->nr_tasks = 0;
 
-		SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq,
-			    cpu_of(rq), prev_on_scx ? prev : NULL);
+		SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
+			    prev_on_scx ? prev : NULL);
 
 		flush_dispatch_buf(sch, rq);
 
@@ -2493,7 +2485,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * scx_kick_cpu() for deferred kicking.
 		 */
 		if (unlikely(!--nr_loops)) {
-			scx_kick_cpu(sch, cpu_of(rq), 0);
+			scx_kick_cpu(sch, cpu, 0);
 			break;
 		}
 	} while (dspc->nr_tasks);
@@ -2504,7 +2496,7 @@ no_tasks:
 	 * %SCX_OPS_ENQ_LAST is in effect.
 	 */
 	if (prev_on_rq &&
-	    (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_rq_bypassing(rq))) {
+	    (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) {
 		rq->scx.flags |= SCX_RQ_BAL_KEEP;
 		__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);
 		goto has_tasks;
@@ -2663,7 +2655,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 		 * forcing a different task. Leave it at the head of the local
 		 * DSQ.
 		 */
-		if (p->scx.slice && !scx_rq_bypassing(rq)) {
+		if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) {
 			dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p,
 					 SCX_ENQ_HEAD);
 			goto switch_class;
@@ -2746,7 +2738,8 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
 		if (unlikely(!p->scx.slice)) {
 			struct scx_sched *sch = scx_task_sched(p);
 
-			if (!scx_rq_bypassing(rq) && !sch->warned_zero_slice) {
+			if (!scx_bypassing(sch, cpu_of(rq)) &&
+			    !sch->warned_zero_slice) {
 				printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in %s()\n",
 						p->comm, p->pid, __func__);
 				sch->warned_zero_slice = true;
@@ -2821,7 +2814,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 	 * verifier.
 	 */
 	if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) &&
-	    !scx_rq_bypassing(task_rq(a)))
+	    !scx_bypassing(sch_a, task_cpu(a)))
 		return SCX_CALL_OP_2TASKS_RET(sch_a, SCX_KF_REST, core_sched_before,
 					      NULL,
 					      (struct task_struct *)a,
@@ -2834,7 +2827,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
 {
 	struct scx_sched *sch = scx_task_sched(p);
-	bool rq_bypass;
+	bool bypassing;
 
 	/*
 	 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
@@ -2849,8 +2842,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 	if (unlikely(wake_flags & WF_EXEC))
 		return prev_cpu;
 
-	rq_bypass = scx_rq_bypassing(task_rq(p));
-	if (likely(SCX_HAS_OP(sch, select_cpu)) && !rq_bypass) {
+	bypassing = scx_bypassing(sch, task_cpu(p));
+	if (likely(SCX_HAS_OP(sch, select_cpu)) && !bypassing) {
 		s32 cpu;
 		struct task_struct **ddsp_taskp;
 
@@ -2880,7 +2873,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 		}
 		p->scx.selected_cpu = cpu;
 
-		if (rq_bypass)
+		if (bypassing)
 			__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
 		return cpu;
 	}
@@ -2917,7 +2910,7 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 static void handle_hotplug(struct rq *rq, bool online)
 {
 	struct scx_sched *sch = scx_root;
-	int cpu = cpu_of(rq);
+	s32 cpu = cpu_of(rq);
 
 	atomic_long_inc(&scx_hotplug_seq);
 
@@ -3046,7 +3039,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 	 * While disabling, always resched and refresh core-sched timestamp as
 	 * we can't trust the slice management or ops.core_sched_before().
 	 */
-	if (scx_rq_bypassing(rq)) {
+	if (scx_bypassing(sch, cpu_of(rq))) {
 		curr->scx.slice = 0;
 		touch_core_sched(rq, curr);
 	} else if (SCX_HAS_OP(sch, tick)) {
@@ -3486,13 +3479,14 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
 bool scx_can_stop_tick(struct rq *rq)
 {
 	struct task_struct *p = rq->curr;
-
-	if (scx_rq_bypassing(rq))
-		return false;
+	struct scx_sched *sch = scx_task_sched(p);
 
 	if (p->sched_class != &ext_sched_class)
 		return true;
 
+	if (scx_bypassing(sch, cpu_of(rq)))
+		return false;
+
 	/*
 	 * @rq can dispatch from different DSQs, so we can't tell whether it
 	 * needs the tick or not by looking at nr_running. Allow stopping ticks
@@ -3993,6 +3987,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	irq_work_sync(&sch->error_irq_work);
 	kthread_destroy_worker(sch->helper);
+	timer_shutdown_sync(&sch->bypass_lb_timer);
 
 #ifdef CONFIG_EXT_SUB_SCHED
 	kfree(sch->cgrp_path);
@@ -4389,12 +4384,11 @@ static void bypass_lb_node(struct scx_sched *sch, int node)
  */
 static void scx_bypass_lb_timerfn(struct timer_list *timer)
 {
-	struct scx_sched *sch;
+	struct scx_sched *sch = container_of(timer, struct scx_sched, bypass_lb_timer);
 	int node;
 	u32 intv_us;
 
-	sch = rcu_dereference_all(scx_root);
-	if (unlikely(!sch) || !READ_ONCE(scx_bypass_depth))
+	if (!READ_ONCE(sch->bypass_depth))
 		return;
 
 	for_each_node_with_cpus(node)
@@ -4405,10 +4399,9 @@ static void scx_bypass_lb_timerfn(struct timer_list *timer)
 		mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
 }
 
-static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
-
 /**
  * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * @sch: sched to bypass
  * @bypass: true for bypass, false for unbypass
  *
  * Bypassing guarantees that all runnable tasks make forward progress without
@@ -4438,51 +4431,44 @@ static DEFINE_TIMER(scx_bypass_lb_timer, scx_bypass_lb_timerfn);
  *
  * - scx_prio_less() reverts to the default core_sched_at order.
  */
-static void scx_bypass(bool bypass)
+static void scx_bypass(struct scx_sched *sch, bool bypass)
 {
 	static DEFINE_RAW_SPINLOCK(bypass_lock);
-	static unsigned long bypass_timestamp;
-	struct scx_sched *sch;
 	unsigned long flags;
 	int cpu;
 
 	raw_spin_lock_irqsave(&bypass_lock, flags);
-	sch = rcu_dereference_bh(scx_root);
-	if (!sch)
-		goto unlock;
 
 	if (bypass) {
 		u32 intv_us;
 
-		WRITE_ONCE(scx_bypass_depth, scx_bypass_depth + 1);
-		WARN_ON_ONCE(scx_bypass_depth <= 0);
-		if (scx_bypass_depth != 1)
+		WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
+		WARN_ON_ONCE(sch->bypass_depth <= 0);
+		if (sch->bypass_depth != 1)
 			goto unlock;
 		WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
-		bypass_timestamp = ktime_get_ns();
-		if (sch)
-			scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+		sch->bypass_timestamp = ktime_get_ns();
+		scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
 
 		intv_us = READ_ONCE(scx_bypass_lb_intv_us);
-		if (intv_us && !timer_pending(&scx_bypass_lb_timer)) {
-			scx_bypass_lb_timer.expires =
+		if (intv_us && !timer_pending(&sch->bypass_lb_timer)) {
+			sch->bypass_lb_timer.expires =
 				jiffies + usecs_to_jiffies(intv_us);
-			add_timer_global(&scx_bypass_lb_timer);
+			add_timer_global(&sch->bypass_lb_timer);
 		}
 	} else {
-		WRITE_ONCE(scx_bypass_depth, scx_bypass_depth - 1);
-		WARN_ON_ONCE(scx_bypass_depth < 0);
-		if (scx_bypass_depth != 0)
+		WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
+		WARN_ON_ONCE(sch->bypass_depth < 0);
+		if (sch->bypass_depth != 0)
 			goto unlock;
 		WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
-		if (sch)
-			scx_add_event(sch, SCX_EV_BYPASS_DURATION,
-				      ktime_get_ns() - bypass_timestamp);
+		scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+			      ktime_get_ns() - sch->bypass_timestamp);
 	}
 
 	/*
 	 * No task property is changing. We just need to make sure all currently
-	 * queued tasks are re-queued according to the new scx_rq_bypassing()
+	 * queued tasks are re-queued according to the new scx_bypassing()
 	 * state. As an optimization, walk each rq's runnable_list instead of
 	 * the scx_tasks list.
 	 *
@@ -4491,22 +4477,23 @@ static void scx_bypass(bool bypass)
 	 */
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
+		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 		struct task_struct *p, *n;
 
 		raw_spin_rq_lock(rq);
 
 		if (bypass) {
-			WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
-			rq->scx.flags |= SCX_RQ_BYPASSING;
+			WARN_ON_ONCE(pcpu->flags & SCX_SCHED_PCPU_BYPASSING);
+			pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
 		} else {
-			WARN_ON_ONCE(!(rq->scx.flags & SCX_RQ_BYPASSING));
-			rq->scx.flags &= ~SCX_RQ_BYPASSING;
+			WARN_ON_ONCE(!(pcpu->flags & SCX_SCHED_PCPU_BYPASSING));
+			pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
 		}
 
 		/*
 		 * We need to guarantee that no tasks are on the BPF scheduler
 		 * while bypassing. Either we see enabled or the enable path
-		 * sees scx_rq_bypassing() before moving tasks to SCX.
+		 * sees scx_bypassing() before moving tasks to SCX.
 		 */
 		if (!scx_enabled()) {
 			raw_spin_rq_unlock(rq);
@@ -4676,7 +4663,7 @@ static void scx_root_disable(struct scx_sched *sch)
 	int cpu;
 
 	/* guarantee forward progress and wait for descendants to be disabled */
-	scx_bypass(true);
+	scx_bypass(sch, true);
 	drain_descendants(sch);
 
 	switch (scx_set_enable_state(SCX_DISABLING)) {
@@ -4801,16 +4788,11 @@ static void scx_root_disable(struct scx_sched *sch)
 	scx_dsp_max_batch = 0;
 	free_kick_syncs();
 
-	if (scx_bypassed_for_enable) {
-		scx_bypassed_for_enable = false;
-		scx_bypass(false);
-	}
-
 	mutex_unlock(&scx_enable_mutex);
 
 	WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING);
 done:
-	scx_bypass(false);
+	scx_bypass(sch, false);
 }
 
 /*
@@ -5324,6 +5306,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
 	init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
 	kthread_init_work(&sch->disable_work, scx_disable_workfn);
+	timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
 	sch->ops = *ops;
 	rcu_assign_pointer(ops->priv, sch);
 
@@ -5569,8 +5552,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	 * scheduling) may not function correctly before all tasks are switched.
 	 * Init in bypass mode to guarantee forward progress.
 	 */
-	scx_bypass(true);
-	scx_bypassed_for_enable = true;
+	scx_bypass(sch, true);
 
 	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
 		if (((void (**)(void))ops)[i])
@@ -5670,8 +5652,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	scx_task_iter_stop(&sti);
 	percpu_up_write(&scx_fork_rwsem);
 
-	scx_bypassed_for_enable = false;
-	scx_bypass(false);
+	scx_bypass(sch, false);
 
 	if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) {
 		WARN_ON_ONCE(atomic_read(&sch->exit_kind) == SCX_EXIT_NONE);
@@ -6424,6 +6405,14 @@ void print_scx_info(const char *log_lvl, struct task_struct *p)
 
 static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr)
 {
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = rcu_dereference(scx_root);
+	if (!sch)
+		return NOTIFY_OK;
+
 	/*
 	 * SCX schedulers often have userspace components which are sometimes
 	 * involved in critial scheduling paths. PM operations involve freezing
@@ -6434,12 +6423,12 @@ static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *
 	case PM_HIBERNATION_PREPARE:
 	case PM_SUSPEND_PREPARE:
 	case PM_RESTORE_PREPARE:
-		scx_bypass(true);
+		scx_bypass(sch, true);
 		break;
 	case PM_POST_HIBERNATION:
 	case PM_POST_SUSPEND:
 	case PM_POST_RESTORE:
-		scx_bypass(false);
+		scx_bypass(sch, false);
 		break;
 	}
 
@@ -7255,7 +7244,7 @@ static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags)
 	 * lead to irq_work_queue() malfunction such as infinite busy wait for
 	 * IRQ status update. Suppress kicking.
 	 */
-	if (scx_rq_bypassing(this_rq))
+	if (scx_bypassing(sch, cpu_of(this_rq)))
 		goto out;
 
 	/*
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 9f6abee1e234..03be4d664267 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -767,7 +767,8 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
 	 * either enqueue() sees the idle bit or update_idle() sees the task
 	 * that enqueue() queued.
 	 */
-	if (SCX_HAS_OP(sch, update_idle) && do_notify && !scx_rq_bypassing(rq))
+	if (SCX_HAS_OP(sch, update_idle) && do_notify &&
+	    !scx_bypassing(sch, cpu_of(rq)))
 		SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
 }
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 1901efd40517..0e0806119711 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -925,7 +925,13 @@ struct scx_event_stats {
 	s64		SCX_EV_INSERT_NOT_OWNED;
 };
 
+enum scx_sched_pcpu_flags {
+	SCX_SCHED_PCPU_BYPASSING	= 1LLU << 0,
+};
+
 struct scx_sched_pcpu {
+	u64			flags;	/* protected by rq lock */
+
 	/*
 	 * The event counters are in a per-CPU variable to minimize the
 	 * accounting overhead. A system-wide view on the event counter is
@@ -953,6 +959,8 @@ struct scx_sched {
 	struct scx_sched_pcpu __percpu *pcpu;
 
 	u64			slice_dfl;
+	u64			bypass_timestamp;
+	s32			bypass_depth;
 	bool			aborting;
 	s32			level;
 
@@ -984,6 +992,7 @@ struct scx_sched {
 	struct kthread_worker	*helper;
 	struct irq_work		error_irq_work;
 	struct kthread_work	disable_work;
+	struct timer_list	bypass_lb_timer;
 	struct rcu_work		rcu_work;
 
 	/* all ancestors including self */
@@ -1257,9 +1266,10 @@ static inline bool scx_kf_allowed_if_unlocked(void)
 	return !current->scx.kf_mask;
 }
 
-static inline bool scx_rq_bypassing(struct rq *rq)
+static inline bool scx_bypassing(struct scx_sched *sch, s32 cpu)
 {
-	return unlikely(rq->scx.flags & SCX_RQ_BYPASSING);
+	return unlikely(per_cpu_ptr(sch->pcpu, cpu)->flags &
+			SCX_SCHED_PCPU_BYPASSING);
 }
 
 #ifdef CONFIG_EXT_SUB_SCHED
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9e142c2f50f2..596f6713cf7e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -782,7 +782,6 @@ enum scx_rq_flags {
 	SCX_RQ_ONLINE		= 1 << 0,
 	SCX_RQ_CAN_STOP_TICK	= 1 << 1,
 	SCX_RQ_BAL_KEEP		= 1 << 3, /* balance decided to keep current */
-	SCX_RQ_BYPASSING	= 1 << 4,
 	SCX_RQ_CLK_VALID	= 1 << 5, /* RQ clock is fresh and valid */
 	SCX_RQ_BAL_CB_PENDING	= 1 << 6, /* must queue a cb after dispatching */
 

From c7f0e467a27a9ee98a324f12b25abf53280d71d0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 023/134] sched_ext: Prepare bypass mode for hierarchical
 operation

Bypass mode is used to simplify enable and disable paths and guarantee
forward progress when something goes wrong. When enabled, all tasks skip BPF
scheduling and fall back to simple in-kernel FIFO scheduling. While this
global behavior can be used as-is when dealing with sub-scheds, that would
allow any sub-sched instance to affect the whole system in a significantly
disruptive manner.

Make bypass state hierarchical by propagating it to descendants and updating
per-cpu flags accordingly. This allows an scx_sched to bypass if itself or
any of its ancestors are in bypass mode. However, this doesn't make the
actual bypass enqueue and dispatch paths hierarchical yet. That will be done
in later patches.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 85 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 63 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 56c05b01f088..0bec650c0ab6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -41,6 +41,7 @@ static DEFINE_MUTEX(scx_enable_mutex);
 DEFINE_STATIC_KEY_FALSE(__scx_enabled);
 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
 static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
+static DEFINE_RAW_SPINLOCK(scx_bypass_lock);
 static cpumask_var_t scx_bypass_lb_donee_cpumask;
 static cpumask_var_t scx_bypass_lb_resched_cpumask;
 static bool scx_init_task_enabled;
@@ -4399,6 +4400,36 @@ static void scx_bypass_lb_timerfn(struct timer_list *timer)
 		mod_timer(timer, jiffies + usecs_to_jiffies(intv_us));
 }
 
+static bool inc_bypass_depth(struct scx_sched *sch)
+{
+	lockdep_assert_held(&scx_bypass_lock);
+
+	WARN_ON_ONCE(sch->bypass_depth < 0);
+	WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
+	if (sch->bypass_depth != 1)
+		return false;
+
+	WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
+	sch->bypass_timestamp = ktime_get_ns();
+	scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
+	return true;
+}
+
+static bool dec_bypass_depth(struct scx_sched *sch)
+{
+	lockdep_assert_held(&scx_bypass_lock);
+
+	WARN_ON_ONCE(sch->bypass_depth < 1);
+	WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
+	if (sch->bypass_depth != 0)
+		return false;
+
+	WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
+	scx_add_event(sch, SCX_EV_BYPASS_DURATION,
+		      ktime_get_ns() - sch->bypass_timestamp);
+	return true;
+}
+
 /**
  * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
  * @sch: sched to bypass
@@ -4433,22 +4464,17 @@ static void scx_bypass_lb_timerfn(struct timer_list *timer)
  */
 static void scx_bypass(struct scx_sched *sch, bool bypass)
 {
-	static DEFINE_RAW_SPINLOCK(bypass_lock);
+	struct scx_sched *pos;
 	unsigned long flags;
 	int cpu;
 
-	raw_spin_lock_irqsave(&bypass_lock, flags);
+	raw_spin_lock_irqsave(&scx_bypass_lock, flags);
 
 	if (bypass) {
 		u32 intv_us;
 
-		WRITE_ONCE(sch->bypass_depth, sch->bypass_depth + 1);
-		WARN_ON_ONCE(sch->bypass_depth <= 0);
-		if (sch->bypass_depth != 1)
+		if (!inc_bypass_depth(sch))
 			goto unlock;
-		WRITE_ONCE(sch->slice_dfl, READ_ONCE(scx_slice_bypass_us) * NSEC_PER_USEC);
-		sch->bypass_timestamp = ktime_get_ns();
-		scx_add_event(sch, SCX_EV_BYPASS_ACTIVATE, 1);
 
 		intv_us = READ_ONCE(scx_bypass_lb_intv_us);
 		if (intv_us && !timer_pending(&sch->bypass_lb_timer)) {
@@ -4457,15 +4483,25 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
 			add_timer_global(&sch->bypass_lb_timer);
 		}
 	} else {
-		WRITE_ONCE(sch->bypass_depth, sch->bypass_depth - 1);
-		WARN_ON_ONCE(sch->bypass_depth < 0);
-		if (sch->bypass_depth != 0)
+		if (!dec_bypass_depth(sch))
 			goto unlock;
-		WRITE_ONCE(sch->slice_dfl, SCX_SLICE_DFL);
-		scx_add_event(sch, SCX_EV_BYPASS_DURATION,
-			      ktime_get_ns() - sch->bypass_timestamp);
 	}
 
+	/*
+	 * Bypass state is propagated to all descendants - an scx_sched bypasses
+	 * if itself or any of its ancestors are in bypass mode.
+	 */
+	raw_spin_lock(&scx_sched_lock);
+	scx_for_each_descendant_pre(pos, sch) {
+		if (pos == sch)
+			continue;
+		if (bypass)
+			inc_bypass_depth(pos);
+		else
+			dec_bypass_depth(pos);
+	}
+	raw_spin_unlock(&scx_sched_lock);
+
 	/*
 	 * No task property is changing. We just need to make sure all currently
 	 * queued tasks are re-queued according to the new scx_bypassing()
@@ -4477,18 +4513,20 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
 	 */
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
-		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 		struct task_struct *p, *n;
 
 		raw_spin_rq_lock(rq);
 
-		if (bypass) {
-			WARN_ON_ONCE(pcpu->flags & SCX_SCHED_PCPU_BYPASSING);
-			pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
-		} else {
-			WARN_ON_ONCE(!(pcpu->flags & SCX_SCHED_PCPU_BYPASSING));
-			pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
+		raw_spin_lock(&scx_sched_lock);
+		scx_for_each_descendant_pre(pos, sch) {
+			struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
+
+			if (pos->bypass_depth)
+				pcpu->flags |= SCX_SCHED_PCPU_BYPASSING;
+			else
+				pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
 		}
+		raw_spin_unlock(&scx_sched_lock);
 
 		/*
 		 * We need to guarantee that no tasks are on the BPF scheduler
@@ -4509,6 +4547,9 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
 		 */
 		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
 						 scx.runnable_node) {
+			if (!scx_is_descendant(scx_task_sched(p), sch))
+				continue;
+
 			/* cycling deq/enq is enough, see the function comment */
 			scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
 				/* nothing */ ;
@@ -4523,7 +4564,7 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
 	}
 
 unlock:
-	raw_spin_unlock_irqrestore(&bypass_lock, flags);
+	raw_spin_unlock_irqrestore(&scx_bypass_lock, flags);
 }
 
 static void free_exit_info(struct scx_exit_info *ei)

From 39d0b2c43776fc84a27b2fc37cebe89e442aafd8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 024/134] sched_ext: Factor out scx_dispatch_sched()

In preparation of multiple scheduler support, factor out
scx_dispatch_sched() from balance_one(). The function boundary makes
remembering $prev_on_scx and $prev_on_rq less useful. Open code $prev_on_scx
in balance_one() and $prev_on_rq in both balance_one() and
scx_dispatch_sched().

No functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 127 ++++++++++++++++++++++++---------------------
 1 file changed, 67 insertions(+), 60 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0bec650c0ab6..28ac7ba3ece0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2388,13 +2388,70 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
 	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
 }
 
+static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
+			       struct task_struct *prev)
+{
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+	bool prev_on_scx = prev->sched_class == &ext_sched_class;
+	int nr_loops = SCX_DSP_MAX_LOOPS;
+	s32 cpu = cpu_of(rq);
+
+	if (consume_global_dsq(sch, rq))
+		return true;
+
+	if (scx_bypassing(sch, cpu))
+		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
+
+	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
+		return false;
+
+	dspc->rq = rq;
+
+	/*
+	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
+	 * the local DSQ might still end up empty after a successful
+	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
+	 * produced some tasks, retry. The BPF scheduler may depend on this
+	 * looping behavior to simplify its implementation.
+	 */
+	do {
+		dspc->nr_tasks = 0;
+
+		SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
+			    prev_on_scx ? prev : NULL);
+
+		flush_dispatch_buf(sch, rq);
+
+		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice) {
+			rq->scx.flags |= SCX_RQ_BAL_KEEP;
+			return true;
+		}
+		if (rq->scx.local_dsq.nr)
+			return true;
+		if (consume_global_dsq(sch, rq))
+			return true;
+
+		/*
+		 * ops.dispatch() can trap us in this loop by repeatedly
+		 * dispatching ineligible tasks. Break out once in a while to
+		 * allow the watchdog to run. As IRQ can't be enabled in
+		 * balance(), we want to complete this scheduling cycle and then
+		 * start a new one. IOW, we want to call resched_curr() on the
+		 * next, most likely idle, task, not the current one. Use
+		 * __scx_bpf_kick_cpu() for deferred kicking.
+		 */
+		if (unlikely(!--nr_loops)) {
+			scx_kick_cpu(sch, cpu, 0);
+			break;
+		}
+	} while (dspc->nr_tasks);
+
+	return false;
+}
+
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
 	struct scx_sched *sch = scx_root;
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
-	bool prev_on_scx = prev->sched_class == &ext_sched_class;
-	bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
-	int nr_loops = SCX_DSP_MAX_LOOPS;
 	s32 cpu = cpu_of(rq);
 
 	lockdep_assert_rq_held(rq);
@@ -2414,7 +2471,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		rq->scx.cpu_released = false;
 	}
 
-	if (prev_on_scx) {
+	if (prev->sched_class == &ext_sched_class) {
 		update_curr_scx(rq);
 
 		/*
@@ -2427,7 +2484,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * See scx_disable_workfn() for the explanation on the bypassing
 		 * test.
 		 */
-		if (prev_on_rq && prev->scx.slice && !scx_bypassing(sch, cpu)) {
+		if ((prev->scx.flags & SCX_TASK_QUEUED) && prev->scx.slice &&
+		    !scx_bypassing(sch, cpu)) {
 			rq->scx.flags |= SCX_RQ_BAL_KEEP;
 			goto has_tasks;
 		}
@@ -2437,66 +2495,15 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (rq->scx.local_dsq.nr)
 		goto has_tasks;
 
-	if (consume_global_dsq(sch, rq))
+	/* dispatch @sch */
+	if (scx_dispatch_sched(sch, rq, prev))
 		goto has_tasks;
 
-	if (scx_bypassing(sch, cpu)) {
-		if (consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu)))
-			goto has_tasks;
-		else
-			goto no_tasks;
-	}
-
-	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
-		goto no_tasks;
-
-	dspc->rq = rq;
-
-	/*
-	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
-	 * the local DSQ might still end up empty after a successful
-	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
-	 * produced some tasks, retry. The BPF scheduler may depend on this
-	 * looping behavior to simplify its implementation.
-	 */
-	do {
-		dspc->nr_tasks = 0;
-
-		SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
-			    prev_on_scx ? prev : NULL);
-
-		flush_dispatch_buf(sch, rq);
-
-		if (prev_on_rq && prev->scx.slice) {
-			rq->scx.flags |= SCX_RQ_BAL_KEEP;
-			goto has_tasks;
-		}
-		if (rq->scx.local_dsq.nr)
-			goto has_tasks;
-		if (consume_global_dsq(sch, rq))
-			goto has_tasks;
-
-		/*
-		 * ops.dispatch() can trap us in this loop by repeatedly
-		 * dispatching ineligible tasks. Break out once in a while to
-		 * allow the watchdog to run. As IRQ can't be enabled in
-		 * balance(), we want to complete this scheduling cycle and then
-		 * start a new one. IOW, we want to call resched_curr() on the
-		 * next, most likely idle, task, not the current one. Use
-		 * scx_kick_cpu() for deferred kicking.
-		 */
-		if (unlikely(!--nr_loops)) {
-			scx_kick_cpu(sch, cpu, 0);
-			break;
-		}
-	} while (dspc->nr_tasks);
-
-no_tasks:
 	/*
 	 * Didn't find another task to run. Keep running @prev unless
 	 * %SCX_OPS_ENQ_LAST is in effect.
 	 */
-	if (prev_on_rq &&
+	if ((prev->scx.flags & SCX_TASK_QUEUED) &&
 	    (!(sch->ops.flags & SCX_OPS_ENQ_LAST) || scx_bypassing(sch, cpu))) {
 		rq->scx.flags |= SCX_RQ_BAL_KEEP;
 		__scx_add_event(sch, SCX_EV_DISPATCH_KEEP_LAST, 1);

From d94d09a23340b343ffc6ad935f4a7fa90516684a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 025/134] sched_ext: When calling ops.dispatch() @prev must be
 on the same scx_sched

The @prev parameter passed into ops.dispatch() is expected to be on the
same sched. Passing in @prev which isn't on the sched can spuriously
trigger failures that can kill the scheduler. Pass in @prev iff it's on
the same sched.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 28ac7ba3ece0..413d0558372f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2392,9 +2392,10 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 			       struct task_struct *prev)
 {
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
-	bool prev_on_scx = prev->sched_class == &ext_sched_class;
 	int nr_loops = SCX_DSP_MAX_LOOPS;
 	s32 cpu = cpu_of(rq);
+	bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
+		scx_task_on_sched(sch, prev);
 
 	if (consume_global_dsq(sch, rq))
 		return true;
@@ -2418,7 +2419,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 		dspc->nr_tasks = 0;
 
 		SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
-			    prev_on_scx ? prev : NULL);
+			    prev_on_sch ? prev : NULL);
 
 		flush_dispatch_buf(sch, rq);
 

From aa2a0a19686c90106ade6a7e848ffbb62d55d733 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:03 -1000
Subject: [PATCH 026/134] sched_ext: Separate bypass dispatch enabling from
 bypass depth tracking

The bypass_depth field tracks nesting of bypass operations but is also used
to determine whether the bypass dispatch path should be active. With
hierarchical scheduling, child schedulers may need to activate their parent's
bypass dispatch path without affecting the parent's bypass_depth, requiring
separation of these concerns.

Add bypass_dsp_enable_depth and bypass_dsp_claim to independently control
bypass dispatch path activation. The new enable_bypass_dsp() and
disable_bypass_dsp() functions manage this state with proper claim semantics
to prevent races. The bypass dispatch path now only activates when
bypass_dsp_enabled() returns true, which checks the new enable_depth counter.

The disable operation is carefully ordered after all tasks are moved out of
bypass DSQs to ensure they are drained before the dispatch path is disabled.
During scheduler teardown, disable_bypass_dsp() is called explicitly to ensure
cleanup even if bypass mode was never entered normally.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 74 ++++++++++++++++++++++++++++++++-----
 kernel/sched/ext_internal.h |  5 +++
 2 files changed, 69 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 413d0558372f..ea5179df3de4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -357,6 +357,26 @@ static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
 	return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
 }
 
+/**
+ * bypass_dsp_enabled - Check if bypass dispatch path is enabled
+ * @sch: scheduler to check
+ *
+ * When a descendant scheduler enters bypass mode, bypassed tasks are scheduled
+ * by the nearest non-bypassing ancestor, or the root scheduler if all ancestors
+ * are bypassing. In the former case, the ancestor is not itself bypassing but
+ * its bypass DSQs will be populated with bypassed tasks from descendants. Thus,
+ * the ancestor's bypass dispatch path must be active even though its own
+ * bypass_depth remains zero.
+ *
+ * This function checks bypass_dsp_enable_depth which is managed separately from
+ * bypass_depth to enable this decoupling. See enable_bypass_dsp() and
+ * disable_bypass_dsp().
+ */
+static bool bypass_dsp_enabled(struct scx_sched *sch)
+{
+	return unlikely(atomic_read(&sch->bypass_dsp_enable_depth));
+}
+
 /*
  * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
  * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -2400,7 +2420,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 	if (consume_global_dsq(sch, rq))
 		return true;
 
-	if (scx_bypassing(sch, cpu))
+	if (bypass_dsp_enabled(sch) && scx_bypassing(sch, cpu))
 		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
 
 	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
@@ -4397,7 +4417,7 @@ static void scx_bypass_lb_timerfn(struct timer_list *timer)
 	int node;
 	u32 intv_us;
 
-	if (!READ_ONCE(sch->bypass_depth))
+	if (!bypass_dsp_enabled(sch))
 		return;
 
 	for_each_node_with_cpus(node)
@@ -4438,6 +4458,42 @@ static bool dec_bypass_depth(struct scx_sched *sch)
 	return true;
 }
 
+static void enable_bypass_dsp(struct scx_sched *sch)
+{
+	u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
+	s32 ret;
+
+	/*
+	 * @sch->bypass_depth transitioning from 0 to 1 triggers enabling.
+	 * Shouldn't stagger.
+	 */
+	if (WARN_ON_ONCE(test_and_set_bit(0, &sch->bypass_dsp_claim)))
+		return;
+
+	/*
+	 * The LB timer will stop running if bypass_arm_depth is 0. Increment
+	 * before starting the LB timer.
+	 */
+	ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
+	WARN_ON_ONCE(ret <= 0);
+
+	if (intv_us && !timer_pending(&sch->bypass_lb_timer))
+		mod_timer(&sch->bypass_lb_timer,
+			  jiffies + usecs_to_jiffies(intv_us));
+}
+
+/* may be called without holding scx_bypass_lock */
+static void disable_bypass_dsp(struct scx_sched *sch)
+{
+	s32 ret;
+
+	if (!test_and_clear_bit(0, &sch->bypass_dsp_claim))
+		return;
+
+	ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
+	WARN_ON_ONCE(ret < 0);
+}
+
 /**
  * scx_bypass - [Un]bypass scx_ops and guarantee forward progress
  * @sch: sched to bypass
@@ -4479,17 +4535,10 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
 	raw_spin_lock_irqsave(&scx_bypass_lock, flags);
 
 	if (bypass) {
-		u32 intv_us;
-
 		if (!inc_bypass_depth(sch))
 			goto unlock;
 
-		intv_us = READ_ONCE(scx_bypass_lb_intv_us);
-		if (intv_us && !timer_pending(&sch->bypass_lb_timer)) {
-			sch->bypass_lb_timer.expires =
-				jiffies + usecs_to_jiffies(intv_us);
-			add_timer_global(&sch->bypass_lb_timer);
-		}
+		enable_bypass_dsp(sch);
 	} else {
 		if (!dec_bypass_depth(sch))
 			goto unlock;
@@ -4571,6 +4620,9 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
 		raw_spin_rq_unlock(rq);
 	}
 
+	/* disarming must come after moving all tasks out of the bypass DSQs */
+	if (!bypass)
+		disable_bypass_dsp(sch);
 unlock:
 	raw_spin_unlock_irqrestore(&scx_bypass_lock, flags);
 }
@@ -4672,6 +4724,8 @@ static void scx_sub_disable(struct scx_sched *sch)
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
 
+	disable_bypass_dsp(sch);
+
 	raw_spin_lock_irq(&scx_sched_lock);
 	list_del_init(&sch->sibling);
 	list_del_rcu(&sch->all);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 0e0806119711..575c26f9a3d7 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -961,6 +961,11 @@ struct scx_sched {
 	u64			slice_dfl;
 	u64			bypass_timestamp;
 	s32			bypass_depth;
+
+	/* bypass dispatch path enable state, see bypass_dsp_enabled() */
+	unsigned long		bypass_dsp_claim;
+	atomic_t		bypass_dsp_enable_depth;
+
 	bool			aborting;
 	s32			level;
 

From 025b1bd419653f181c8b9c748aa07802177ff828 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 027/134] sched_ext: Implement hierarchical bypass mode

When a sub-scheduler enters bypass mode, its tasks must be scheduled by an
ancestor to guarantee forward progress. Tasks from bypassing descendants are
queued in the bypass DSQs of the nearest non-bypassing ancestor, or the root
scheduler if all ancestors are bypassing. This requires coordination between
bypassing schedulers and their hosts.

Add bypass_enq_target_dsq() to find the correct bypass DSQ by walking up the
hierarchy until reaching a non-bypassing ancestor. When a sub-scheduler starts
bypassing, all its runnable tasks are re-enqueued after scx_bypassing() is set,
ensuring proper migration to ancestor bypass DSQs.

Update scx_dispatch_sched() to handle hosting bypassed descendants. When a
scheduler is not bypassing but has bypassing descendants, it must schedule both
its own tasks and bypassed descendant tasks. A simple policy is implemented
where every Nth dispatch attempt (SCX_BYPASS_HOST_NTH=2) consumes from the
bypass DSQ. A fallback consumption is also added at the end of dispatch to
ensure bypassed tasks make progress even when normal scheduling is idle.

Update enable_bypass_dsp() and disable_bypass_dsp() to increment
bypass_dsp_enable_depth on both the bypassing scheduler and its parent host,
ensuring both can detect that bypass dispatch is active through
bypass_dsp_enabled().

Add SCX_EV_SUB_BYPASS_DISPATCH event counter to track scheduling of bypassed
descendant tasks.

v2: Fix comment typos (Andrea).

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 97 ++++++++++++++++++++++++++++++++++---
 kernel/sched/ext_internal.h | 11 +++++
 2 files changed, 101 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ea5179df3de4..2430505ca066 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -357,6 +357,27 @@ static struct scx_dispatch_q *bypass_dsq(struct scx_sched *sch, s32 cpu)
 	return &per_cpu_ptr(sch->pcpu, cpu)->bypass_dsq;
 }
 
+static struct scx_dispatch_q *bypass_enq_target_dsq(struct scx_sched *sch, s32 cpu)
+{
+#ifdef CONFIG_EXT_SUB_SCHED
+	/*
+	 * If @sch is a sub-sched which is bypassing, its tasks should go into
+	 * the bypass DSQs of the nearest ancestor which is not bypassing. The
+	 * not-bypassing ancestor is responsible for scheduling all tasks from
+	 * bypassing sub-trees. If all ancestors including root are bypassing,
+	 * all tasks should go to the root's bypass DSQs.
+	 *
+	 * Whenever a sched starts bypassing, all runnable tasks in its subtree
+	 * are re-enqueued after scx_bypassing() is turned on, guaranteeing that
+	 * all tasks are transferred to the right DSQs.
+	 */
+	while (scx_parent(sch) && scx_bypassing(sch, cpu))
+		sch = scx_parent(sch);
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
+	return bypass_dsq(sch, cpu);
+}
+
 /**
  * bypass_dsp_enabled - Check if bypass dispatch path is enabled
  * @sch: scheduler to check
@@ -1650,7 +1671,7 @@ global:
 	dsq = find_global_dsq(sch, p);
 	goto enqueue;
 bypass:
-	dsq = bypass_dsq(sch, task_cpu(p));
+	dsq = bypass_enq_target_dsq(sch, task_cpu(p));
 	goto enqueue;
 
 enqueue:
@@ -2420,8 +2441,33 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 	if (consume_global_dsq(sch, rq))
 		return true;
 
-	if (bypass_dsp_enabled(sch) && scx_bypassing(sch, cpu))
-		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
+	if (bypass_dsp_enabled(sch)) {
+		/* if @sch is bypassing, only the bypass DSQs are active */
+		if (scx_bypassing(sch, cpu))
+			return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
+
+#ifdef CONFIG_EXT_SUB_SCHED
+		/*
+		 * If @sch isn't bypassing but its children are, @sch is
+		 * responsible for making forward progress for both its own
+		 * tasks that aren't bypassing and the bypassing descendants'
+		 * tasks. The following implements a simple built-in behavior -
+		 * let each CPU try to run the bypass DSQ every Nth time.
+		 *
+		 * Later, if necessary, we can add an ops flag to suppress the
+		 * auto-consumption and a kfunc to consume the bypass DSQ and,
+		 * so that the BPF scheduler can fully control scheduling of
+		 * bypassed tasks.
+		 */
+		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+		if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
+		    consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu))) {
+			__scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
+			return true;
+		}
+#endif	/* CONFIG_EXT_SUB_SCHED */
+	}
 
 	if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
 		return false;
@@ -2467,6 +2513,14 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 		}
 	} while (dspc->nr_tasks);
 
+	/*
+	 * Prevent the CPU from going idle while bypassed descendants have tasks
+	 * queued. Without this fallback, bypassed tasks could stall if the host
+	 * scheduler's ops.dispatch() doesn't yield any tasks.
+	 */
+	if (bypass_dsp_enabled(sch))
+		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
+
 	return false;
 }
 
@@ -4085,6 +4139,7 @@ static ssize_t scx_attr_events_show(struct kobject *kobj,
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_INSERT_NOT_OWNED);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_SUB_BYPASS_DISPATCH);
 	return at;
 }
 SCX_ATTR(events);
@@ -4460,6 +4515,7 @@ static bool dec_bypass_depth(struct scx_sched *sch)
 
 static void enable_bypass_dsp(struct scx_sched *sch)
 {
+	struct scx_sched *host = scx_parent(sch) ?: sch;
 	u32 intv_us = READ_ONCE(scx_bypass_lb_intv_us);
 	s32 ret;
 
@@ -4471,14 +4527,35 @@ static void enable_bypass_dsp(struct scx_sched *sch)
 		return;
 
 	/*
-	 * The LB timer will stop running if bypass_arm_depth is 0. Increment
-	 * before starting the LB timer.
+	 * When a sub-sched bypasses, its tasks are queued on the bypass DSQs of
+	 * the nearest non-bypassing ancestor or root. As enable_bypass_dsp() is
+	 * called iff @sch is not already bypassed due to an ancestor bypassing,
+	 * we can assume that the parent is not bypassing and thus will be the
+	 * host of the bypass DSQs.
+	 *
+	 * While the situation may change in the future, the following
+	 * guarantees that the nearest non-bypassing ancestor or root has bypass
+	 * dispatch enabled while a descendant is bypassing, which is all that's
+	 * required.
+	 *
+	 * bypass_dsp_enabled() test is used to determine whether to enter the
+	 * bypass dispatch handling path from both bypassing and hosting scheds.
+	 * Bump enable depth on both @sch and bypass dispatch host.
 	 */
 	ret = atomic_inc_return(&sch->bypass_dsp_enable_depth);
 	WARN_ON_ONCE(ret <= 0);
 
-	if (intv_us && !timer_pending(&sch->bypass_lb_timer))
-		mod_timer(&sch->bypass_lb_timer,
+	if (host != sch) {
+		ret = atomic_inc_return(&host->bypass_dsp_enable_depth);
+		WARN_ON_ONCE(ret <= 0);
+	}
+
+	/*
+	 * The LB timer will stop running if bypass dispatch is disabled. Start
+	 * after enabling bypass dispatch.
+	 */
+	if (intv_us && !timer_pending(&host->bypass_lb_timer))
+		mod_timer(&host->bypass_lb_timer,
 			  jiffies + usecs_to_jiffies(intv_us));
 }
 
@@ -4492,6 +4569,11 @@ static void disable_bypass_dsp(struct scx_sched *sch)
 
 	ret = atomic_dec_return(&sch->bypass_dsp_enable_depth);
 	WARN_ON_ONCE(ret < 0);
+
+	if (scx_parent(sch)) {
+		ret = atomic_dec_return(&scx_parent(sch)->bypass_dsp_enable_depth);
+		WARN_ON_ONCE(ret < 0);
+	}
 }
 
 /**
@@ -5266,6 +5348,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 	scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
 	scx_dump_event(s, &events, SCX_EV_INSERT_NOT_OWNED);
+	scx_dump_event(s, &events, SCX_EV_SUB_BYPASS_DISPATCH);
 
 	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
 		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 575c26f9a3d7..1da3b9b75d18 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -24,6 +24,8 @@ enum scx_consts {
 	 */
 	SCX_TASK_ITER_BATCH		= 32,
 
+	SCX_BYPASS_HOST_NTH		= 2,
+
 	SCX_BYPASS_LB_DFL_INTV_US	= 500 * USEC_PER_MSEC,
 	SCX_BYPASS_LB_DONOR_PCT		= 125,
 	SCX_BYPASS_LB_MIN_DELTA_DIV	= 4,
@@ -923,6 +925,12 @@ struct scx_event_stats {
 	 * scheduler.
 	 */
 	s64		SCX_EV_INSERT_NOT_OWNED;
+
+	/*
+	 * The number of times tasks from bypassing descendants are scheduled
+	 * from sub_bypass_dsq's.
+	 */
+	s64		SCX_EV_SUB_BYPASS_DISPATCH;
 };
 
 enum scx_sched_pcpu_flags {
@@ -940,6 +948,9 @@ struct scx_sched_pcpu {
 	struct scx_event_stats	event_stats;
 
 	struct scx_dispatch_q	bypass_dsq;
+#ifdef CONFIG_EXT_SUB_SCHED
+	u32			bypass_host_seq;
+#endif
 };
 
 struct scx_sched {

From 0203e0c3f64c6e5b1cb8c28a6661e246feb8043c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 028/134] sched_ext: Dispatch from all scx_sched instances

The cgroup sub-sched support involves invasive changes to many areas of
sched_ext. The overall scaffolding is now in place and the next step is
implementing sub-sched enable/disable.

To enable partial testing and verification, update balance_one() to
dispatch from all scx_sched instances until it finds a task to run. This
should keep scheduling working when sub-scheds are enabled with tasks on
them. This will be replaced by BPF-driven hierarchical operation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2430505ca066..bf23e092032b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2526,7 +2526,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch = scx_root, *pos;
 	s32 cpu = cpu_of(rq);
 
 	lockdep_assert_rq_held(rq);
@@ -2570,9 +2570,13 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (rq->scx.local_dsq.nr)
 		goto has_tasks;
 
-	/* dispatch @sch */
-	if (scx_dispatch_sched(sch, rq, prev))
-		goto has_tasks;
+	/*
+	 * TEMPORARY - Dispatch all scheds. This will be replaced by BPF-driven
+	 * hierarchical operation.
+	 */
+	list_for_each_entry_rcu(pos, &scx_sched_all, all)
+		if (scx_dispatch_sched(pos, rq, prev))
+			goto has_tasks;
 
 	/*
 	 * Didn't find another task to run. Keep running @prev unless

From 34ecfb355104c29734c124b35ffd598a49e62156 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 029/134] sched_ext: Move scx_dsp_ctx and scx_dsp_max_batch
 into scx_sched

scx_dsp_ctx and scx_dsp_max_batch are global variables used in the dispatch
path. In prepration for multiple scheduler support, move the former into
scx_sched_pcpu and the latter into scx_sched. No user-visible behavior
changes intended.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 55 ++++++++++---------------------------
 kernel/sched/ext_internal.h | 19 +++++++++++++
 2 files changed, 34 insertions(+), 40 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bf23e092032b..43569d138bd2 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -106,25 +106,6 @@ static const struct rhashtable_params dsq_hash_params = {
 
 static LLIST_HEAD(dsqs_to_free);
 
-/* dispatch buf */
-struct scx_dsp_buf_ent {
-	struct task_struct	*task;
-	unsigned long		qseq;
-	u64			dsq_id;
-	u64			enq_flags;
-};
-
-static u32 scx_dsp_max_batch;
-
-struct scx_dsp_ctx {
-	struct rq		*rq;
-	u32			cursor;
-	u32			nr_tasks;
-	struct scx_dsp_buf_ent	buf[];
-};
-
-static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
-
 /* string formatting from BPF */
 struct scx_bstr_buf {
 	u64			data[MAX_BPRINTF_VARARGS];
@@ -2402,7 +2383,7 @@ retry:
 
 static void flush_dispatch_buf(struct scx_sched *sch, struct rq *rq)
 {
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
 	u32 u;
 
 	for (u = 0; u < dspc->cursor; u++) {
@@ -2432,7 +2413,7 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
 static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 			       struct task_struct *prev)
 {
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
 	int nr_loops = SCX_DSP_MAX_LOOPS;
 	s32 cpu = cpu_of(rq);
 	bool prev_on_sch = (prev->sched_class == &ext_sched_class) &&
@@ -4972,9 +4953,6 @@ static void scx_root_disable(struct scx_sched *sch)
 	 */
 	kobject_del(&sch->kobj);
 
-	free_percpu(scx_dsp_ctx);
-	scx_dsp_ctx = NULL;
-	scx_dsp_max_batch = 0;
 	free_kick_syncs();
 
 	mutex_unlock(&scx_enable_mutex);
@@ -5469,7 +5447,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 		sch->global_dsqs[node] = dsq;
 	}
 
-	sch->pcpu = alloc_percpu(struct scx_sched_pcpu);
+	sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
+	sch->pcpu = __alloc_percpu(struct_size_t(struct scx_sched_pcpu,
+						 dsp_ctx.buf, sch->dsp_max_batch),
+				   __alignof__(struct scx_sched_pcpu));
 	if (!sch->pcpu) {
 		ret = -ENOMEM;
 		goto err_free_gdsqs;
@@ -5716,16 +5697,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_disable;
 
-	WARN_ON_ONCE(scx_dsp_ctx);
-	scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
-	scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
-						   scx_dsp_max_batch),
-				     __alignof__(struct scx_dsp_ctx));
-	if (!scx_dsp_ctx) {
-		ret = -ENOMEM;
-		goto err_disable;
-	}
-
 	if (ops->timeout_ms)
 		timeout = msecs_to_jiffies(ops->timeout_ms);
 	else
@@ -6703,7 +6674,7 @@ static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p
 static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
 				  u64 dsq_id, u64 enq_flags)
 {
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
+	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
 	struct task_struct *ddsp_task;
 
 	ddsp_task = __this_cpu_read(direct_dispatch_task);
@@ -6712,7 +6683,7 @@ static void scx_dsq_insert_commit(struct scx_sched *sch, struct task_struct *p,
 		return;
 	}
 
-	if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
+	if (unlikely(dspc->cursor >= sch->dsp_max_batch)) {
 		scx_error(sch, "dispatch buffer overflow");
 		return;
 	}
@@ -7030,7 +7001,7 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux)
 	if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
 		return 0;
 
-	return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
+	return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor);
 }
 
 /**
@@ -7042,8 +7013,8 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux)
  */
 __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
 {
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct scx_sched *sch;
+	struct scx_dsp_ctx *dspc;
 
 	guard(rcu)();
 
@@ -7054,6 +7025,8 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
 	if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
 		return;
 
+	dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
+
 	if (dspc->cursor > 0)
 		dspc->cursor--;
 	else
@@ -7077,9 +7050,9 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
  */
 __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux)
 {
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct scx_dispatch_q *dsq;
 	struct scx_sched *sch;
+	struct scx_dsp_ctx *dspc;
 
 	guard(rcu)();
 
@@ -7090,6 +7063,8 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux
 	if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
 		return false;
 
+	dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
+
 	flush_dispatch_buf(sch, dspc->rq);
 
 	dsq = find_user_dsq(sch, dsq_id);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 1da3b9b75d18..dba8c2ce8ed9 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -937,6 +937,21 @@ enum scx_sched_pcpu_flags {
 	SCX_SCHED_PCPU_BYPASSING	= 1LLU << 0,
 };
 
+/* dispatch buf */
+struct scx_dsp_buf_ent {
+	struct task_struct	*task;
+	unsigned long		qseq;
+	u64			dsq_id;
+	u64			enq_flags;
+};
+
+struct scx_dsp_ctx {
+	struct rq		*rq;
+	u32			cursor;
+	u32			nr_tasks;
+	struct scx_dsp_buf_ent	buf[];
+};
+
 struct scx_sched_pcpu {
 	u64			flags;	/* protected by rq lock */
 
@@ -951,6 +966,9 @@ struct scx_sched_pcpu {
 #ifdef CONFIG_EXT_SUB_SCHED
 	u32			bypass_host_seq;
 #endif
+
+	/* must be the last entry - contains flex array */
+	struct scx_dsp_ctx	dsp_ctx;
 };
 
 struct scx_sched {
@@ -978,6 +996,7 @@ struct scx_sched {
 	atomic_t		bypass_dsp_enable_depth;
 
 	bool			aborting;
+	u32			dsp_max_batch;
 	s32			level;
 
 	/*

From cde94c032b32be773ef05db9847be6f02fb123f0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 030/134] sched_ext: Make watchdog sub-sched aware

Currently, the watchdog checks all tasks as if they are all on scx_root.
Move scx_watchdog_timeout inside scx_sched and make check_rq_for_timeouts()
use the timeout from the scx_sched associated with each task.
refresh_watchdog() is added, which determines the timer interval as half of
the shortest watchdog timeouts of all scheds and arms or disarms it as
necessary. Every scx_sched instance has equivalent or better detection
latency while sharing the same timer.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 74 ++++++++++++++++++++++++-------------
 kernel/sched/ext_internal.h |  7 ++++
 2 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 43569d138bd2..1452c20b6483 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -59,11 +59,10 @@ static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
 
 /*
- * The maximum amount of time in jiffies that a task may be runnable without
- * being scheduled on a CPU. If this timeout is exceeded, it will trigger
- * scx_error().
+ * Watchdog interval. All scx_sched's share a single watchdog timer and the
+ * interval is half of the shortest sch->watchdog_timeout.
  */
-static unsigned long scx_watchdog_timeout;
+static unsigned long scx_watchdog_interval;
 
 /*
  * The last time the delayed work was run. This delayed work relies on
@@ -3038,10 +3037,11 @@ static bool check_rq_for_timeouts(struct rq *rq)
 		goto out_unlock;
 
 	list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
+		struct scx_sched *sch = scx_task_sched(p);
 		unsigned long last_runnable = p->scx.runnable_at;
 
 		if (unlikely(time_after(jiffies,
-					last_runnable + READ_ONCE(scx_watchdog_timeout)))) {
+					last_runnable + READ_ONCE(sch->watchdog_timeout)))) {
 			u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
 
 			scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
@@ -3058,6 +3058,7 @@ out_unlock:
 
 static void scx_watchdog_workfn(struct work_struct *work)
 {
+	unsigned long intv;
 	int cpu;
 
 	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
@@ -3068,28 +3069,31 @@ static void scx_watchdog_workfn(struct work_struct *work)
 
 		cond_resched();
 	}
-	queue_delayed_work(system_unbound_wq, to_delayed_work(work),
-			   READ_ONCE(scx_watchdog_timeout) / 2);
+
+	intv = READ_ONCE(scx_watchdog_interval);
+	if (intv < ULONG_MAX)
+		queue_delayed_work(system_unbound_wq, to_delayed_work(work),
+				   intv);
 }
 
 void scx_tick(struct rq *rq)
 {
-	struct scx_sched *sch;
+	struct scx_sched *root;
 	unsigned long last_check;
 
 	if (!scx_enabled())
 		return;
 
-	sch = rcu_dereference_bh(scx_root);
-	if (unlikely(!sch))
+	root = rcu_dereference_bh(scx_root);
+	if (unlikely(!root))
 		return;
 
 	last_check = READ_ONCE(scx_watchdog_timestamp);
 	if (unlikely(time_after(jiffies,
-				last_check + READ_ONCE(scx_watchdog_timeout)))) {
+				last_check + READ_ONCE(root->watchdog_timeout)))) {
 		u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
 
-		scx_exit(sch, SCX_EXIT_ERROR_STALL, 0,
+		scx_exit(root, SCX_EXIT_ERROR_STALL, 0,
 			 "watchdog failed to check in for %u.%03us",
 			 dur_ms / 1000, dur_ms % 1000);
 	}
@@ -4760,6 +4764,26 @@ static void free_kick_syncs(void)
 	}
 }
 
+static void refresh_watchdog(void)
+{
+	struct scx_sched *sch;
+	unsigned long intv = ULONG_MAX;
+
+	/* take the shortest timeout and use its half for watchdog interval */
+	rcu_read_lock();
+	list_for_each_entry_rcu(sch, &scx_sched_all, all)
+		intv = max(min(intv, sch->watchdog_timeout / 2), 1);
+	rcu_read_unlock();
+
+	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
+	WRITE_ONCE(scx_watchdog_interval, intv);
+
+	if (intv < ULONG_MAX)
+		mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv);
+	else
+		cancel_delayed_work_sync(&scx_watchdog_work);
+}
+
 #ifdef CONFIG_EXT_SUB_SCHED
 static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
 
@@ -4798,6 +4822,8 @@ static void scx_sub_disable(struct scx_sched *sch)
 	list_del_rcu(&sch->all);
 	raw_spin_unlock_irq(&scx_sched_lock);
 
+	refresh_watchdog();
+
 	mutex_unlock(&scx_enable_mutex);
 
 	/*
@@ -4932,12 +4958,12 @@ static void scx_root_disable(struct scx_sched *sch)
 	if (sch->ops.exit)
 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
 
-	cancel_delayed_work_sync(&scx_watchdog_work);
-
 	raw_spin_lock_irq(&scx_sched_lock);
 	list_del_rcu(&sch->all);
 	raw_spin_unlock_irq(&scx_sched_lock);
 
+	refresh_watchdog();
+
 	/*
 	 * scx_root clearing must be inside cpus_read_lock(). See
 	 * handle_hotplug().
@@ -5473,6 +5499,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	sch->ancestors[level] = sch;
 	sch->level = level;
 
+	if (ops->timeout_ms)
+		sch->watchdog_timeout = msecs_to_jiffies(ops->timeout_ms);
+	else
+		sch->watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT;
+
 	sch->slice_dfl = SCX_SLICE_DFL;
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
 	init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
@@ -5615,7 +5646,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	struct scx_sched *sch;
 	struct scx_task_iter sti;
 	struct task_struct *p;
-	unsigned long timeout;
 	int i, cpu, ret;
 
 	mutex_lock(&scx_enable_mutex);
@@ -5667,6 +5697,8 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	list_add_tail_rcu(&sch->all, &scx_sched_all);
 	raw_spin_unlock_irq(&scx_sched_lock);
 
+	refresh_watchdog();
+
 	scx_idle_enable(ops);
 
 	if (sch->ops.init) {
@@ -5697,16 +5729,6 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_disable;
 
-	if (ops->timeout_ms)
-		timeout = msecs_to_jiffies(ops->timeout_ms);
-	else
-		timeout = SCX_WATCHDOG_MAX_TIMEOUT;
-
-	WRITE_ONCE(scx_watchdog_timeout, timeout);
-	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
-	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
-			   READ_ONCE(scx_watchdog_timeout) / 2);
-
 	/*
 	 * Once __scx_enabled is set, %current can be switched to SCX anytime.
 	 * This can lead to stalls as some BPF schedulers (e.g. userspace
@@ -5928,6 +5950,8 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 	list_add_tail_rcu(&sch->all, &scx_sched_all);
 	raw_spin_unlock_irq(&scx_sched_lock);
 
+	refresh_watchdog();
+
 	if (sch->level >= SCX_SUB_MAX_DEPTH) {
 		scx_error(sch, "max nesting depth %d violated",
 			  SCX_SUB_MAX_DEPTH);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index dba8c2ce8ed9..9268df30be7f 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1019,6 +1019,13 @@ struct scx_sched {
 	bool			sub_attached;
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
+	/*
+	 * The maximum amount of time in jiffies that a task may be runnable
+	 * without being scheduled on a CPU. If this timeout is exceeded, it
+	 * will trigger scx_error().
+	 */
+	unsigned long		watchdog_timeout;
+
 	atomic_t		exit_kind;
 	struct scx_exit_info	*exit_info;
 

From eff782fddb5cb03c247fdb68b148abfd30cda1fe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 031/134] sched_ext: Convert scx_dump_state() spinlock to raw
 spinlock

The scx_dump_state() function uses a regular spinlock to serialize
access. In a subsequent patch, this function will be called while
holding scx_sched_lock, which is a raw spinlock, creating a lock
nesting violation.

Convert the dump_lock to a raw spinlock and use the guard macro for
cleaner lock management.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1452c20b6483..0e8f5a62ff1c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5220,7 +5220,7 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
 
 static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 {
-	static DEFINE_SPINLOCK(dump_lock);
+	static DEFINE_RAW_SPINLOCK(dump_lock);
 	static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
 	struct scx_sched *sch = scx_root;
 	struct scx_dump_ctx dctx = {
@@ -5232,11 +5232,10 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 	};
 	struct seq_buf s;
 	struct scx_event_stats events;
-	unsigned long flags;
 	char *buf;
 	int cpu;
 
-	spin_lock_irqsave(&dump_lock, flags);
+	guard(raw_spinlock_irqsave)(&dump_lock);
 
 	seq_buf_init(&s, ei->dump, dump_len);
 
@@ -5361,8 +5360,6 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
 		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
 		       trunc_marker, sizeof(trunc_marker));
-
-	spin_unlock_irqrestore(&dump_lock, flags);
 }
 
 static void scx_error_irq_workfn(struct irq_work *irq_work)

From 9276b7ccb2202f1c2324e4346d12fd6df166c747 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 032/134] sched_ext: Support dumping multiple schedulers and
 add scheduler identification

Extend scx_dump_state() to support multiple schedulers and improve task
identification in dumps. The function now takes a specific scheduler to
dump and can optionally filter tasks by scheduler.

scx_dump_task() now displays which scheduler each task belongs to, using
"*" to mark tasks owned by the scheduler being dumped. Sub-schedulers
are identified with their level and cgroup ID.

The SysRq-D handler now iterates through all active schedulers under
scx_sched_lock and dumps each one separately. For SysRq-D dumps, only
tasks owned by each scheduler are dumped to avoid redundancy since all
schedulers are being dumped. Error-triggered dumps continue to dump all
tasks since only that specific scheduler is being dumped.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 54 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 43 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0e8f5a62ff1c..3f237b9da970 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5175,22 +5175,34 @@ static void ops_dump_exit(void)
 	scx_dump_data.cpu = -1;
 }
 
-static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
+static void scx_dump_task(struct scx_sched *sch,
+			  struct seq_buf *s, struct scx_dump_ctx *dctx,
 			  struct task_struct *p, char marker)
 {
 	static unsigned long bt[SCX_EXIT_BT_LEN];
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *task_sch = scx_task_sched(p);
+	const char *own_marker;
+	char sch_id_buf[32];
 	char dsq_id_buf[19] = "(n/a)";
 	unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
 	unsigned int bt_len = 0;
 
+	own_marker = task_sch == sch ? "*" : "";
+
+	if (task_sch->level == 0)
+		scnprintf(sch_id_buf, sizeof(sch_id_buf), "root");
+	else
+		scnprintf(sch_id_buf, sizeof(sch_id_buf), "sub%d-%llu",
+			  task_sch->level, task_sch->ops.sub_cgroup_id);
+
 	if (p->scx.dsq)
 		scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
 			  (unsigned long long)p->scx.dsq->id);
 
 	dump_newline(s);
-	dump_line(s, " %c%c %s[%d] %+ldms",
+	dump_line(s, " %c%c %s[%d] %s%s %+ldms",
 		  marker, task_state_to_char(p), p->comm, p->pid,
+		  own_marker, sch_id_buf,
 		  jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
 	dump_line(s, "      scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
 		  scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
@@ -5218,11 +5230,18 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
 	}
 }
 
-static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
+/*
+ * Dump scheduler state. If @dump_all_tasks is true, dump all tasks regardless
+ * of which scheduler they belong to. If false, only dump tasks owned by @sch.
+ * For SysRq-D dumps, @dump_all_tasks=false since all schedulers are dumped
+ * separately. For error dumps, @dump_all_tasks=true since only the failing
+ * scheduler is dumped.
+ */
+static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
+			   size_t dump_len, bool dump_all_tasks)
 {
 	static DEFINE_RAW_SPINLOCK(dump_lock);
 	static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
-	struct scx_sched *sch = scx_root;
 	struct scx_dump_ctx dctx = {
 		.kind = ei->kind,
 		.exit_code = ei->exit_code,
@@ -5239,6 +5258,14 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 
 	seq_buf_init(&s, ei->dump, dump_len);
 
+#ifdef CONFIG_EXT_SUB_SCHED
+	if (sch->level == 0)
+		dump_line(&s, "%s: root", sch->ops.name);
+	else
+		dump_line(&s, "%s: sub%d-%llu %s",
+			  sch->ops.name, sch->level, sch->ops.sub_cgroup_id,
+			  sch->cgrp_path);
+#endif
 	if (ei->kind == SCX_EXIT_NONE) {
 		dump_line(&s, "Debug dump triggered by %s", ei->reason);
 	} else {
@@ -5331,11 +5358,13 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
 				seq_buf_set_overflow(&s);
 		}
 
-		if (rq->curr->sched_class == &ext_sched_class)
-			scx_dump_task(&s, &dctx, rq->curr, '*');
+		if (rq->curr->sched_class == &ext_sched_class &&
+		    (dump_all_tasks || scx_task_on_sched(sch, rq->curr)))
+			scx_dump_task(sch, &s, &dctx, rq->curr, '*');
 
 		list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
-			scx_dump_task(&s, &dctx, p, ' ');
+			if (dump_all_tasks || scx_task_on_sched(sch, p))
+				scx_dump_task(sch, &s, &dctx, p, ' ');
 	next:
 		rq_unlock_irqrestore(rq, &rf);
 	}
@@ -5368,7 +5397,7 @@ static void scx_error_irq_workfn(struct irq_work *irq_work)
 	struct scx_exit_info *ei = sch->exit_info;
 
 	if (ei->kind >= SCX_EXIT_ERROR)
-		scx_dump_state(ei, sch->ops.exit_dump_len);
+		scx_dump_state(sch, ei, sch->ops.exit_dump_len, true);
 
 	kthread_queue_work(sch->helper, &sch->disable_work);
 }
@@ -6400,9 +6429,12 @@ static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
 static void sysrq_handle_sched_ext_dump(u8 key)
 {
 	struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
+	struct scx_sched *sch;
 
-	if (scx_enabled())
-		scx_dump_state(&ei, 0);
+	guard(raw_spinlock_irqsave)(&scx_sched_lock);
+
+	list_for_each_entry_rcu(sch, &scx_sched_all, all)
+		scx_dump_state(sch, &ei, 0, false);
 }
 
 static const struct sysrq_key_op sysrq_sched_ext_dump_op = {

From 337ec00b1d9c676f637651c2cefddb8612b867ee Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 033/134] sched_ext: Implement cgroup sub-sched enabling and
 disabling

The preceding changes implemented the framework to support cgroup
sub-scheds and updated scheduling paths and kfuncs so that they have
minimal but working support for sub-scheds. However, actual sub-sched
enabling/disabling hasn't been implemented yet and all tasks stayed on
scx_root.

Implement cgroup sub-sched enabling and disabling to actually activate
sub-scheds:

- Both enable and disable operations bypass only the tasks in the subtree
  of the child being enabled or disabled to limit disruptions.

- When enabling, all candidate tasks are first initialized for the child
  sched. Once that succeeds, the tasks are exited for the parent and then
  switched over to the child. This adds a bit of complication but
  guarantees that child scheduler failures are always contained.

- Disabling works the same way in the other direction. However, when the
  parent may fail to initialize a task, disabling is propagated up to the
  parent. While this means that a parent sched fail due to a child sched
  event, the failure can only originate from the parent itself (its
  ops.init_task()). The only effect a malfunctioning child can have on the
  parent is attempting to move the tasks back to the parent.

After this change, although not all the necessary mechanisms are in place
yet, sub-scheds can take control of their tasks and schedule them.

v2: Fix missing scx_cgroup_unlock()/percpu_up_write() in abort path
    (Cheng-Yang Chou).

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/ext.h |   1 +
 kernel/sched/ext.c        | 285 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 280 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 3213e31c7979..f354d7d34306 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -88,6 +88,7 @@ enum scx_ent_flags {
 	SCX_TASK_IN_CUSTODY	= 1 << 1, /* in custody, needs ops.dequeue() when leaving */
 	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
 	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
+	SCX_TASK_SUB_INIT	= 1 << 4, /* task being initialized for a sub sched */
 
 	SCX_TASK_STATE_SHIFT	= 8,	  /* bit 8 and 9 are used to carry scx_task_state */
 	SCX_TASK_STATE_BITS	= 2,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3f237b9da970..70d0f9e8ef61 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -51,6 +51,17 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
 static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0);
 
+#ifdef CONFIG_EXT_SUB_SCHED
+/*
+ * The sub sched being enabled. Used by scx_disable_and_exit_task() to exit
+ * tasks for the sub-sched being enabled. Use a global variable instead of a
+ * per-task field as all enables are serialized.
+ */
+static struct scx_sched *scx_enabling_sub_sched;
+#else
+#define scx_enabling_sub_sched	(struct scx_sched *)NULL
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 /*
  * A monotically increasing sequence number that is incremented every time a
  * scheduler is enabled. This can be used by to check if any custom sched_ext
@@ -3342,6 +3353,17 @@ static void scx_disable_and_exit_task(struct scx_sched *sch,
 {
 	__scx_disable_and_exit_task(sch, p);
 
+	/*
+	 * If set, @p exited between __scx_init_task() and scx_enable_task() in
+	 * scx_sub_enable() and is initialized for both the associated sched and
+	 * its parent. Disable and exit for the child too.
+	 */
+	if ((p->scx.flags & SCX_TASK_SUB_INIT) &&
+	    !WARN_ON_ONCE(!scx_enabling_sub_sched)) {
+		__scx_disable_and_exit_task(scx_enabling_sub_sched, p);
+		p->scx.flags &= ~SCX_TASK_SUB_INIT;
+	}
+
 	scx_set_task_sched(p, NULL);
 	scx_set_task_state(p, SCX_TASK_NONE);
 }
@@ -3377,9 +3399,14 @@ int scx_fork(struct task_struct *p, struct kernel_clone_args *kargs)
 	percpu_rwsem_assert_held(&scx_fork_rwsem);
 
 	if (scx_init_task_enabled) {
-		ret = scx_init_task(scx_root, p, true);
+#ifdef CONFIG_EXT_SUB_SCHED
+		struct scx_sched *sch = kargs->cset->dfl_cgrp->scx_sched;
+#else
+		struct scx_sched *sch = scx_root;
+#endif
+		ret = scx_init_task(sch, p, true);
 		if (!ret)
-			scx_set_task_sched(p, scx_root);
+			scx_set_task_sched(p, sch);
 		return ret;
 	}
 
@@ -4643,9 +4670,9 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
 		struct rq *rq = cpu_rq(cpu);
 		struct task_struct *p, *n;
 
+		raw_spin_lock(&scx_sched_lock);
 		raw_spin_rq_lock(rq);
 
-		raw_spin_lock(&scx_sched_lock);
 		scx_for_each_descendant_pre(pos, sch) {
 			struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
 
@@ -4654,6 +4681,7 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
 			else
 				pcpu->flags &= ~SCX_SCHED_PCPU_BYPASSING;
 		}
+
 		raw_spin_unlock(&scx_sched_lock);
 
 		/*
@@ -4798,23 +4826,139 @@ static void drain_descendants(struct scx_sched *sch)
 	wait_event(scx_unlink_waitq, list_empty(&sch->children));
 }
 
+static void scx_fail_parent(struct scx_sched *sch,
+			    struct task_struct *failed, s32 fail_code)
+{
+	struct scx_sched *parent = scx_parent(sch);
+	struct scx_task_iter sti;
+	struct task_struct *p;
+
+	scx_error(parent, "ops.init_task() failed (%d) for %s[%d] while disabling a sub-scheduler",
+		  fail_code, failed->comm, failed->pid);
+
+	/*
+	 * Once $parent is bypassed, it's safe to put SCX_TASK_NONE tasks into
+	 * it. This may cause downstream failures on the BPF side but $parent is
+	 * dying anyway.
+	 */
+	scx_bypass(parent, true);
+
+	scx_task_iter_start(&sti, sch->cgrp);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		if (scx_task_on_sched(parent, p))
+			continue;
+
+		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+			scx_disable_and_exit_task(sch, p);
+			rcu_assign_pointer(p->scx.sched, parent);
+		}
+	}
+	scx_task_iter_stop(&sti);
+}
+
 static void scx_sub_disable(struct scx_sched *sch)
 {
 	struct scx_sched *parent = scx_parent(sch);
+	struct scx_task_iter sti;
+	struct task_struct *p;
+	int ret;
 
+	/*
+	 * Guarantee forward progress and wait for descendants to be disabled.
+	 * To limit
+	 * disruptions, $parent is not bypassed. Tasks are fully prepped and
+	 * then inserted back into $parent.
+	 */
+	scx_bypass(sch, true);
 	drain_descendants(sch);
 
+	/*
+	 * Here, every runnable task is guaranteed to make forward progress and
+	 * we can safely use blocking synchronization constructs. Actually
+	 * disable ops.
+	 */
 	mutex_lock(&scx_enable_mutex);
 	percpu_down_write(&scx_fork_rwsem);
 	scx_cgroup_lock();
 
 	set_cgroup_sched(sch_cgroup(sch), parent);
 
-	/* TODO - perform actual disabling here */
+	scx_task_iter_start(&sti, sch->cgrp);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		struct rq *rq;
+		struct rq_flags rf;
+
+		/* filter out duplicate visits */
+		if (scx_task_on_sched(parent, p))
+			continue;
+
+		/*
+		 * By the time control reaches here, all descendant schedulers
+		 * should already have been disabled.
+		 */
+		WARN_ON_ONCE(!scx_task_on_sched(sch, p));
+
+		/*
+		 * If $p is about to be freed, nothing prevents $sch from
+		 * unloading before $p reaches sched_ext_free(). Disable and
+		 * exit $p right away.
+		 */
+		if (!tryget_task_struct(p)) {
+			scx_disable_and_exit_task(sch, p);
+			continue;
+		}
+
+		scx_task_iter_unlock(&sti);
+
+		/*
+		 * $p is READY or ENABLED on @sch. Initialize for $parent,
+		 * disable and exit from @sch, and then switch over to $parent.
+		 *
+		 * If a task fails to initialize for $parent, the only available
+		 * action is disabling $parent too. While this allows disabling
+		 * of a child sched to cause the parent scheduler to fail, the
+		 * failure can only originate from ops.init_task() of the
+		 * parent. A child can't directly affect the parent through its
+		 * own failures.
+		 */
+		ret = __scx_init_task(parent, p, false);
+		if (ret) {
+			scx_fail_parent(sch, p, ret);
+			put_task_struct(p);
+			break;
+		}
+
+		rq = task_rq_lock(p, &rf);
+		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+			/*
+			 * $p is initialized for $parent and still attached to
+			 * @sch. Disable and exit for @sch, switch over to
+			 * $parent, override the state to READY to account for
+			 * $p having already been initialized, and then enable.
+			 */
+			scx_disable_and_exit_task(sch, p);
+			scx_set_task_state(p, SCX_TASK_INIT);
+			rcu_assign_pointer(p->scx.sched, parent);
+			scx_set_task_state(p, SCX_TASK_READY);
+			scx_enable_task(parent, p);
+		}
+		task_rq_unlock(rq, p, &rf);
+
+		put_task_struct(p);
+	}
+	scx_task_iter_stop(&sti);
 
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
 
+	/*
+	 * All tasks are moved off of @sch but there may still be on-going
+	 * operations (e.g. ops.select_cpu()). Drain them by flushing RCU. Use
+	 * the expedited version as ancestors may be waiting in bypass mode.
+	 * Also, tell the parent that there is no need to keep running bypass
+	 * DSQs for us.
+	 */
+	synchronize_rcu_expedited();
 	disable_bypass_dsp(sch);
 
 	raw_spin_lock_irq(&scx_sched_lock);
@@ -5933,13 +6077,30 @@ static struct scx_sched *find_parent_sched(struct cgroup *cgrp)
 	return parent;
 }
 
+static bool assert_task_ready_or_enabled(struct task_struct *p)
+{
+	enum scx_task_state state = scx_get_task_state(p);
+
+	switch (state) {
+	case SCX_TASK_READY:
+	case SCX_TASK_ENABLED:
+		return true;
+	default:
+		WARN_ONCE(true, "sched_ext: Invalid task state %d for %s[%d] during enabling sub sched",
+			  state, p->comm, p->pid);
+		return false;
+	}
+}
+
 static void scx_sub_enable_workfn(struct kthread_work *work)
 {
 	struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
 	struct sched_ext_ops *ops = cmd->ops;
 	struct cgroup *cgrp;
 	struct scx_sched *parent, *sch;
-	s32 ret;
+	struct scx_task_iter sti;
+	struct task_struct *p;
+	s32 i, ret;
 
 	mutex_lock(&scx_enable_mutex);
 
@@ -6011,6 +6172,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 	}
 	sch->sub_attached = true;
 
+	scx_bypass(sch, true);
+
+	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
+		if (((void (**)(void))ops)[i])
+			set_bit(i, sch->has_op);
+
 	percpu_down_write(&scx_fork_rwsem);
 	scx_cgroup_lock();
 
@@ -6024,16 +6191,121 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 		goto err_unlock_and_disable;
 	}
 
-	/* TODO - perform actual enabling here */
+	/*
+	 * Initialize tasks for the new child $sch without exiting them for
+	 * $parent so that the tasks can always be reverted back to $parent
+	 * sched on child init failure.
+	 */
+	WARN_ON_ONCE(scx_enabling_sub_sched);
+	scx_enabling_sub_sched = sch;
+
+	scx_task_iter_start(&sti, sch->cgrp);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		struct rq *rq;
+		struct rq_flags rf;
+
+		/*
+		 * Task iteration may visit the same task twice when racing
+		 * against exiting. Use %SCX_TASK_SUB_INIT to mark tasks which
+		 * finished __scx_init_task() and skip if set.
+		 *
+		 * A task may exit and get freed between __scx_init_task()
+		 * completion and scx_enable_task(). In such cases,
+		 * scx_disable_and_exit_task() must exit the task for both the
+		 * parent and child scheds.
+		 */
+		if (p->scx.flags & SCX_TASK_SUB_INIT)
+			continue;
+
+		/* see scx_root_enable() */
+		if (!tryget_task_struct(p))
+			continue;
+
+		if (!assert_task_ready_or_enabled(p)) {
+			ret = -EINVAL;
+			goto abort;
+		}
+
+		scx_task_iter_unlock(&sti);
+
+		/*
+		 * As $p is still on $parent, it can't be transitioned to INIT.
+		 * Let's worry about task state later. Use __scx_init_task().
+		 */
+		ret = __scx_init_task(sch, p, false);
+		if (ret)
+			goto abort;
+
+		rq = task_rq_lock(p, &rf);
+		p->scx.flags |= SCX_TASK_SUB_INIT;
+		task_rq_unlock(rq, p, &rf);
+
+		put_task_struct(p);
+	}
+	scx_task_iter_stop(&sti);
+
+	/*
+	 * All tasks are prepped. Disable/exit tasks for $parent and enable for
+	 * the new @sch.
+	 */
+	scx_task_iter_start(&sti, sch->cgrp);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		/*
+		 * Use clearing of %SCX_TASK_SUB_INIT to detect and skip
+		 * duplicate iterations.
+		 */
+		if (!(p->scx.flags & SCX_TASK_SUB_INIT))
+			continue;
+
+		scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+			/*
+			 * $p must be either READY or ENABLED. If ENABLED,
+			 * __scx_disabled_and_exit_task() first disables and
+			 * makes it READY. However, after exiting $p, it will
+			 * leave $p as READY.
+			 */
+			assert_task_ready_or_enabled(p);
+			__scx_disable_and_exit_task(parent, p);
+
+			/*
+			 * $p is now only initialized for @sch and READY, which
+			 * is what we want. Assign it to @sch and enable.
+			 */
+			rcu_assign_pointer(p->scx.sched, sch);
+			scx_enable_task(sch, p);
+
+			p->scx.flags &= ~SCX_TASK_SUB_INIT;
+		}
+	}
+	scx_task_iter_stop(&sti);
+
+	scx_enabling_sub_sched = NULL;
 
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
 
+	scx_bypass(sch, false);
+
 	pr_info("sched_ext: BPF sub-scheduler \"%s\" enabled\n", sch->ops.name);
 	kobject_uevent(&sch->kobj, KOBJ_ADD);
 	ret = 0;
 	goto out_unlock;
 
+abort:
+	put_task_struct(p);
+	scx_task_iter_stop(&sti);
+	scx_enabling_sub_sched = NULL;
+
+	scx_task_iter_start(&sti, sch->cgrp);
+	while ((p = scx_task_iter_next_locked(&sti))) {
+		if (p->scx.flags & SCX_TASK_SUB_INIT) {
+			__scx_disable_and_exit_task(sch, p);
+			p->scx.flags &= ~SCX_TASK_SUB_INIT;
+		}
+	}
+	scx_task_iter_stop(&sti);
+	scx_cgroup_unlock();
+	percpu_up_write(&scx_fork_rwsem);
 out_put_cgrp:
 	cgroup_put(cgrp);
 out_unlock:
@@ -6042,6 +6314,7 @@ out_unlock:
 	return;
 
 err_unlock_and_disable:
+	/* we'll soon enter disable path, keep bypass on */
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
 err_disable:

From 7f5fcd47dd62ba7e150468ca05d30bfd26feb306 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 034/134] sched_ext: Add scx_sched back pointer to
 scx_sched_pcpu

Add a back pointer from scx_sched_pcpu to scx_sched. This will be used by
the next patch to make scx_bpf_reenqueue_local() sub-sched aware.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 3 +++
 kernel/sched/ext_internal.h | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 70d0f9e8ef61..e8378d59ddae 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5655,6 +5655,9 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	for_each_possible_cpu(cpu)
 		init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
 
+	for_each_possible_cpu(cpu)
+		per_cpu_ptr(sch->pcpu, cpu)->sch = sch;
+
 	sch->helper = kthread_run_worker(0, "sched_ext_helper");
 	if (IS_ERR(sch->helper)) {
 		ret = PTR_ERR(sch->helper);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 9268df30be7f..69d6e9b4d78e 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -933,6 +933,8 @@ struct scx_event_stats {
 	s64		SCX_EV_SUB_BYPASS_DISPATCH;
 };
 
+struct scx_sched;
+
 enum scx_sched_pcpu_flags {
 	SCX_SCHED_PCPU_BYPASSING	= 1LLU << 0,
 };
@@ -953,6 +955,7 @@ struct scx_dsp_ctx {
 };
 
 struct scx_sched_pcpu {
+	struct scx_sched	*sch;
 	u64			flags;	/* protected by rq lock */
 
 	/*

From 0d8c551dd5de1c157600da05a01e3147115dfbb4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 035/134] sched_ext: Make scx_bpf_reenqueue_local() sub-sched
 aware

scx_bpf_reenqueue_local() currently re-enqueues all tasks on the local DSQ
regardless of which sub-scheduler owns them. With multiple sub-schedulers,
each should only re-enqueue tasks it owns or are owned by its descendants.

Replace the per-rq boolean flag with a lock-free linked list to track
per-scheduler reenqueue requests. Filter tasks in reenq_local() using
hierarchical ownership checks and block deferrals during bypass to prevent
use on dead schedulers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 73 ++++++++++++++++++++++++++++++-------
 kernel/sched/ext_internal.h |  1 +
 kernel/sched/sched.h        |  2 +-
 3 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index e8378d59ddae..f10a9667b491 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -185,7 +185,7 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
 
 static void process_ddsp_deferred_locals(struct rq *rq);
 static bool task_dead_and_done(struct task_struct *p);
-static u32 reenq_local(struct rq *rq);
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq);
 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -991,9 +991,16 @@ static void run_deferred(struct rq *rq)
 {
 	process_ddsp_deferred_locals(rq);
 
-	if (local_read(&rq->scx.reenq_local_deferred)) {
-		local_set(&rq->scx.reenq_local_deferred, 0);
-		reenq_local(rq);
+	if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
+		struct llist_node *llist =
+			llist_del_all(&rq->scx.deferred_reenq_locals);
+		struct scx_sched_pcpu *pos, *next;
+
+		llist_for_each_entry_safe(pos, next, llist,
+					  deferred_reenq_locals_node) {
+			init_llist_node(&pos->deferred_reenq_locals_node);
+			reenq_local(pos->sch, rq);
+		}
 	}
 }
 
@@ -4082,7 +4089,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_sched *sch = container_of(rcu_work, struct scx_sched, rcu_work);
 	struct rhashtable_iter rht_iter;
 	struct scx_dispatch_q *dsq;
-	int node;
+	int cpu, node;
 
 	irq_work_sync(&sch->error_irq_work);
 	kthread_destroy_worker(sch->helper);
@@ -4094,6 +4101,17 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 		cgroup_put(sch_cgroup(sch));
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
+	/*
+	 * $sch would have entered bypass mode before the RCU grace period. As
+	 * that blocks new deferrals, all deferred_reenq_locals_node's must be
+	 * off-list by now.
+	 */
+	for_each_possible_cpu(cpu) {
+		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+		WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
+	}
+
 	free_percpu(sch->pcpu);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -5655,8 +5673,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	for_each_possible_cpu(cpu)
 		init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
 
-	for_each_possible_cpu(cpu)
-		per_cpu_ptr(sch->pcpu, cpu)->sch = sch;
+	for_each_possible_cpu(cpu) {
+		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
+
+		pcpu->sch = sch;
+		init_llist_node(&pcpu->deferred_reenq_locals_node);
+	}
 
 	sch->helper = kthread_run_worker(0, "sched_ext_helper");
 	if (IS_ERR(sch->helper)) {
@@ -6957,6 +6979,7 @@ void __init init_sched_ext_class(void)
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
+		init_llist_head(&rq->scx.deferred_reenq_locals);
 		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
 		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
 
@@ -7528,7 +7551,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
 	.set			= &scx_kfunc_ids_dispatch,
 };
 
-static u32 reenq_local(struct rq *rq)
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
 {
 	LIST_HEAD(tasks);
 	u32 nr_enqueued = 0;
@@ -7543,6 +7566,8 @@ static u32 reenq_local(struct rq *rq)
 	 */
 	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
 				 scx.dsq_list.node) {
+		struct scx_sched *task_sch = scx_task_sched(p);
+
 		/*
 		 * If @p is being migrated, @p's current CPU may not agree with
 		 * its allowed CPUs and the migration_cpu_stop is about to
@@ -7557,6 +7582,9 @@ static u32 reenq_local(struct rq *rq)
 		if (p->migration_pending)
 			continue;
 
+		if (!scx_is_descendant(task_sch, sch))
+			continue;
+
 		dispatch_dequeue(rq, p);
 		list_add_tail(&p->scx.dsq_list.node, &tasks);
 	}
@@ -7599,7 +7627,7 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
 	rq = cpu_rq(smp_processor_id());
 	lockdep_assert_rq_held(rq);
 
-	return reenq_local(rq);
+	return reenq_local(sch, rq);
 }
 
 __bpf_kfunc_end_defs();
@@ -8170,20 +8198,39 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
 
 /**
  * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  *
  * Iterate over all of the tasks currently enqueued on the local DSQ of the
  * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
  * anywhere.
  */
-__bpf_kfunc void scx_bpf_reenqueue_local___v2(void)
+__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
 {
+	unsigned long flags;
+	struct scx_sched *sch;
 	struct rq *rq;
+	struct llist_node *lnode;
 
-	guard(preempt)();
+	raw_local_irq_save(flags);
+
+	sch = scx_prog_sched(aux);
+	if (unlikely(!sch))
+		goto out_irq_restore;
+
+	/*
+	 * Allowing reenqueue-locals doesn't make sense while bypassing. This
+	 * also blocks from new reenqueues to be scheduled on dead scheds.
+	 */
+	if (unlikely(sch->bypass_depth))
+		goto out_irq_restore;
 
 	rq = this_rq();
-	local_set(&rq->scx.reenq_local_deferred, 1);
+	lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
+	if (!llist_on_list(lnode))
+		llist_add(lnode, &rq->scx.deferred_reenq_locals);
 	schedule_deferred(rq);
+out_irq_restore:
+	raw_local_irq_restore(flags);
 }
 
 /**
@@ -8608,7 +8655,7 @@ BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 69d6e9b4d78e..aac051e27f7f 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -965,6 +965,7 @@ struct scx_sched_pcpu {
 	 */
 	struct scx_event_stats	event_stats;
 
+	struct llist_node	deferred_reenq_locals_node;
 	struct scx_dispatch_q	bypass_dsq;
 #ifdef CONFIG_EXT_SUB_SCHED
 	u32			bypass_host_seq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 596f6713cf7e..7f3b07872e15 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,7 +805,7 @@ struct scx_rq {
 	cpumask_var_t		cpus_to_preempt;
 	cpumask_var_t		cpus_to_wait;
 	unsigned long		kick_sync;
-	local_t			reenq_local_deferred;
+	struct llist_head	deferred_reenq_locals;
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;

From 54be8de4236a52b301825cb51c6d5fdecb2fd6b8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 036/134] sched_ext: Factor out scx_link_sched() and
 scx_unlink_sched()

Factor out scx_link_sched() and scx_unlink_sched() functions to reduce
code duplication in the scheduler enable/disable paths.

No functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 53 +++++++++++++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f10a9667b491..8674a5fa5437 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4830,6 +4830,33 @@ static void refresh_watchdog(void)
 		cancel_delayed_work_sync(&scx_watchdog_work);
 }
 
+static void scx_link_sched(struct scx_sched *sch)
+{
+	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+#ifdef CONFIG_EXT_SUB_SCHED
+		struct scx_sched *parent = scx_parent(sch);
+		if (parent)
+			list_add_tail(&sch->sibling, &parent->children);
+#endif	/* CONFIG_EXT_SUB_SCHED */
+		list_add_tail_rcu(&sch->all, &scx_sched_all);
+	}
+
+	refresh_watchdog();
+}
+
+static void scx_unlink_sched(struct scx_sched *sch)
+{
+	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
+#ifdef CONFIG_EXT_SUB_SCHED
+		if (scx_parent(sch))
+			list_del_init(&sch->sibling);
+#endif	/* CONFIG_EXT_SUB_SCHED */
+		list_del_rcu(&sch->all);
+	}
+
+	refresh_watchdog();
+}
+
 #ifdef CONFIG_EXT_SUB_SCHED
 static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
 
@@ -4979,12 +5006,7 @@ static void scx_sub_disable(struct scx_sched *sch)
 	synchronize_rcu_expedited();
 	disable_bypass_dsp(sch);
 
-	raw_spin_lock_irq(&scx_sched_lock);
-	list_del_init(&sch->sibling);
-	list_del_rcu(&sch->all);
-	raw_spin_unlock_irq(&scx_sched_lock);
-
-	refresh_watchdog();
+	scx_unlink_sched(sch);
 
 	mutex_unlock(&scx_enable_mutex);
 
@@ -5120,11 +5142,7 @@ static void scx_root_disable(struct scx_sched *sch)
 	if (sch->ops.exit)
 		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
 
-	raw_spin_lock_irq(&scx_sched_lock);
-	list_del_rcu(&sch->all);
-	raw_spin_unlock_irq(&scx_sched_lock);
-
-	refresh_watchdog();
+	scx_unlink_sched(sch);
 
 	/*
 	 * scx_root clearing must be inside cpus_read_lock(). See
@@ -5888,11 +5906,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	 */
 	rcu_assign_pointer(scx_root, sch);
 
-	raw_spin_lock_irq(&scx_sched_lock);
-	list_add_tail_rcu(&sch->all, &scx_sched_all);
-	raw_spin_unlock_irq(&scx_sched_lock);
-
-	refresh_watchdog();
+	scx_link_sched(sch);
 
 	scx_idle_enable(ops);
 
@@ -6157,12 +6171,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 		goto out_put_cgrp;
 	}
 
-	raw_spin_lock_irq(&scx_sched_lock);
-	list_add_tail(&sch->sibling, &parent->children);
-	list_add_tail_rcu(&sch->all, &scx_sched_all);
-	raw_spin_unlock_irq(&scx_sched_lock);
-
-	refresh_watchdog();
+	scx_link_sched(sch);
 
 	if (sch->level >= SCX_SUB_MAX_DEPTH) {
 		scx_error(sch, "max nesting depth %d violated",

From 25037af712eb04eb92a89440852029a50eea8d82 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 037/134] sched_ext: Add rhashtable lookup for sub-schedulers

Add rhashtable-based lookup for sub-schedulers indexed by cgroup_id to
enable efficient scheduler discovery in preparation for multiple scheduler
support. The hash table allows quick lookup of the appropriate scheduler
instance when processing tasks from different cgroups.

This extends scx_link_sched() to register sub-schedulers in the hash table
and scx_unlink_sched() to remove them. A new scx_find_sub_sched() function
provides the lookup interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 50 +++++++++++++++++++++++++++++++++----
 kernel/sched/ext_internal.h |  2 ++
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 8674a5fa5437..9db5002a2f4b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -27,6 +27,16 @@ struct scx_sched __rcu *scx_root;
  */
 static LIST_HEAD(scx_sched_all);
 
+#ifdef CONFIG_EXT_SUB_SCHED
+static const struct rhashtable_params scx_sched_hash_params = {
+	.key_len		= sizeof_field(struct scx_sched, ops.sub_cgroup_id),
+	.key_offset		= offsetof(struct scx_sched, ops.sub_cgroup_id),
+	.head_offset		= offsetof(struct scx_sched, hash_node),
+};
+
+static struct rhashtable scx_sched_hash;
+#endif
+
 /*
  * During exit, a task may schedule after losing its PIDs. When disabling the
  * BPF scheduler, we need to be able to iterate tasks in every state to
@@ -287,6 +297,12 @@ static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos,
 	return NULL;
 }
 
+static struct scx_sched *scx_find_sub_sched(u64 cgroup_id)
+{
+	return rhashtable_lookup(&scx_sched_hash, &cgroup_id,
+				 scx_sched_hash_params);
+}
+
 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
 {
 	rcu_assign_pointer(p->scx.sched, sch);
@@ -294,6 +310,7 @@ static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch)
 #else	/* CONFIG_EXT_SUB_SCHED */
 static struct scx_sched *scx_parent(struct scx_sched *sch) { return NULL; }
 static struct scx_sched *scx_next_descendant_pre(struct scx_sched *pos, struct scx_sched *root) { return pos ? NULL : root; }
+static struct scx_sched *scx_find_sub_sched(u64 cgroup_id) { return NULL; }
 static void scx_set_task_sched(struct task_struct *p, struct scx_sched *sch) {}
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
@@ -4830,26 +4847,41 @@ static void refresh_watchdog(void)
 		cancel_delayed_work_sync(&scx_watchdog_work);
 }
 
-static void scx_link_sched(struct scx_sched *sch)
+static s32 scx_link_sched(struct scx_sched *sch)
 {
 	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
 #ifdef CONFIG_EXT_SUB_SCHED
 		struct scx_sched *parent = scx_parent(sch);
-		if (parent)
+		s32 ret;
+
+		if (parent) {
+			ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
+					&sch->hash_node, scx_sched_hash_params);
+			if (ret) {
+				scx_error(sch, "failed to insert into scx_sched_hash (%d)", ret);
+				return ret;
+			}
+
 			list_add_tail(&sch->sibling, &parent->children);
+		}
 #endif	/* CONFIG_EXT_SUB_SCHED */
+
 		list_add_tail_rcu(&sch->all, &scx_sched_all);
 	}
 
 	refresh_watchdog();
+	return 0;
 }
 
 static void scx_unlink_sched(struct scx_sched *sch)
 {
 	scoped_guard(raw_spinlock_irq, &scx_sched_lock) {
 #ifdef CONFIG_EXT_SUB_SCHED
-		if (scx_parent(sch))
+		if (scx_parent(sch)) {
+			rhashtable_remove_fast(&scx_sched_hash, &sch->hash_node,
+					       scx_sched_hash_params);
 			list_del_init(&sch->sibling);
+		}
 #endif	/* CONFIG_EXT_SUB_SCHED */
 		list_del_rcu(&sch->all);
 	}
@@ -5906,7 +5938,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	 */
 	rcu_assign_pointer(scx_root, sch);
 
-	scx_link_sched(sch);
+	ret = scx_link_sched(sch);
+	if (ret)
+		goto err_disable;
 
 	scx_idle_enable(ops);
 
@@ -6171,7 +6205,9 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 		goto out_put_cgrp;
 	}
 
-	scx_link_sched(sch);
+	ret = scx_link_sched(sch);
+	if (ret)
+		goto err_disable;
 
 	if (sch->level >= SCX_SUB_MAX_DEPTH) {
 		scx_error(sch, "max nesting depth %d violated",
@@ -6999,6 +7035,10 @@ void __init init_sched_ext_class(void)
 	register_sysrq_key('S', &sysrq_sched_ext_reset_op);
 	register_sysrq_key('D', &sysrq_sched_ext_dump_op);
 	INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
+
+#ifdef CONFIG_EXT_SUB_SCHED
+	BUG_ON(rhashtable_init(&scx_sched_hash, &scx_sched_hash_params));
+#endif	/* CONFIG_EXT_SUB_SCHED */
 }
 
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index aac051e27f7f..4cb97093b872 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1014,6 +1014,8 @@ struct scx_sched {
 	struct list_head	all;
 
 #ifdef CONFIG_EXT_SUB_SCHED
+	struct rhash_head	hash_node;
+
 	struct list_head	children;
 	struct list_head	sibling;
 	struct cgroup		*cgrp;

From 4f8b122848dbc353a193de0fa707bc40b5f067ff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2026 07:58:04 -1000
Subject: [PATCH 038/134] sched_ext: Add basic building blocks for nested
 sub-scheduler dispatching

This is an early-stage partial implementation that demonstrates the core
building blocks for nested sub-scheduler dispatching. While significant
work remains in the enqueue path and other areas, this patch establishes
the fundamental mechanisms needed for hierarchical scheduler operation.

The key building blocks introduced include:

- Private stack support for ops.dispatch() to prevent stack overflow when
  walking down nested schedulers during dispatch operations

- scx_bpf_sub_dispatch() kfunc that allows parent schedulers to trigger
  dispatch operations on their direct child schedulers

- Proper parent-child relationship validation to ensure dispatch requests
  are only made to legitimate child schedulers

- Updated scx_dispatch_sched() to handle both nested and non-nested
  invocations with appropriate kf_mask handling

The qmap scheduler is updated to demonstrate the functionality by calling
scx_bpf_sub_dispatch() on registered child schedulers when it has no
tasks in its own queues.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c                       | 120 ++++++++++++++++++++---
 kernel/sched/sched.h                     |   3 +
 tools/sched_ext/include/scx/common.bpf.h |   1 +
 tools/sched_ext/scx_qmap.bpf.c           |  37 ++++++-
 4 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9db5002a2f4b..e25b3593dd30 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2444,8 +2444,14 @@ static inline void maybe_queue_balance_callback(struct rq *rq)
 	rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING;
 }
 
-static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
-			       struct task_struct *prev)
+/*
+ * One user of this function is scx_bpf_dispatch() which can be called
+ * recursively as sub-sched dispatches nest. Always inline to reduce stack usage
+ * from the call frame.
+ */
+static __always_inline bool
+scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
+		   struct task_struct *prev, bool nested)
 {
 	struct scx_dsp_ctx *dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
 	int nr_loops = SCX_DSP_MAX_LOOPS;
@@ -2499,8 +2505,23 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 	do {
 		dspc->nr_tasks = 0;
 
-		SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
-			    prev_on_sch ? prev : NULL);
+		if (nested) {
+			/*
+			 * If nested, don't update kf_mask as the originating
+			 * invocation would already have set it up.
+			 */
+			SCX_CALL_OP(sch, 0, dispatch, rq, cpu,
+				    prev_on_sch ? prev : NULL);
+		} else {
+			/*
+			 * If not nested, stash @prev so that nested invocations
+			 * can access it.
+			 */
+			rq->scx.sub_dispatch_prev = prev;
+			SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
+				    prev_on_sch ? prev : NULL);
+			rq->scx.sub_dispatch_prev = NULL;
+		}
 
 		flush_dispatch_buf(sch, rq);
 
@@ -2541,7 +2562,7 @@ static bool scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 
 static int balance_one(struct rq *rq, struct task_struct *prev)
 {
-	struct scx_sched *sch = scx_root, *pos;
+	struct scx_sched *sch = scx_root;
 	s32 cpu = cpu_of(rq);
 
 	lockdep_assert_rq_held(rq);
@@ -2585,13 +2606,8 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	if (rq->scx.local_dsq.nr)
 		goto has_tasks;
 
-	/*
-	 * TEMPORARY - Dispatch all scheds. This will be replaced by BPF-driven
-	 * hierarchical operation.
-	 */
-	list_for_each_entry_rcu(pos, &scx_sched_all, all)
-		if (scx_dispatch_sched(pos, rq, prev))
-			goto has_tasks;
+	if (scx_dispatch_sched(sch, rq, prev, false))
+		goto has_tasks;
 
 	/*
 	 * Didn't find another task to run. Keep running @prev unless
@@ -4942,9 +4958,8 @@ static void scx_sub_disable(struct scx_sched *sch)
 
 	/*
 	 * Guarantee forward progress and wait for descendants to be disabled.
-	 * To limit
-	 * disruptions, $parent is not bypassed. Tasks are fully prepped and
-	 * then inserted back into $parent.
+	 * To limit disruptions, $parent is not bypassed. Tasks are fully
+	 * prepped and then inserted back into $parent.
 	 */
 	scx_bypass(sch, true);
 	drain_descendants(sch);
@@ -6580,6 +6595,20 @@ static int bpf_scx_init_member(const struct btf_type *t,
 	return 0;
 }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+static void scx_pstack_recursion_on_dispatch(struct bpf_prog *prog)
+{
+	struct scx_sched *sch;
+
+	guard(rcu)();
+	sch = scx_prog_sched(prog->aux);
+	if (unlikely(!sch))
+		return;
+
+	scx_error(sch, "dispatch recursion detected");
+}
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 static int bpf_scx_check_member(const struct btf_type *t,
 				const struct btf_member *member,
 				const struct bpf_prog *prog)
@@ -6605,6 +6634,22 @@ static int bpf_scx_check_member(const struct btf_type *t,
 			return -EINVAL;
 	}
 
+#ifdef CONFIG_EXT_SUB_SCHED
+	/*
+	 * Enable private stack for operations that can nest along the
+	 * hierarchy.
+	 *
+	 * XXX - Ideally, we should only do this for scheds that allow
+	 * sub-scheds and sub-scheds themselves but I don't know how to access
+	 * struct_ops from here.
+	 */
+	switch (moff) {
+	case offsetof(struct sched_ext_ops, dispatch):
+		prog->aux->priv_stack_requested = true;
+		prog->aux->recursion_detected = scx_pstack_recursion_on_dispatch;
+	}
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 	return 0;
 }
 
@@ -7583,6 +7628,48 @@ __bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter,
 			    p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ);
 }
 
+#ifdef CONFIG_EXT_SUB_SCHED
+/**
+ * scx_bpf_sub_dispatch - Trigger dispatching on a child scheduler
+ * @cgroup_id: cgroup ID of the child scheduler to dispatch
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Allows a parent scheduler to trigger dispatching on one of its direct
+ * child schedulers. The child scheduler runs its dispatch operation to
+ * move tasks from dispatch queues to the local runqueue.
+ *
+ * Returns: true on success, false if cgroup_id is invalid, not a direct
+ * child, or caller lacks dispatch permission.
+ */
+__bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *aux)
+{
+	struct rq *this_rq = this_rq();
+	struct scx_sched *parent, *child;
+
+	guard(rcu)();
+	parent = scx_prog_sched(aux);
+	if (unlikely(!parent))
+		return false;
+
+	if (!scx_kf_allowed(parent, SCX_KF_DISPATCH))
+		return false;
+
+	child = scx_find_sub_sched(cgroup_id);
+
+	if (unlikely(!child))
+		return false;
+
+	if (unlikely(scx_parent(child) != parent)) {
+		scx_error(parent, "trying to dispatch a distant sub-sched on cgroup %llu",
+			  cgroup_id);
+		return false;
+	}
+
+	return scx_dispatch_sched(child, this_rq, this_rq->scx.sub_dispatch_prev,
+				  true);
+}
+#endif	/* CONFIG_EXT_SUB_SCHED */
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
@@ -7593,6 +7680,9 @@ BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+#ifdef CONFIG_EXT_SUB_SCHED
+BTF_ID_FLAGS(func, scx_bpf_sub_dispatch, KF_IMPLICIT_ARGS)
+#endif
 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7f3b07872e15..ebe971d12cb8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -805,6 +805,9 @@ struct scx_rq {
 	cpumask_var_t		cpus_to_preempt;
 	cpumask_var_t		cpus_to_wait;
 	unsigned long		kick_sync;
+
+	struct task_struct	*sub_dispatch_prev;
+
 	struct llist_head	deferred_reenq_locals;
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 821d5791bd42..eba4d87345e0 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -101,6 +101,7 @@ struct rq *scx_bpf_locked_rq(void) __ksym;
 struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
 u64 scx_bpf_now(void) __ksym __weak;
 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
+bool scx_bpf_sub_dispatch(u64 cgroup_id) __ksym __weak;
 
 /*
  * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index ff6ff34177ab..91b8eac83f52 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -48,6 +48,9 @@ const volatile bool suppress_dump;
 u64 nr_highpri_queued;
 u32 test_error_cnt;
 
+#define MAX_SUB_SCHEDS		8
+u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+
 UEI_DEFINE(uei);
 
 struct qmap {
@@ -451,6 +454,12 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 		cpuc->dsp_cnt = 0;
 	}
 
+	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+		if (sub_sched_cgroup_ids[i] &&
+		    scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+			return;
+	}
+
 	/*
 	 * No other tasks. @prev will keep running. Update its core_sched_seq as
 	 * if the task were enqueued and dispatched immediately.
@@ -895,7 +904,32 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
 
 s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
 {
-	return 0;
+	s32 i;
+
+	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+		if (!sub_sched_cgroup_ids[i]) {
+			sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id;
+			bpf_printk("attaching sub-sched[%d] on %s",
+				   i, args->cgroup_path);
+			return 0;
+		}
+	}
+
+	return -ENOSPC;
+}
+
+void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
+{
+	s32 i;
+
+	for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+		if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+			sub_sched_cgroup_ids[i] = 0;
+			bpf_printk("detaching sub-sched[%d] on %s",
+				   i, args->cgroup_path);
+			break;
+		}
+	}
 }
 
 SCX_OPS_DEFINE(qmap_ops,
@@ -914,6 +948,7 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .cgroup_set_weight	= (void *)qmap_cgroup_set_weight,
 	       .cgroup_set_bandwidth	= (void *)qmap_cgroup_set_bandwidth,
 	       .sub_attach		= (void *)qmap_sub_attach,
+	       .sub_detach		= (void *)qmap_sub_detach,
 	       .cpu_online		= (void *)qmap_cpu_online,
 	       .cpu_offline		= (void *)qmap_cpu_offline,
 	       .init			= (void *)qmap_init,

From 03f5304aad0f90907475437be8052e7e70376319 Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Sat, 7 Mar 2026 10:56:31 +0100
Subject: [PATCH 039/134] sched_ext: Pass full dequeue flags to ops.quiescent()

ops.quiescent() is invoked with the same deq_flags as ops.dequeue(), so
the BPF scheduler is able to distinguish sleep vs property changes in
both callbacks.

However, dequeue_task_scx() receives deq_flags as an int from the
sched_class interface, so SCX flags above bit 32 (%SCX_DEQ_SCHED_CHANGE)
are truncated. ops_dequeue() reconstructs the full u64 for ops.dequeue(),
but ops.quiescent() is still called with the original int and can never
see %SCX_DEQ_SCHED_CHANGE.

Fix this by constructing the full u64 deq_flags in dequeue_task_scx()
(renaming the int parameter to core_deq_flags) and passing the complete
flags to both ops_dequeue() and ops.quiescent().

Fixes: ebf1ccff79c4 ("sched_ext: Fix ops.dequeue() semantics")
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 72b7a87f66ec..47f65041b363 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1782,14 +1782,6 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 {
 	struct scx_sched *sch = scx_task_sched(p);
 	unsigned long opss;
-	u64 op_deq_flags = deq_flags;
-
-	/*
-	 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property
-	 * change (not sleep or core-sched pick).
-	 */
-	if (!(op_deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC)))
-		op_deq_flags |= SCX_DEQ_SCHED_CHANGE;
 
 	/* dequeue is always temporary, don't reset runnable_at */
 	clr_task_runnable(p, false);
@@ -1846,12 +1838,20 @@ static void ops_dequeue(struct rq *rq, struct task_struct *p, u64 deq_flags)
 	 * NONE but the task may still have %SCX_TASK_IN_CUSTODY set until
 	 * it is enqueued on the destination.
 	 */
-	call_task_dequeue(sch, rq, p, op_deq_flags);
+	call_task_dequeue(sch, rq, p, deq_flags);
 }
 
-static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
+static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_flags)
 {
 	struct scx_sched *sch = scx_task_sched(p);
+	u64 deq_flags = core_deq_flags;
+
+	/*
+	 * Set %SCX_DEQ_SCHED_CHANGE when the dequeue is due to a property
+	 * change (not sleep or core-sched pick).
+	 */
+	if (!(deq_flags & (DEQUEUE_SLEEP | SCX_DEQ_CORE_SCHED_EXEC)))
+		deq_flags |= SCX_DEQ_SCHED_CHANGE;
 
 	if (!(p->scx.flags & SCX_TASK_QUEUED)) {
 		WARN_ON_ONCE(task_runnable(p));

From 26b9c7c70027f011a5f39cef9c3cf44539310a6d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:49 -1000
Subject: [PATCH 040/134] sched_ext: Relocate scx_bpf_task_cgroup() and its
 BTF_ID to the end of kfunc section

Move scx_bpf_task_cgroup() kfunc definition and its BTF_ID entry to the end
of the kfunc section before __bpf_kfunc_end_defs() for cleaner code
organization.

No functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 78 +++++++++++++++++++++++-----------------------
 1 file changed, 39 insertions(+), 39 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 47f65041b363..fa8968164fd2 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8627,43 +8627,6 @@ __bpf_kfunc struct task_struct *scx_bpf_cpu_curr(s32 cpu, const struct bpf_prog_
 	return rcu_dereference(cpu_rq(cpu)->curr);
 }
 
-/**
- * scx_bpf_task_cgroup - Return the sched cgroup of a task
- * @p: task of interest
- * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
- *
- * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
- * from the scheduler's POV. SCX operations should use this function to
- * determine @p's current cgroup as, unlike following @p->cgroups,
- * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
- * rq-locked operations. Can be called on the parameter tasks of rq-locked
- * operations. The restriction guarantees that @p's rq is locked by the caller.
- */
-#ifdef CONFIG_CGROUP_SCHED
-__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p,
-					       const struct bpf_prog_aux *aux)
-{
-	struct task_group *tg = p->sched_task_group;
-	struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
-	struct scx_sched *sch;
-
-	guard(rcu)();
-
-	sch = scx_prog_sched(aux);
-	if (unlikely(!sch))
-		goto out;
-
-	if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
-		goto out;
-
-	cgrp = tg_cgrp(tg);
-
-out:
-	cgroup_get(cgrp);
-	return cgrp;
-}
-#endif
-
 /**
  * scx_bpf_now - Returns a high-performance monotonically non-decreasing
  * clock for the current CPU. The clock returned is in nanoseconds.
@@ -8778,6 +8741,43 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
 	memcpy(events, &e_sys, events__sz);
 }
 
+#ifdef CONFIG_CGROUP_SCHED
+/**
+ * scx_bpf_task_cgroup - Return the sched cgroup of a task
+ * @p: task of interest
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
+ * from the scheduler's POV. SCX operations should use this function to
+ * determine @p's current cgroup as, unlike following @p->cgroups,
+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
+ * rq-locked operations. Can be called on the parameter tasks of rq-locked
+ * operations. The restriction guarantees that @p's rq is locked by the caller.
+ */
+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p,
+					       const struct bpf_prog_aux *aux)
+{
+	struct task_group *tg = p->sched_task_group;
+	struct cgroup *cgrp = &cgrp_dfl_root.cgrp;
+	struct scx_sched *sch;
+
+	guard(rcu)();
+
+	sch = scx_prog_sched(aux);
+	if (unlikely(!sch))
+		goto out;
+
+	if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
+		goto out;
+
+	cgrp = tg_cgrp(tg);
+
+out:
+	cgroup_get(cgrp);
+	return cgrp;
+}
+#endif	/* CONFIG_CGROUP_SCHED */
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -8807,11 +8807,11 @@ BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_cpu_rq, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_locked_rq, KF_IMPLICIT_ARGS | KF_RET_NULL)
 BTF_ID_FLAGS(func, scx_bpf_cpu_curr, KF_IMPLICIT_ARGS | KF_RET_NULL | KF_RCU_PROTECTED)
+BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_events)
 #ifdef CONFIG_CGROUP_SCHED
 BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_IMPLICIT_ARGS | KF_RCU | KF_ACQUIRE)
 #endif
-BTF_ID_FLAGS(func, scx_bpf_now)
-BTF_ID_FLAGS(func, scx_bpf_events)
 BTF_KFUNCS_END(scx_kfunc_ids_any)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_any = {

From d4ae868c6b7d6aaa29c86c4f72f68d2252709178 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:49 -1000
Subject: [PATCH 041/134] sched_ext: Wrap global DSQs in per-node structure

Global DSQs are currently stored as an array of scx_dispatch_q pointers,
one per NUMA node. To allow adding more per-node data structures, wrap the
global DSQ in scx_sched_pnode and replace global_dsqs with pnode array.

NUMA-aware allocation is maintained. No functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 32 ++++++++++++++++----------------
 kernel/sched/ext_internal.h |  6 +++++-
 2 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fa8968164fd2..28db05944390 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -344,7 +344,7 @@ static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor)
 static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
 					      struct task_struct *p)
 {
-	return sch->global_dsqs[cpu_to_node(task_cpu(p))];
+	return &sch->pnode[cpu_to_node(task_cpu(p))]->global_dsq;
 }
 
 static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
@@ -2228,7 +2228,7 @@ static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
 {
 	int node = cpu_to_node(cpu_of(rq));
 
-	return consume_dispatch_q(sch, rq, sch->global_dsqs[node]);
+	return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq);
 }
 
 /**
@@ -4147,8 +4147,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	free_percpu(sch->pcpu);
 
 	for_each_node_state(node, N_POSSIBLE)
-		kfree(sch->global_dsqs[node]);
-	kfree(sch->global_dsqs);
+		kfree(sch->pnode[node]);
+	kfree(sch->pnode);
 
 	rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
 	do {
@@ -5706,23 +5706,23 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	if (ret < 0)
 		goto err_free_ei;
 
-	sch->global_dsqs = kzalloc_objs(sch->global_dsqs[0], nr_node_ids);
-	if (!sch->global_dsqs) {
+	sch->pnode = kzalloc_objs(sch->pnode[0], nr_node_ids);
+	if (!sch->pnode) {
 		ret = -ENOMEM;
 		goto err_free_hash;
 	}
 
 	for_each_node_state(node, N_POSSIBLE) {
-		struct scx_dispatch_q *dsq;
+		struct scx_sched_pnode *pnode;
 
-		dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node);
-		if (!dsq) {
+		pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node);
+		if (!pnode) {
 			ret = -ENOMEM;
-			goto err_free_gdsqs;
+			goto err_free_pnode;
 		}
 
-		init_dsq(dsq, SCX_DSQ_GLOBAL, sch);
-		sch->global_dsqs[node] = dsq;
+		init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch);
+		sch->pnode[node] = pnode;
 	}
 
 	sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
@@ -5731,7 +5731,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 				   __alignof__(struct scx_sched_pcpu));
 	if (!sch->pcpu) {
 		ret = -ENOMEM;
-		goto err_free_gdsqs;
+		goto err_free_pnode;
 	}
 
 	for_each_possible_cpu(cpu)
@@ -5818,10 +5818,10 @@ err_stop_helper:
 	kthread_destroy_worker(sch->helper);
 err_free_pcpu:
 	free_percpu(sch->pcpu);
-err_free_gdsqs:
+err_free_pnode:
 	for_each_node_state(node, N_POSSIBLE)
-		kfree(sch->global_dsqs[node]);
-	kfree(sch->global_dsqs);
+		kfree(sch->pnode[node]);
+	kfree(sch->pnode);
 err_free_hash:
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 err_free_ei:
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 4cb97093b872..9e5ebd00ea0c 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -975,6 +975,10 @@ struct scx_sched_pcpu {
 	struct scx_dsp_ctx	dsp_ctx;
 };
 
+struct scx_sched_pnode {
+	struct scx_dispatch_q	global_dsq;
+};
+
 struct scx_sched {
 	struct sched_ext_ops	ops;
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
@@ -988,7 +992,7 @@ struct scx_sched {
 	 * per-node split isn't sufficient, it can be further split.
 	 */
 	struct rhashtable	dsq_hash;
-	struct scx_dispatch_q	**global_dsqs;
+	struct scx_sched_pnode	**pnode;
 	struct scx_sched_pcpu __percpu *pcpu;
 
 	u64			slice_dfl;

From 363cd075e97058f70404ed34955864b99530cbdb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:49 -1000
Subject: [PATCH 042/134] sched_ext: Factor out pnode allocation and
 deallocation into helpers

Extract pnode allocation and deallocation logic into alloc_pnode() and
free_pnode() helpers. This simplifies scx_alloc_and_add_sched() and prepares
for adding more per-node initialization and cleanup in subsequent patches.

No functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 28db05944390..4c8480829993 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4113,6 +4113,7 @@ static const struct attribute_group scx_global_attr_group = {
 	.attrs = scx_global_attrs,
 };
 
+static void free_pnode(struct scx_sched_pnode *pnode);
 static void free_exit_info(struct scx_exit_info *ei);
 
 static void scx_sched_free_rcu_work(struct work_struct *work)
@@ -4147,7 +4148,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	free_percpu(sch->pcpu);
 
 	for_each_node_state(node, N_POSSIBLE)
-		kfree(sch->pnode[node]);
+		free_pnode(sch->pnode[node]);
 	kfree(sch->pnode);
 
 	rhashtable_walk_enter(&sch->dsq_hash, &rht_iter);
@@ -5684,6 +5685,24 @@ static int alloc_kick_syncs(void)
 	return 0;
 }
 
+static void free_pnode(struct scx_sched_pnode *pnode)
+{
+	kfree(pnode);
+}
+
+static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node)
+{
+	struct scx_sched_pnode *pnode;
+
+	pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node);
+	if (!pnode)
+		return NULL;
+
+	init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch);
+
+	return pnode;
+}
+
 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 						 struct cgroup *cgrp,
 						 struct scx_sched *parent)
@@ -5713,16 +5732,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	}
 
 	for_each_node_state(node, N_POSSIBLE) {
-		struct scx_sched_pnode *pnode;
-
-		pnode = kzalloc_node(sizeof(*pnode), GFP_KERNEL, node);
-		if (!pnode) {
+		sch->pnode[node] = alloc_pnode(sch, node);
+		if (!sch->pnode[node]) {
 			ret = -ENOMEM;
 			goto err_free_pnode;
 		}
-
-		init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch);
-		sch->pnode[node] = pnode;
 	}
 
 	sch->dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
@@ -5820,7 +5834,7 @@ err_free_pcpu:
 	free_percpu(sch->pcpu);
 err_free_pnode:
 	for_each_node_state(node, N_POSSIBLE)
-		kfree(sch->pnode[node]);
+		free_pnode(sch->pnode[node]);
 	kfree(sch->pnode);
 err_free_hash:
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);

From 053d27fba582b6c33531aa4d4f7d4e7ee73f193a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:49 -1000
Subject: [PATCH 043/134] sched_ext: Change find_global_dsq() to take CPU
 number instead of task

Change find_global_dsq() to take a CPU number directly instead of a task
pointer. This prepares for callers where the CPU is available but the task is
not.

No functional changes.

v2: Rename tcpu to cpu in find_global_dsq() (Emil Tsalapatis).

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4c8480829993..35bc5490ee78 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -341,10 +341,9 @@ static bool scx_is_descendant(struct scx_sched *sch, struct scx_sched *ancestor)
 	for ((pos) = scx_next_descendant_pre(NULL, (root)); (pos);		\
 	     (pos) = scx_next_descendant_pre((pos), (root)))
 
-static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
-					      struct task_struct *p)
+static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, s32 cpu)
 {
-	return &sch->pnode[cpu_to_node(task_cpu(p))]->global_dsq;
+	return &sch->pnode[cpu_to_node(cpu)]->global_dsq;
 }
 
 static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
@@ -1266,7 +1265,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
 			scx_error(sch, "attempting to dispatch to a destroyed dsq");
 			/* fall back to the global dsq */
 			raw_spin_unlock(&dsq->lock);
-			dsq = find_global_dsq(sch, p);
+			dsq = find_global_dsq(sch, task_cpu(p));
 			raw_spin_lock(&dsq->lock);
 		}
 	}
@@ -1474,7 +1473,7 @@ static void dispatch_dequeue_locked(struct task_struct *p,
 
 static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
 						    struct rq *rq, u64 dsq_id,
-						    struct task_struct *p)
+						    s32 tcpu)
 {
 	struct scx_dispatch_q *dsq;
 
@@ -1485,20 +1484,19 @@ static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
 		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
 
 		if (!ops_cpu_valid(sch, cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
-			return find_global_dsq(sch, p);
+			return find_global_dsq(sch, tcpu);
 
 		return &cpu_rq(cpu)->scx.local_dsq;
 	}
 
 	if (dsq_id == SCX_DSQ_GLOBAL)
-		dsq = find_global_dsq(sch, p);
+		dsq = find_global_dsq(sch, tcpu);
 	else
 		dsq = find_user_dsq(sch, dsq_id);
 
 	if (unlikely(!dsq)) {
-		scx_error(sch, "non-existent DSQ 0x%llx for %s[%d]",
-			  dsq_id, p->comm, p->pid);
-		return find_global_dsq(sch, p);
+		scx_error(sch, "non-existent DSQ 0x%llx", dsq_id);
+		return find_global_dsq(sch, tcpu);
 	}
 
 	return dsq;
@@ -1540,7 +1538,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
 {
 	struct rq *rq = task_rq(p);
 	struct scx_dispatch_q *dsq =
-		find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
+		find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
 
 	touch_core_sched_dispatch(rq, p);
 
@@ -1683,7 +1681,7 @@ local:
 	dsq = &rq->scx.local_dsq;
 	goto enqueue;
 global:
-	dsq = find_global_dsq(sch, p);
+	dsq = find_global_dsq(sch, task_cpu(p));
 	goto enqueue;
 bypass:
 	dsq = bypass_enq_target_dsq(sch, task_cpu(p));
@@ -2139,7 +2137,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
 		dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
 		if (src_rq != dst_rq &&
 		    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
-			dst_dsq = find_global_dsq(sch, p);
+			dst_dsq = find_global_dsq(sch, task_cpu(p));
 			dst_rq = src_rq;
 		}
 	} else {
@@ -2268,7 +2266,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
 
 	if (src_rq != dst_rq &&
 	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
-		dispatch_enqueue(sch, rq, find_global_dsq(sch, p), p,
+		dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p,
 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
 		return;
 	}
@@ -2406,7 +2404,7 @@ retry:
 
 	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
 
-	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, p);
+	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, task_cpu(p));
 
 	if (dsq->id == SCX_DSQ_LOCAL)
 		dispatch_to_local_dsq(sch, rq, dsq, p, enq_flags);
@@ -2646,7 +2644,7 @@ static void process_ddsp_deferred_locals(struct rq *rq)
 
 		list_del_init(&p->scx.dsq_list.node);
 
-		dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, p);
+		dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
 		if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
 			dispatch_to_local_dsq(sch, rq, dsq, p,
 					      p->scx.ddsp_enq_flags);
@@ -7409,7 +7407,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	}
 
 	/* @p is still on $src_dsq and stable, determine the destination */
-	dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, p);
+	dst_dsq = find_dsq_for_dispatch(sch, this_rq, dsq_id, task_cpu(p));
 
 	/*
 	 * Apply vtime and slice updates before moving so that the new time is

From ea4593e97a1c0b4b84125dd570b8694bda45c3e0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:49 -1000
Subject: [PATCH 044/134] sched_ext: Relocate run_deferred() and its callees

Previously, both process_ddsp_deferred_locals() and reenq_local() required
forward declarations. Reorganize so that only run_deferred() needs to be
declared. Both callees are grouped right before run_deferred() for better
locality. This reduces forward declaration clutter and will ease adding more
to the run_deferred() path.

No functional changes.

v2: Also relocate process_ddsp_deferred_locals() next to run_deferred()
    (Daniel Jordan).

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 186 ++++++++++++++++++++++-----------------------
 1 file changed, 92 insertions(+), 94 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 35bc5490ee78..18f8fd0d249d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -193,9 +193,8 @@ MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microsecond
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched_ext.h>
 
-static void process_ddsp_deferred_locals(struct rq *rq);
+static void run_deferred(struct rq *rq);
 static bool task_dead_and_done(struct task_struct *p);
-static u32 reenq_local(struct scx_sched *sch, struct rq *rq);
 static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
 static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind);
 static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
@@ -1003,23 +1002,6 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err
 	return -EPROTO;
 }
 
-static void run_deferred(struct rq *rq)
-{
-	process_ddsp_deferred_locals(rq);
-
-	if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
-		struct llist_node *llist =
-			llist_del_all(&rq->scx.deferred_reenq_locals);
-		struct scx_sched_pcpu *pos, *next;
-
-		llist_for_each_entry_safe(pos, next, llist,
-					  deferred_reenq_locals_node) {
-			init_llist_node(&pos->deferred_reenq_locals_node);
-			reenq_local(pos->sch, rq);
-		}
-	}
-}
-
 static void deferred_bal_cb_workfn(struct rq *rq)
 {
 	run_deferred(rq);
@@ -2624,33 +2606,6 @@ has_tasks:
 	return true;
 }
 
-static void process_ddsp_deferred_locals(struct rq *rq)
-{
-	struct task_struct *p;
-
-	lockdep_assert_rq_held(rq);
-
-	/*
-	 * Now that @rq can be unlocked, execute the deferred enqueueing of
-	 * tasks directly dispatched to the local DSQs of other CPUs. See
-	 * direct_dispatch(). Keep popping from the head instead of using
-	 * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
-	 * temporarily.
-	 */
-	while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
-				struct task_struct, scx.dsq_list.node))) {
-		struct scx_sched *sch = scx_task_sched(p);
-		struct scx_dispatch_q *dsq;
-
-		list_del_init(&p->scx.dsq_list.node);
-
-		dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
-		if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
-			dispatch_to_local_dsq(sch, rq, dsq, p,
-					      p->scx.ddsp_enq_flags);
-	}
-}
-
 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 {
 	struct scx_sched *sch = scx_task_sched(p);
@@ -3071,7 +3026,6 @@ static void rq_offline_scx(struct rq *rq)
 	rq->scx.flags &= ~SCX_RQ_ONLINE;
 }
 
-
 static bool check_rq_for_timeouts(struct rq *rq)
 {
 	struct scx_sched *sch;
@@ -3611,6 +3565,97 @@ int scx_check_setscheduler(struct task_struct *p, int policy)
 	return 0;
 }
 
+static void process_ddsp_deferred_locals(struct rq *rq)
+{
+	struct task_struct *p;
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * Now that @rq can be unlocked, execute the deferred enqueueing of
+	 * tasks directly dispatched to the local DSQs of other CPUs. See
+	 * direct_dispatch(). Keep popping from the head instead of using
+	 * list_for_each_entry_safe() as dispatch_local_dsq() may unlock @rq
+	 * temporarily.
+	 */
+	while ((p = list_first_entry_or_null(&rq->scx.ddsp_deferred_locals,
+				struct task_struct, scx.dsq_list.node))) {
+		struct scx_sched *sch = scx_task_sched(p);
+		struct scx_dispatch_q *dsq;
+
+		list_del_init(&p->scx.dsq_list.node);
+
+		dsq = find_dsq_for_dispatch(sch, rq, p->scx.ddsp_dsq_id, task_cpu(p));
+		if (!WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
+			dispatch_to_local_dsq(sch, rq, dsq, p,
+					      p->scx.ddsp_enq_flags);
+	}
+}
+
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
+{
+	LIST_HEAD(tasks);
+	u32 nr_enqueued = 0;
+	struct task_struct *p, *n;
+
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * The BPF scheduler may choose to dispatch tasks back to
+	 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
+	 * first to avoid processing the same tasks repeatedly.
+	 */
+	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
+				 scx.dsq_list.node) {
+		struct scx_sched *task_sch = scx_task_sched(p);
+
+		/*
+		 * If @p is being migrated, @p's current CPU may not agree with
+		 * its allowed CPUs and the migration_cpu_stop is about to
+		 * deactivate and re-activate @p anyway. Skip re-enqueueing.
+		 *
+		 * While racing sched property changes may also dequeue and
+		 * re-enqueue a migrating task while its current CPU and allowed
+		 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
+		 * the current local DSQ for running tasks and thus are not
+		 * visible to the BPF scheduler.
+		 */
+		if (p->migration_pending)
+			continue;
+
+		if (!scx_is_descendant(task_sch, sch))
+			continue;
+
+		dispatch_dequeue(rq, p);
+		list_add_tail(&p->scx.dsq_list.node, &tasks);
+	}
+
+	list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
+		list_del_init(&p->scx.dsq_list.node);
+		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+		nr_enqueued++;
+	}
+
+	return nr_enqueued;
+}
+
+static void run_deferred(struct rq *rq)
+{
+	process_ddsp_deferred_locals(rq);
+
+	if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
+		struct llist_node *llist =
+			llist_del_all(&rq->scx.deferred_reenq_locals);
+		struct scx_sched_pcpu *pos, *next;
+
+		llist_for_each_entry_safe(pos, next, llist,
+					  deferred_reenq_locals_node) {
+			init_llist_node(&pos->deferred_reenq_locals_node);
+			reenq_local(pos->sch, rq);
+		}
+	}
+}
+
 #ifdef CONFIG_NO_HZ_FULL
 bool scx_can_stop_tick(struct rq *rq)
 {
@@ -7701,53 +7746,6 @@ static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
 	.set			= &scx_kfunc_ids_dispatch,
 };
 
-static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
-{
-	LIST_HEAD(tasks);
-	u32 nr_enqueued = 0;
-	struct task_struct *p, *n;
-
-	lockdep_assert_rq_held(rq);
-
-	/*
-	 * The BPF scheduler may choose to dispatch tasks back to
-	 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
-	 * first to avoid processing the same tasks repeatedly.
-	 */
-	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
-				 scx.dsq_list.node) {
-		struct scx_sched *task_sch = scx_task_sched(p);
-
-		/*
-		 * If @p is being migrated, @p's current CPU may not agree with
-		 * its allowed CPUs and the migration_cpu_stop is about to
-		 * deactivate and re-activate @p anyway. Skip re-enqueueing.
-		 *
-		 * While racing sched property changes may also dequeue and
-		 * re-enqueue a migrating task while its current CPU and allowed
-		 * CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
-		 * the current local DSQ for running tasks and thus are not
-		 * visible to the BPF scheduler.
-		 */
-		if (p->migration_pending)
-			continue;
-
-		if (!scx_is_descendant(task_sch, sch))
-			continue;
-
-		dispatch_dequeue(rq, p);
-		list_add_tail(&p->scx.dsq_list.node, &tasks);
-	}
-
-	list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
-		list_del_init(&p->scx.dsq_list.node);
-		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
-		nr_enqueued++;
-	}
-
-	return nr_enqueued;
-}
-
 __bpf_kfunc_start_defs();
 
 /**

From 8c1b9453fde6ed3490508974d8134355e8c3c476 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:49 -1000
Subject: [PATCH 045/134] sched_ext: Convert deferred_reenq_locals from llist
 to regular list

The deferred reenqueue local mechanism uses an llist (lockless list) for
collecting schedulers that need their local DSQs re-enqueued. Convert to a
regular list protected by a raw_spinlock.

The llist was used for its lockless properties, but the upcoming changes to
support remote reenqueue require more complex list operations that are
difficult to implement correctly with lockless data structures. A spinlock-
protected regular list provides the necessary flexibility.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 57 ++++++++++++++++++++++++-------------
 kernel/sched/ext_internal.h |  2 +-
 kernel/sched/sched.h        |  3 +-
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 18f8fd0d249d..9c3129a45103 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3639,21 +3639,35 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
 	return nr_enqueued;
 }
 
+static void process_deferred_reenq_locals(struct rq *rq)
+{
+	lockdep_assert_rq_held(rq);
+
+	while (true) {
+		struct scx_sched *sch;
+
+		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+			struct scx_sched_pcpu *sch_pcpu =
+				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
+							 struct scx_sched_pcpu,
+							 deferred_reenq_local_node);
+			if (!sch_pcpu)
+				return;
+
+			sch = sch_pcpu->sch;
+			list_del_init(&sch_pcpu->deferred_reenq_local_node);
+		}
+
+		reenq_local(sch, rq);
+	}
+}
+
 static void run_deferred(struct rq *rq)
 {
 	process_ddsp_deferred_locals(rq);
 
-	if (!llist_empty(&rq->scx.deferred_reenq_locals)) {
-		struct llist_node *llist =
-			llist_del_all(&rq->scx.deferred_reenq_locals);
-		struct scx_sched_pcpu *pos, *next;
-
-		llist_for_each_entry_safe(pos, next, llist,
-					  deferred_reenq_locals_node) {
-			init_llist_node(&pos->deferred_reenq_locals_node);
-			reenq_local(pos->sch, rq);
-		}
-	}
+	if (!list_empty(&rq->scx.deferred_reenq_locals))
+		process_deferred_reenq_locals(rq);
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -4179,13 +4193,13 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	/*
 	 * $sch would have entered bypass mode before the RCU grace period. As
-	 * that blocks new deferrals, all deferred_reenq_locals_node's must be
+	 * that blocks new deferrals, all deferred_reenq_local_node's must be
 	 * off-list by now.
 	 */
 	for_each_possible_cpu(cpu) {
 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 
-		WARN_ON_ONCE(llist_on_list(&pcpu->deferred_reenq_locals_node));
+		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local_node));
 	}
 
 	free_percpu(sch->pcpu);
@@ -5798,7 +5812,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 
 		pcpu->sch = sch;
-		init_llist_node(&pcpu->deferred_reenq_locals_node);
+		INIT_LIST_HEAD(&pcpu->deferred_reenq_local_node);
 	}
 
 	sch->helper = kthread_run_worker(0, "sched_ext_helper");
@@ -7125,7 +7139,8 @@ void __init init_sched_ext_class(void)
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
-		init_llist_head(&rq->scx.deferred_reenq_locals);
+		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
+		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
 		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
 		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
 
@@ -8357,7 +8372,6 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
 	unsigned long flags;
 	struct scx_sched *sch;
 	struct rq *rq;
-	struct llist_node *lnode;
 
 	raw_local_irq_save(flags);
 
@@ -8373,9 +8387,14 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
 		goto out_irq_restore;
 
 	rq = this_rq();
-	lnode = &this_cpu_ptr(sch->pcpu)->deferred_reenq_locals_node;
-	if (!llist_on_list(lnode))
-		llist_add(lnode, &rq->scx.deferred_reenq_locals);
+	scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+		struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);
+
+		if (list_empty(&pcpu->deferred_reenq_local_node))
+			list_move_tail(&pcpu->deferred_reenq_local_node,
+				       &rq->scx.deferred_reenq_locals);
+	}
+
 	schedule_deferred(rq);
 out_irq_restore:
 	raw_local_irq_restore(flags);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 9e5ebd00ea0c..80d40a9c5ad9 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -965,7 +965,7 @@ struct scx_sched_pcpu {
 	 */
 	struct scx_event_stats	event_stats;
 
-	struct llist_node	deferred_reenq_locals_node;
+	struct list_head	deferred_reenq_local_node;
 	struct scx_dispatch_q	bypass_dsq;
 #ifdef CONFIG_EXT_SUB_SCHED
 	u32			bypass_host_seq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ebe971d12cb8..0794852524e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -808,7 +808,8 @@ struct scx_rq {
 
 	struct task_struct	*sub_dispatch_prev;
 
-	struct llist_head	deferred_reenq_locals;
+	raw_spinlock_t		deferred_reenq_lock;
+	struct list_head	deferred_reenq_locals;	/* scheds requesting reenq of local DSQ */
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;

From 0c4df54ad8cd52cc165fe2c51fec87e311372699 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:49 -1000
Subject: [PATCH 046/134] sched_ext: Wrap deferred_reenq_local_node into a
 struct

Wrap the deferred_reenq_local_node list_head into struct
scx_deferred_reenq_local. More fields will be added and this allows using a
shorthand pointer to access them.

No functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 22 +++++++++++++---------
 kernel/sched/ext_internal.h |  6 +++++-
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9c3129a45103..3548cf61477a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3647,15 +3647,19 @@ static void process_deferred_reenq_locals(struct rq *rq)
 		struct scx_sched *sch;
 
 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
-			struct scx_sched_pcpu *sch_pcpu =
+			struct scx_deferred_reenq_local *drl =
 				list_first_entry_or_null(&rq->scx.deferred_reenq_locals,
-							 struct scx_sched_pcpu,
-							 deferred_reenq_local_node);
-			if (!sch_pcpu)
+							 struct scx_deferred_reenq_local,
+							 node);
+			struct scx_sched_pcpu *sch_pcpu;
+
+			if (!drl)
 				return;
 
+			sch_pcpu = container_of(drl, struct scx_sched_pcpu,
+						deferred_reenq_local);
 			sch = sch_pcpu->sch;
-			list_del_init(&sch_pcpu->deferred_reenq_local_node);
+			list_del_init(&drl->node);
 		}
 
 		reenq_local(sch, rq);
@@ -4199,7 +4203,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	for_each_possible_cpu(cpu) {
 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 
-		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local_node));
+		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));
 	}
 
 	free_percpu(sch->pcpu);
@@ -5812,7 +5816,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 
 		pcpu->sch = sch;
-		INIT_LIST_HEAD(&pcpu->deferred_reenq_local_node);
+		INIT_LIST_HEAD(&pcpu->deferred_reenq_local.node);
 	}
 
 	sch->helper = kthread_run_worker(0, "sched_ext_helper");
@@ -8390,8 +8394,8 @@ __bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
 	scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
 		struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);
 
-		if (list_empty(&pcpu->deferred_reenq_local_node))
-			list_move_tail(&pcpu->deferred_reenq_local_node,
+		if (list_empty(&pcpu->deferred_reenq_local.node))
+			list_move_tail(&pcpu->deferred_reenq_local.node,
 				       &rq->scx.deferred_reenq_locals);
 	}
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 80d40a9c5ad9..1a8d61097cab 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -954,6 +954,10 @@ struct scx_dsp_ctx {
 	struct scx_dsp_buf_ent	buf[];
 };
 
+struct scx_deferred_reenq_local {
+	struct list_head	node;
+};
+
 struct scx_sched_pcpu {
 	struct scx_sched	*sch;
 	u64			flags;	/* protected by rq lock */
@@ -965,7 +969,7 @@ struct scx_sched_pcpu {
 	 */
 	struct scx_event_stats	event_stats;
 
-	struct list_head	deferred_reenq_local_node;
+	struct scx_deferred_reenq_local deferred_reenq_local;
 	struct scx_dispatch_q	bypass_dsq;
 #ifdef CONFIG_EXT_SUB_SCHED
 	u32			bypass_host_seq;

From 9c34c5074d1bc22072fc7f9c86b0028f7e273b2c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:49 -1000
Subject: [PATCH 047/134] sched_ext: Introduce scx_bpf_dsq_reenq() for remote
 local DSQ reenqueue

scx_bpf_reenqueue_local() can only trigger re-enqueue of the current CPU's
local DSQ. Introduce scx_bpf_dsq_reenq() which takes a DSQ ID and can target
any local DSQ including remote CPUs via SCX_DSQ_LOCAL_ON | cpu. This will be
expanded to support user DSQs by future changes.

scx_bpf_reenqueue_local() is reimplemented as a simple wrapper around
scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0) and may be deprecated in the future.

Update compat.bpf.h with a compatibility shim and scx_qmap to test the new
functionality.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c                       | 118 ++++++++++++++---------
 tools/sched_ext/include/scx/compat.bpf.h |  21 ++++
 tools/sched_ext/scx_qmap.bpf.c           |  11 ++-
 tools/sched_ext/scx_qmap.c               |   5 +-
 4 files changed, 106 insertions(+), 49 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3548cf61477a..efcf7ef72a3e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1080,6 +1080,31 @@ static void schedule_deferred_locked(struct rq *rq)
 	schedule_deferred(rq);
 }
 
+static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq)
+{
+	/*
+	 * Allowing reenqueues doesn't make sense while bypassing. This also
+	 * blocks from new reenqueues to be scheduled on dead scheds.
+	 */
+	if (unlikely(READ_ONCE(sch->bypass_depth)))
+		return;
+
+	if (dsq->id == SCX_DSQ_LOCAL) {
+		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+		struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
+		struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;
+
+		scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) {
+			if (list_empty(&drl->node))
+				list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
+		}
+
+		schedule_deferred(rq);
+	} else {
+		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
+	}
+}
+
 /**
  * touch_core_sched - Update timestamp used for core-sched task ordering
  * @rq: rq to read clock from, must be locked
@@ -7774,9 +7799,6 @@ __bpf_kfunc_start_defs();
  * Iterate over all of the tasks currently enqueued on the local DSQ of the
  * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of
  * processed tasks. Can only be called from ops.cpu_release().
- *
- * COMPAT: Will be removed in v6.23 along with the ___v2 suffix on the void
- * returning variant that can be called from anywhere.
  */
 __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
 {
@@ -8206,6 +8228,52 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id,
 	return rcu_dereference(dsq->first_task);
 }
 
+/**
+ * scx_bpf_dsq_reenq - Re-enqueue tasks on a DSQ
+ * @dsq_id: DSQ to re-enqueue
+ * @reenq_flags: %SCX_RENQ_*
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Iterate over all of the tasks currently enqueued on the DSQ identified by
+ * @dsq_id, and re-enqueue them in the BPF scheduler. The following DSQs are
+ * supported:
+ *
+ * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
+ *
+ * Re-enqueues are performed asynchronously. Can be called from anywhere.
+ */
+__bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags,
+				   const struct bpf_prog_aux *aux)
+{
+	struct scx_sched *sch;
+	struct scx_dispatch_q *dsq;
+
+	guard(preempt)();
+
+	sch = scx_prog_sched(aux);
+	if (unlikely(!sch))
+		return;
+
+	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id());
+	schedule_dsq_reenq(sch, dsq);
+}
+
+/**
+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
+ * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ *
+ * Iterate over all of the tasks currently enqueued on the local DSQ of the
+ * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
+ * anywhere.
+ *
+ * This is now a special case of scx_bpf_dsq_reenq() and may be removed in the
+ * future.
+ */
+__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
+{
+	scx_bpf_dsq_reenq(SCX_DSQ_LOCAL, 0, aux);
+}
+
 __bpf_kfunc_end_defs();
 
 static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf,
@@ -8363,47 +8431,6 @@ __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
 		ops_dump_flush();
 }
 
-/**
- * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ
- * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
- *
- * Iterate over all of the tasks currently enqueued on the local DSQ of the
- * caller's CPU, and re-enqueue them in the BPF scheduler. Can be called from
- * anywhere.
- */
-__bpf_kfunc void scx_bpf_reenqueue_local___v2(const struct bpf_prog_aux *aux)
-{
-	unsigned long flags;
-	struct scx_sched *sch;
-	struct rq *rq;
-
-	raw_local_irq_save(flags);
-
-	sch = scx_prog_sched(aux);
-	if (unlikely(!sch))
-		goto out_irq_restore;
-
-	/*
-	 * Allowing reenqueue-locals doesn't make sense while bypassing. This
-	 * also blocks from new reenqueues to be scheduled on dead scheds.
-	 */
-	if (unlikely(sch->bypass_depth))
-		goto out_irq_restore;
-
-	rq = this_rq();
-	scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
-		struct scx_sched_pcpu *pcpu = this_cpu_ptr(sch->pcpu);
-
-		if (list_empty(&pcpu->deferred_reenq_local.node))
-			list_move_tail(&pcpu->deferred_reenq_local.node,
-				       &rq->scx.deferred_reenq_locals);
-	}
-
-	schedule_deferred(rq);
-out_irq_restore:
-	raw_local_irq_restore(flags);
-}
-
 /**
  * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU
  * @cpu: CPU of interest
@@ -8820,13 +8847,14 @@ BTF_ID_FLAGS(func, scx_bpf_kick_cpu, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
 BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_IMPLICIT_ARGS | KF_RCU_PROTECTED | KF_RET_NULL)
+BTF_ID_FLAGS(func, scx_bpf_dsq_reenq, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_IMPLICIT_ARGS | KF_ITER_NEW | KF_RCU_PROTECTED)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL)
 BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY)
 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_IMPLICIT_ARGS)
-BTF_ID_FLAGS(func, scx_bpf_reenqueue_local___v2, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_cpuperf_set, KF_IMPLICIT_ARGS)
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index f2969c3061a7..2d3985be7e2c 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -375,6 +375,27 @@ static inline void scx_bpf_reenqueue_local(void)
 		scx_bpf_reenqueue_local___v1();
 }
 
+/*
+ * v6.20: New scx_bpf_dsq_reenq() that allows re-enqueues on more DSQs. This
+ * will eventually deprecate scx_bpf_reenqueue_local().
+ */
+void scx_bpf_dsq_reenq___compat(u64 dsq_id, u64 reenq_flags, const struct bpf_prog_aux *aux__prog) __ksym __weak;
+
+static inline bool __COMPAT_has_generic_reenq(void)
+{
+	return bpf_ksym_exists(scx_bpf_dsq_reenq___compat);
+}
+
+static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags)
+{
+	if (bpf_ksym_exists(scx_bpf_dsq_reenq___compat))
+		scx_bpf_dsq_reenq___compat(dsq_id, reenq_flags, NULL);
+	else if (dsq_id == SCX_DSQ_LOCAL && reenq_flags == 0)
+		scx_bpf_reenqueue_local();
+	else
+		scx_bpf_error("kernel too old to reenqueue foreign local or user DSQs");
+}
+
 /*
  * Define sched_ext_ops. This may be expanded to define multiple variants for
  * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 91b8eac83f52..83e8289e8c0c 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -131,7 +131,7 @@ struct {
 } cpu_ctx_stor SEC(".maps");
 
 /* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
 u64 nr_core_sched_execed;
 u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
 u32 cpuperf_min, cpuperf_avg, cpuperf_max;
@@ -206,8 +206,11 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	void *ring;
 	s32 cpu;
 
-	if (enq_flags & SCX_ENQ_REENQ)
+	if (enq_flags & SCX_ENQ_REENQ) {
 		__sync_fetch_and_add(&nr_reenqueued, 1);
+		if (scx_bpf_task_cpu(p) == 0)
+			__sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
+	}
 
 	if (p->flags & PF_KTHREAD) {
 		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
@@ -561,6 +564,10 @@ int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
 	case 2: /* SCHED_RR */
 	case 6: /* SCHED_DEADLINE */
 		scx_bpf_reenqueue_local();
+
+		/* trigger re-enqueue on CPU0 just to exercise LOCAL_ON */
+		if (__COMPAT_has_generic_reenq())
+			scx_bpf_dsq_reenq(SCX_DSQ_LOCAL_ON | 0, 0);
 	}
 
 	return 0;
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 5d762d10f4db..9252037284d3 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -137,9 +137,10 @@ int main(int argc, char **argv)
 		long nr_enqueued = skel->bss->nr_enqueued;
 		long nr_dispatched = skel->bss->nr_dispatched;
 
-		printf("stats  : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
+		printf("stats  : enq=%lu dsp=%lu delta=%ld reenq/cpu0=%"PRIu64"/%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
 		       nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
-		       skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
+		       skel->bss->nr_reenqueued, skel->bss->nr_reenqueued_cpu0,
+		       skel->bss->nr_dequeued,
 		       skel->bss->nr_core_sched_execed,
 		       skel->bss->nr_ddsp_from_enq);
 		printf("         exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",

From ffa7ae0724e4ee548c87a56dc7a7a0ab7ee0c1d6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:49 -1000
Subject: [PATCH 048/134] sched_ext: Add reenq_flags plumbing to
 scx_bpf_dsq_reenq()

Add infrastructure to pass flags through the deferred reenqueue path.
reenq_local() now takes a reenq_flags parameter, and scx_sched_pcpu gains a
deferred_reenq_local_flags field to accumulate flags from multiple
scx_bpf_dsq_reenq() calls before processing. No flags are defined yet.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 33 ++++++++++++++++++++++++++++-----
 kernel/sched/ext_internal.h | 10 ++++++++++
 2 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index efcf7ef72a3e..d8ea12ddc206 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1080,7 +1080,8 @@ static void schedule_deferred_locked(struct rq *rq)
 	schedule_deferred(rq);
 }
 
-static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq)
+static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
+			       u64 reenq_flags)
 {
 	/*
 	 * Allowing reenqueues doesn't make sense while bypassing. This also
@@ -1097,6 +1098,7 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
 		scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) {
 			if (list_empty(&drl->node))
 				list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
+			drl->flags |= reenq_flags;
 		}
 
 		schedule_deferred(rq);
@@ -3617,7 +3619,14 @@ static void process_ddsp_deferred_locals(struct rq *rq)
 	}
 }
 
-static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
+static bool task_should_reenq(struct task_struct *p, u64 reenq_flags)
+{
+	if (reenq_flags & SCX_REENQ_ANY)
+		return true;
+	return false;
+}
+
+static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
 {
 	LIST_HEAD(tasks);
 	u32 nr_enqueued = 0;
@@ -3651,6 +3660,9 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq)
 		if (!scx_is_descendant(task_sch, sch))
 			continue;
 
+		if (!task_should_reenq(p, reenq_flags))
+			continue;
+
 		dispatch_dequeue(rq, p);
 		list_add_tail(&p->scx.dsq_list.node, &tasks);
 	}
@@ -3670,6 +3682,7 @@ static void process_deferred_reenq_locals(struct rq *rq)
 
 	while (true) {
 		struct scx_sched *sch;
+		u64 reenq_flags = 0;
 
 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
 			struct scx_deferred_reenq_local *drl =
@@ -3684,10 +3697,11 @@ static void process_deferred_reenq_locals(struct rq *rq)
 			sch_pcpu = container_of(drl, struct scx_sched_pcpu,
 						deferred_reenq_local);
 			sch = sch_pcpu->sch;
+			swap(drl->flags, reenq_flags);
 			list_del_init(&drl->node);
 		}
 
-		reenq_local(sch, rq);
+		reenq_local(sch, rq, reenq_flags);
 	}
 }
 
@@ -7816,7 +7830,7 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
 	rq = cpu_rq(smp_processor_id());
 	lockdep_assert_rq_held(rq);
 
-	return reenq_local(sch, rq);
+	return reenq_local(sch, rq, 0);
 }
 
 __bpf_kfunc_end_defs();
@@ -8254,8 +8268,17 @@ __bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags,
 	if (unlikely(!sch))
 		return;
 
+	if (unlikely(reenq_flags & ~__SCX_REENQ_USER_MASK)) {
+		scx_error(sch, "invalid SCX_REENQ flags 0x%llx", reenq_flags);
+		return;
+	}
+
+	/* not specifying any filter bits is the same as %SCX_REENQ_ANY */
+	if (!(reenq_flags & __SCX_REENQ_FILTER_MASK))
+		reenq_flags |= SCX_REENQ_ANY;
+
 	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id());
-	schedule_dsq_reenq(sch, dsq);
+	schedule_dsq_reenq(sch, dsq, reenq_flags);
 }
 
 /**
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 1a8d61097cab..d9eda2e8701c 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -956,6 +956,7 @@ struct scx_dsp_ctx {
 
 struct scx_deferred_reenq_local {
 	struct list_head	node;
+	u64			flags;
 };
 
 struct scx_sched_pcpu {
@@ -1128,6 +1129,15 @@ enum scx_deq_flags {
 	SCX_DEQ_SCHED_CHANGE	= 1LLU << 33,
 };
 
+enum scx_reenq_flags {
+	/* low 16bits determine which tasks should be reenqueued */
+	SCX_REENQ_ANY		= 1LLU << 0,	/* all tasks */
+
+	__SCX_REENQ_FILTER_MASK	= 0xffffLLU,
+
+	__SCX_REENQ_USER_MASK	= SCX_REENQ_ANY,
+};
+
 enum scx_pick_idle_cpu_flags {
 	SCX_PICK_IDLE_CORE	= 1LLU << 0,	/* pick a CPU whose SMT siblings are also idle */
 	SCX_PICK_IDLE_IN_NODE	= 1LLU << 1,	/* pick a CPU in the same target NUMA node */

From 30b0515342db48ac9ffd9999648de0f7ca1d6a87 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:50 -1000
Subject: [PATCH 049/134] sched_ext: Add per-CPU data to DSQs

Add per-CPU data structure to dispatch queues. Each DSQ now has a percpu
scx_dsq_pcpu which contains a back-pointer to the DSQ. This will be used by
future changes to implement per-CPU reenqueue tracking for user DSQs.

init_dsq() now allocates the percpu data and can fail, so it returns an
error code. All callers are updated to handle failures. exit_dsq() is added
to free the percpu data and is called from all DSQ cleanup paths.

In scx_bpf_create_dsq(), init_dsq() is called before rcu_read_lock() since
alloc_percpu() requires GFP_KERNEL context, and dsq->sched is set
afterwards.

v2: Fix err_free_pcpu to only exit_dsq() initialized bypass DSQs (Andrea
    Righi).

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/ext.h |  5 +++
 kernel/sched/ext.c        | 87 ++++++++++++++++++++++++++++++++-------
 2 files changed, 77 insertions(+), 15 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index f354d7d34306..98cc1f41b91e 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -62,6 +62,10 @@ enum scx_dsq_id_flags {
 	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
 };
 
+struct scx_dsq_pcpu {
+	struct scx_dispatch_q	*dsq;
+};
+
 /*
  * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered
  * queue. A built-in DSQ is always a FIFO. The built-in local DSQs are used to
@@ -79,6 +83,7 @@ struct scx_dispatch_q {
 	struct rhash_head	hash_node;
 	struct llist_node	free_node;
 	struct scx_sched	*sched;
+	struct scx_dsq_pcpu __percpu *pcpu;
 	struct rcu_head		rcu;
 };
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d8ea12ddc206..aea09eb36873 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4020,15 +4020,42 @@ DEFINE_SCHED_CLASS(ext) = {
 #endif
 };
 
-static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
-		     struct scx_sched *sch)
+static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
+		    struct scx_sched *sch)
 {
+	s32 cpu;
+
 	memset(dsq, 0, sizeof(*dsq));
 
 	raw_spin_lock_init(&dsq->lock);
 	INIT_LIST_HEAD(&dsq->list);
 	dsq->id = dsq_id;
 	dsq->sched = sch;
+
+	dsq->pcpu = alloc_percpu(struct scx_dsq_pcpu);
+	if (!dsq->pcpu)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
+
+		pcpu->dsq = dsq;
+	}
+
+	return 0;
+}
+
+static void exit_dsq(struct scx_dispatch_q *dsq)
+{
+	free_percpu(dsq->pcpu);
+}
+
+static void free_dsq_rcufn(struct rcu_head *rcu)
+{
+	struct scx_dispatch_q *dsq = container_of(rcu, struct scx_dispatch_q, rcu);
+
+	exit_dsq(dsq);
+	kfree(dsq);
 }
 
 static void free_dsq_irq_workfn(struct irq_work *irq_work)
@@ -4037,7 +4064,7 @@ static void free_dsq_irq_workfn(struct irq_work *irq_work)
 	struct scx_dispatch_q *dsq, *tmp_dsq;
 
 	llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
-		kfree_rcu(dsq, rcu);
+		call_rcu(&dsq->rcu, free_dsq_rcufn);
 }
 
 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
@@ -4234,15 +4261,17 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 		cgroup_put(sch_cgroup(sch));
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
-	/*
-	 * $sch would have entered bypass mode before the RCU grace period. As
-	 * that blocks new deferrals, all deferred_reenq_local_node's must be
-	 * off-list by now.
-	 */
 	for_each_possible_cpu(cpu) {
 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 
+		/*
+		 * $sch would have entered bypass mode before the RCU grace
+		 * period. As that blocks new deferrals, all
+		 * deferred_reenq_local_node's must be off-list by now.
+		 */
 		WARN_ON_ONCE(!list_empty(&pcpu->deferred_reenq_local.node));
+
+		exit_dsq(bypass_dsq(sch, cpu));
 	}
 
 	free_percpu(sch->pcpu);
@@ -5787,6 +5816,9 @@ static int alloc_kick_syncs(void)
 
 static void free_pnode(struct scx_sched_pnode *pnode)
 {
+	if (!pnode)
+		return;
+	exit_dsq(&pnode->global_dsq);
 	kfree(pnode);
 }
 
@@ -5798,7 +5830,10 @@ static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node)
 	if (!pnode)
 		return NULL;
 
-	init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch);
+	if (init_dsq(&pnode->global_dsq, SCX_DSQ_GLOBAL, sch)) {
+		kfree(pnode);
+		return NULL;
+	}
 
 	return pnode;
 }
@@ -5809,7 +5844,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 {
 	struct scx_sched *sch;
 	s32 level = parent ? parent->level + 1 : 0;
-	s32 node, cpu, ret;
+	s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;
 
 	sch = kzalloc_flex(*sch, ancestors, level);
 	if (!sch)
@@ -5848,8 +5883,13 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 		goto err_free_pnode;
 	}
 
-	for_each_possible_cpu(cpu)
-		init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
+	for_each_possible_cpu(cpu) {
+		ret = init_dsq(bypass_dsq(sch, cpu), SCX_DSQ_BYPASS, sch);
+		if (ret) {
+			bypass_fail_cpu = cpu;
+			goto err_free_pcpu;
+		}
+	}
 
 	for_each_possible_cpu(cpu) {
 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
@@ -5931,6 +5971,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 err_stop_helper:
 	kthread_destroy_worker(sch->helper);
 err_free_pcpu:
+	for_each_possible_cpu(cpu) {
+		if (cpu == bypass_fail_cpu)
+			break;
+		exit_dsq(bypass_dsq(sch, cpu));
+	}
 	free_percpu(sch->pcpu);
 err_free_pnode:
 	for_each_node_state(node, N_POSSIBLE)
@@ -7173,7 +7218,7 @@ void __init init_sched_ext_class(void)
 		int  n = cpu_to_node(cpu);
 
 		/* local_dsq's sch will be set during scx_root_enable() */
-		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL);
+		BUG_ON(init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL, NULL));
 
 		INIT_LIST_HEAD(&rq->scx.runnable_list);
 		INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
@@ -7872,11 +7917,21 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_a
 	if (!dsq)
 		return -ENOMEM;
 
+	/*
+	 * init_dsq() must be called in GFP_KERNEL context. Init it with NULL
+	 * @sch and update afterwards.
+	 */
+	ret = init_dsq(dsq, dsq_id, NULL);
+	if (ret) {
+		kfree(dsq);
+		return ret;
+	}
+
 	rcu_read_lock();
 
 	sch = scx_prog_sched(aux);
 	if (sch) {
-		init_dsq(dsq, dsq_id, sch);
+		dsq->sched = sch;
 		ret = rhashtable_lookup_insert_fast(&sch->dsq_hash, &dsq->hash_node,
 						    dsq_hash_params);
 	} else {
@@ -7884,8 +7939,10 @@ __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node, const struct bpf_prog_a
 	}
 
 	rcu_read_unlock();
-	if (ret)
+	if (ret) {
+		exit_dsq(dsq);
 		kfree(dsq);
+	}
 	return ret;
 }
 

From 35250720d6ed1e83e0d1e12b7e8bf7b8316d7d58 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:50 -1000
Subject: [PATCH 050/134] sched_ext: Factor out nldsq_cursor_next_task() and
 nldsq_cursor_lost_task()

Factor out cursor-based DSQ iteration from bpf_iter_scx_dsq_next() into
nldsq_cursor_next_task() and the task-lost check from scx_dsq_move() into
nldsq_cursor_lost_task() to prepare for reuse.

As ->priv is only used to record dsq->seq for cursors, update
INIT_DSQ_LIST_CURSOR() to take the DSQ pointer and set ->priv from dsq->seq
so that users don't have to read it manually. Move scx_dsq_iter_flags enum
earlier so nldsq_cursor_next_task() can use SCX_DSQ_ITER_REV.

bypass_lb_cpu() now sets cursor.priv to dsq->seq but doesn't use it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/ext.h |   6 +-
 kernel/sched/ext.c        | 154 ++++++++++++++++++++++++--------------
 2 files changed, 102 insertions(+), 58 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 98cc1f41b91e..303f57dfb947 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -157,11 +157,11 @@ struct scx_dsq_list_node {
 	u32			priv;		/* can be used by iter cursor */
 };
 
-#define INIT_DSQ_LIST_CURSOR(__node, __flags, __priv)				\
+#define INIT_DSQ_LIST_CURSOR(__cursor, __dsq, __flags)				\
 	(struct scx_dsq_list_node) {						\
-		.node = LIST_HEAD_INIT((__node).node),				\
+		.node = LIST_HEAD_INIT((__cursor).node),			\
 		.flags = SCX_DSQ_LNODE_ITER_CURSOR | (__flags),			\
-		.priv = (__priv),						\
+		.priv = READ_ONCE((__dsq)->seq),				\
 	}
 
 struct scx_sched;
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index aea09eb36873..f51e4c20cd95 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -570,9 +570,22 @@ static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
 	return true;
 }
 
+enum scx_dsq_iter_flags {
+	/* iterate in the reverse dispatch order */
+	SCX_DSQ_ITER_REV		= 1U << 16,
+
+	__SCX_DSQ_ITER_HAS_SLICE	= 1U << 30,
+	__SCX_DSQ_ITER_HAS_VTIME	= 1U << 31,
+
+	__SCX_DSQ_ITER_USER_FLAGS	= SCX_DSQ_ITER_REV,
+	__SCX_DSQ_ITER_ALL_FLAGS	= __SCX_DSQ_ITER_USER_FLAGS |
+					  __SCX_DSQ_ITER_HAS_SLICE |
+					  __SCX_DSQ_ITER_HAS_VTIME,
+};
+
 /**
  * nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being iterated
+ * @dsq: non-local dsq being iterated
  * @cur: current position, %NULL to start iteration
  * @rev: walk backwards
  *
@@ -612,6 +625,85 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
 	for ((p) = nldsq_next_task((dsq), NULL, false); (p);			\
 	     (p) = nldsq_next_task((dsq), (p), false))
 
+/**
+ * nldsq_cursor_next_task - Iterate to the next task given a cursor in a non-local DSQ
+ * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
+ * @dsq: non-local dsq being iterated
+ *
+ * Find the next task in a cursor based iteration. The caller must have
+ * initialized @cursor using INIT_DSQ_LIST_CURSOR() and can release the DSQ lock
+ * between the iteration steps.
+ *
+ * Only tasks which were queued before @cursor was initialized are visible. This
+ * bounds the iteration and guarantees that vtime never jumps in the other
+ * direction while iterating.
+ */
+static struct task_struct *nldsq_cursor_next_task(struct scx_dsq_list_node *cursor,
+						  struct scx_dispatch_q *dsq)
+{
+	bool rev = cursor->flags & SCX_DSQ_ITER_REV;
+	struct task_struct *p;
+
+	lockdep_assert_held(&dsq->lock);
+	BUG_ON(!(cursor->flags & SCX_DSQ_LNODE_ITER_CURSOR));
+
+	if (list_empty(&cursor->node))
+		p = NULL;
+	else
+		p = container_of(cursor, struct task_struct, scx.dsq_list);
+
+	/* skip cursors and tasks that were queued after @cursor init */
+	do {
+		p = nldsq_next_task(dsq, p, rev);
+	} while (p && unlikely(u32_before(cursor->priv, p->scx.dsq_seq)));
+
+	if (p) {
+		if (rev)
+			list_move_tail(&cursor->node, &p->scx.dsq_list.node);
+		else
+			list_move(&cursor->node, &p->scx.dsq_list.node);
+	} else {
+		list_del_init(&cursor->node);
+	}
+
+	return p;
+}
+
+/**
+ * nldsq_cursor_lost_task - Test whether someone else took the task since iteration
+ * @cursor: scx_dsq_list_node initialized with INIT_DSQ_LIST_CURSOR()
+ * @rq: rq @p was on
+ * @dsq: dsq @p was on
+ * @p: target task
+ *
+ * @p is a task returned by nldsq_cursor_next_task(). The locks may have been
+ * dropped and re-acquired inbetween. Verify that no one else took or is in the
+ * process of taking @p from @dsq.
+ *
+ * On %false return, the caller can assume full ownership of @p.
+ */
+static bool nldsq_cursor_lost_task(struct scx_dsq_list_node *cursor,
+				   struct rq *rq, struct scx_dispatch_q *dsq,
+				   struct task_struct *p)
+{
+	lockdep_assert_rq_held(rq);
+	lockdep_assert_held(&dsq->lock);
+
+	/*
+	 * @p could have already left $src_dsq, got re-enqueud, or be in the
+	 * process of being consumed by someone else.
+	 */
+	if (unlikely(p->scx.dsq != dsq ||
+		     u32_before(cursor->priv, p->scx.dsq_seq) ||
+		     p->scx.holding_cpu >= 0))
+		return true;
+
+	/* if @p has stayed on @dsq, its rq couldn't have changed */
+	if (WARN_ON_ONCE(rq != task_rq(p)))
+		return true;
+
+	return false;
+}
 
 /*
  * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse]
@@ -619,19 +711,6 @@ static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq,
  * changes without breaking backward compatibility. Can be used with
  * bpf_for_each(). See bpf_iter_scx_dsq_*().
  */
-enum scx_dsq_iter_flags {
-	/* iterate in the reverse dispatch order */
-	SCX_DSQ_ITER_REV		= 1U << 16,
-
-	__SCX_DSQ_ITER_HAS_SLICE	= 1U << 30,
-	__SCX_DSQ_ITER_HAS_VTIME	= 1U << 31,
-
-	__SCX_DSQ_ITER_USER_FLAGS	= SCX_DSQ_ITER_REV,
-	__SCX_DSQ_ITER_ALL_FLAGS	= __SCX_DSQ_ITER_USER_FLAGS |
-					  __SCX_DSQ_ITER_HAS_SLICE |
-					  __SCX_DSQ_ITER_HAS_VTIME,
-};
-
 struct bpf_iter_scx_dsq_kern {
 	struct scx_dsq_list_node	cursor;
 	struct scx_dispatch_q		*dsq;
@@ -4497,7 +4576,7 @@ static u32 bypass_lb_cpu(struct scx_sched *sch, s32 donor,
 	struct rq *donor_rq = cpu_rq(donor);
 	struct scx_dispatch_q *donor_dsq = bypass_dsq(sch, donor);
 	struct task_struct *p, *n;
-	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, donor_dsq, 0);
 	s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
 	u32 nr_balanced = 0, min_delta_us;
 
@@ -7542,14 +7621,8 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	locked_rq = src_rq;
 	raw_spin_lock(&src_dsq->lock);
 
-	/*
-	 * Did someone else get to it? @p could have already left $src_dsq, got
-	 * re-enqueud, or be in the process of being consumed by someone else.
-	 */
-	if (unlikely(p->scx.dsq != src_dsq ||
-		     u32_before(kit->cursor.priv, p->scx.dsq_seq) ||
-		     p->scx.holding_cpu >= 0) ||
-	    WARN_ON_ONCE(src_rq != task_rq(p))) {
+	/* did someone else get to it while we dropped the locks? */
+	if (nldsq_cursor_lost_task(&kit->cursor, src_rq, src_dsq, p)) {
 		raw_spin_unlock(&src_dsq->lock);
 		goto out;
 	}
@@ -8188,8 +8261,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
 	if (!kit->dsq)
 		return -ENOENT;
 
-	kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, flags,
-					   READ_ONCE(kit->dsq->seq));
+	kit->cursor = INIT_DSQ_LIST_CURSOR(kit->cursor, kit->dsq, flags);
 
 	return 0;
 }
@@ -8203,41 +8275,13 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
 __bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it)
 {
 	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
-	bool rev = kit->cursor.flags & SCX_DSQ_ITER_REV;
-	struct task_struct *p;
-	unsigned long flags;
 
 	if (!kit->dsq)
 		return NULL;
 
-	raw_spin_lock_irqsave(&kit->dsq->lock, flags);
+	guard(raw_spinlock_irqsave)(&kit->dsq->lock);
 
-	if (list_empty(&kit->cursor.node))
-		p = NULL;
-	else
-		p = container_of(&kit->cursor, struct task_struct, scx.dsq_list);
-
-	/*
-	 * Only tasks which were queued before the iteration started are
-	 * visible. This bounds BPF iterations and guarantees that vtime never
-	 * jumps in the other direction while iterating.
-	 */
-	do {
-		p = nldsq_next_task(kit->dsq, p, rev);
-	} while (p && unlikely(u32_before(kit->cursor.priv, p->scx.dsq_seq)));
-
-	if (p) {
-		if (rev)
-			list_move_tail(&kit->cursor.node, &p->scx.dsq_list.node);
-		else
-			list_move(&kit->cursor.node, &p->scx.dsq_list.node);
-	} else {
-		list_del_init(&kit->cursor.node);
-	}
-
-	raw_spin_unlock_irqrestore(&kit->dsq->lock, flags);
-
-	return p;
+	return nldsq_cursor_next_task(&kit->cursor, kit->dsq);
 }
 
 /**

From 84b1a0ea0b7c23dec240783a592e480780efe459 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:50 -1000
Subject: [PATCH 051/134] sched_ext: Implement scx_bpf_dsq_reenq() for user
 DSQs

scx_bpf_dsq_reenq() currently only supports local DSQs. Extend it to support
user-defined DSQs by adding a deferred re-enqueue mechanism similar to the
local DSQ handling.

Add per-cpu deferred_reenq_user_node/flags to scx_dsq_pcpu and
deferred_reenq_users list to scx_rq. When scx_bpf_dsq_reenq() is called on a
user DSQ, the DSQ's per-cpu node is added to the current rq's deferred list.
process_deferred_reenq_users() then iterates the DSQ using the cursor helpers
and re-enqueues each task.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/ext.h      |   6 ++
 kernel/sched/ext.c             | 128 +++++++++++++++++++++++++++++++++
 kernel/sched/sched.h           |   1 +
 tools/sched_ext/scx_qmap.bpf.c |  57 ++++++++++++++-
 4 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 303f57dfb947..e77504faa0bc 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -62,8 +62,14 @@ enum scx_dsq_id_flags {
 	SCX_DSQ_LOCAL_CPU_MASK	= 0xffffffffLLU,
 };
 
+struct scx_deferred_reenq_user {
+	struct list_head	node;
+	u64			flags;
+};
+
 struct scx_dsq_pcpu {
 	struct scx_dispatch_q	*dsq;
+	struct scx_deferred_reenq_user deferred_reenq_user;
 };
 
 /*
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f51e4c20cd95..805c6689c99a 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1180,6 +1180,18 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
 			drl->flags |= reenq_flags;
 		}
 
+		schedule_deferred(rq);
+	} else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
+		struct rq *rq = this_rq();
+		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
+		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;
+
+		scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) {
+			if (list_empty(&dru->node))
+				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
+			dru->flags |= reenq_flags;
+		}
+
 		schedule_deferred(rq);
 	} else {
 		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
@@ -3784,12 +3796,108 @@ static void process_deferred_reenq_locals(struct rq *rq)
 	}
 }
 
+static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
+{
+	struct rq *locked_rq = rq;
+	struct scx_sched *sch = dsq->sched;
+	struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, dsq, 0);
+	struct task_struct *p;
+	s32 nr_enqueued = 0;
+
+	lockdep_assert_rq_held(rq);
+
+	raw_spin_lock(&dsq->lock);
+
+	while (likely(!READ_ONCE(sch->bypass_depth))) {
+		struct rq *task_rq;
+
+		p = nldsq_cursor_next_task(&cursor, dsq);
+		if (!p)
+			break;
+
+		if (!task_should_reenq(p, reenq_flags))
+			continue;
+
+		task_rq = task_rq(p);
+
+		if (locked_rq != task_rq) {
+			if (locked_rq)
+				raw_spin_rq_unlock(locked_rq);
+			if (unlikely(!raw_spin_rq_trylock(task_rq))) {
+				raw_spin_unlock(&dsq->lock);
+				raw_spin_rq_lock(task_rq);
+				raw_spin_lock(&dsq->lock);
+			}
+			locked_rq = task_rq;
+
+			/* did we lose @p while switching locks? */
+			if (nldsq_cursor_lost_task(&cursor, task_rq, dsq, p))
+				continue;
+		}
+
+		/* @p is on @dsq, its rq and @dsq are locked */
+		dispatch_dequeue_locked(p, dsq);
+		raw_spin_unlock(&dsq->lock);
+		do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
+
+		if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
+			raw_spin_rq_unlock(locked_rq);
+			locked_rq = NULL;
+			cpu_relax();
+		}
+
+		raw_spin_lock(&dsq->lock);
+	}
+
+	list_del_init(&cursor.node);
+	raw_spin_unlock(&dsq->lock);
+
+	if (locked_rq != rq) {
+		if (locked_rq)
+			raw_spin_rq_unlock(locked_rq);
+		raw_spin_rq_lock(rq);
+	}
+}
+
+static void process_deferred_reenq_users(struct rq *rq)
+{
+	lockdep_assert_rq_held(rq);
+
+	while (true) {
+		struct scx_dispatch_q *dsq;
+		u64 reenq_flags = 0;
+
+		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
+			struct scx_deferred_reenq_user *dru =
+				list_first_entry_or_null(&rq->scx.deferred_reenq_users,
+							 struct scx_deferred_reenq_user,
+							 node);
+			struct scx_dsq_pcpu *dsq_pcpu;
+
+			if (!dru)
+				return;
+
+			dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
+						deferred_reenq_user);
+			dsq = dsq_pcpu->dsq;
+			swap(dru->flags, reenq_flags);
+			list_del_init(&dru->node);
+		}
+
+		BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
+		reenq_user(rq, dsq, reenq_flags);
+	}
+}
+
 static void run_deferred(struct rq *rq)
 {
 	process_ddsp_deferred_locals(rq);
 
 	if (!list_empty(&rq->scx.deferred_reenq_locals))
 		process_deferred_reenq_locals(rq);
+
+	if (!list_empty(&rq->scx.deferred_reenq_users))
+		process_deferred_reenq_users(rq);
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -4119,6 +4227,7 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
 		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
 
 		pcpu->dsq = dsq;
+		INIT_LIST_HEAD(&pcpu->deferred_reenq_user.node);
 	}
 
 	return 0;
@@ -4126,6 +4235,23 @@ static s32 init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id,
 
 static void exit_dsq(struct scx_dispatch_q *dsq)
 {
+	s32 cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_dsq_pcpu *pcpu = per_cpu_ptr(dsq->pcpu, cpu);
+		struct scx_deferred_reenq_user *dru = &pcpu->deferred_reenq_user;
+		struct rq *rq = cpu_rq(cpu);
+
+		/*
+		 * There must have been a RCU grace period since the last
+		 * insertion and @dsq should be off the deferred list by now.
+		 */
+		if (WARN_ON_ONCE(!list_empty(&dru->node))) {
+			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
+			list_del_init(&dru->node);
+		}
+	}
+
 	free_percpu(dsq->pcpu);
 }
 
@@ -7308,6 +7434,7 @@ void __init init_sched_ext_class(void)
 		BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
 		raw_spin_lock_init(&rq->scx.deferred_reenq_lock);
 		INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals);
+		INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);
 		rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn);
 		rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn);
 
@@ -8354,6 +8481,7 @@ __bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id,
  * supported:
  *
  * - Local DSQs (%SCX_DSQ_LOCAL or %SCX_DSQ_LOCAL_ON | $cpu)
+ * - User DSQs
  *
  * Re-enqueues are performed asynchronously. Can be called from anywhere.
  */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 0794852524e7..893f89ce2a77 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -810,6 +810,7 @@ struct scx_rq {
 
 	raw_spinlock_t		deferred_reenq_lock;
 	struct list_head	deferred_reenq_locals;	/* scheds requesting reenq of local DSQ */
+	struct list_head	deferred_reenq_users;	/* user DSQs requesting reenq */
 	struct balance_callback	deferred_bal_cb;
 	struct irq_work		deferred_irq_work;
 	struct irq_work		kick_cpus_irq_work;
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 83e8289e8c0c..a4a1b84fe359 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -26,8 +26,11 @@
 
 enum consts {
 	ONE_SEC_IN_NS		= 1000000000,
+	ONE_MSEC_IN_NS		= 1000000,
+	LOWPRI_INTV_NS		= 10 * ONE_MSEC_IN_NS,
 	SHARED_DSQ		= 0,
 	HIGHPRI_DSQ		= 1,
+	LOWPRI_DSQ		= 2,
 	HIGHPRI_WEIGHT		= 8668,		/* this is what -20 maps to */
 };
 
@@ -172,6 +175,9 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
 	if (!(tctx = lookup_task_ctx(p)))
 		return -ESRCH;
 
+	if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
+		return prev_cpu;
+
 	cpu = pick_direct_dispatch_cpu(p, prev_cpu);
 
 	if (cpu >= 0) {
@@ -242,6 +248,13 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 		return;
 	}
 
+	/* see lowpri_timerfn() */
+	if (__COMPAT_has_generic_reenq() &&
+	    p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
+		scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
+		return;
+	}
+
 	/* if select_cpu() wasn't called, try direct dispatch */
 	if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
 	    (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
@@ -873,6 +886,28 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
 	return 0;
 }
 
+struct lowpri_timer {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, struct lowpri_timer);
+} lowpri_timer SEC(".maps");
+
+/*
+ * Nice 19 tasks are put into the lowpri DSQ. Every 10ms, reenq is triggered and
+ * the tasks are transferred to SHARED_DSQ.
+ */
+static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+	scx_bpf_dsq_reenq(LOWPRI_DSQ, 0);
+	bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
+	return 0;
+}
+
 s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 {
 	u32 key = 0;
@@ -894,14 +929,32 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
 		return ret;
 	}
 
+	ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1);
+	if (ret)
+		return ret;
+
 	timer = bpf_map_lookup_elem(&monitor_timer, &key);
 	if (!timer)
 		return -ESRCH;
-
 	bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
 	bpf_timer_set_callback(timer, monitor_timerfn);
+	ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+	if (ret)
+		return ret;
 
-	return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+	if (__COMPAT_has_generic_reenq()) {
+		/* see lowpri_timerfn() */
+		timer = bpf_map_lookup_elem(&lowpri_timer, &key);
+		if (!timer)
+			return -ESRCH;
+		bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
+		bpf_timer_set_callback(timer, lowpri_timerfn);
+		ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
 }
 
 void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)

From a90449b126824b796e9aeefc2b009e57f38af168 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:50 -1000
Subject: [PATCH 052/134] sched_ext: Optimize schedule_dsq_reenq() with
 lockless fast path

schedule_dsq_reenq() always acquires deferred_reenq_lock to queue a reenqueue
request. Add a lockless fast-path to skip lock acquisition when the request is
already pending with the required flags set.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 44 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 805c6689c99a..ee756f1a70e1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1174,10 +1174,20 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
 		struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
 		struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;
 
-		scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) {
+		/*
+		 * Pairs with smp_mb() in process_deferred_reenq_locals() and
+		 * guarantees that there is a reenq_local() afterwards.
+		 */
+		smp_mb();
+
+		if (list_empty(&drl->node) ||
+		    (READ_ONCE(drl->flags) & reenq_flags) != reenq_flags) {
+
+			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
+
 			if (list_empty(&drl->node))
 				list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
-			drl->flags |= reenq_flags;
+			WRITE_ONCE(drl->flags, drl->flags | reenq_flags);
 		}
 
 		schedule_deferred(rq);
@@ -1186,10 +1196,20 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
 		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
 		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;
 
-		scoped_guard (raw_spinlock_irqsave, &rq->scx.deferred_reenq_lock) {
+		/*
+		 * Pairs with smp_mb() in process_deferred_reenq_users() and
+		 * guarantees that there is a reenq_user() afterwards.
+		 */
+		smp_mb();
+
+		if (list_empty(&dru->node) ||
+		    (READ_ONCE(dru->flags) & reenq_flags) != reenq_flags) {
+
+			guard(raw_spinlock_irqsave)(&rq->scx.deferred_reenq_lock);
+
 			if (list_empty(&dru->node))
 				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
-			dru->flags |= reenq_flags;
+			WRITE_ONCE(dru->flags, dru->flags | reenq_flags);
 		}
 
 		schedule_deferred(rq);
@@ -3773,7 +3793,7 @@ static void process_deferred_reenq_locals(struct rq *rq)
 
 	while (true) {
 		struct scx_sched *sch;
-		u64 reenq_flags = 0;
+		u64 reenq_flags;
 
 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
 			struct scx_deferred_reenq_local *drl =
@@ -3788,10 +3808,14 @@ static void process_deferred_reenq_locals(struct rq *rq)
 			sch_pcpu = container_of(drl, struct scx_sched_pcpu,
 						deferred_reenq_local);
 			sch = sch_pcpu->sch;
-			swap(drl->flags, reenq_flags);
+			reenq_flags = drl->flags;
+			WRITE_ONCE(drl->flags, 0);
 			list_del_init(&drl->node);
 		}
 
+		/* see schedule_dsq_reenq() */
+		smp_mb();
+
 		reenq_local(sch, rq, reenq_flags);
 	}
 }
@@ -3865,7 +3889,7 @@ static void process_deferred_reenq_users(struct rq *rq)
 
 	while (true) {
 		struct scx_dispatch_q *dsq;
-		u64 reenq_flags = 0;
+		u64 reenq_flags;
 
 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
 			struct scx_deferred_reenq_user *dru =
@@ -3880,10 +3904,14 @@ static void process_deferred_reenq_users(struct rq *rq)
 			dsq_pcpu = container_of(dru, struct scx_dsq_pcpu,
 						deferred_reenq_user);
 			dsq = dsq_pcpu->dsq;
-			swap(dru->flags, reenq_flags);
+			reenq_flags = dru->flags;
+			WRITE_ONCE(dru->flags, 0);
 			list_del_init(&dru->node);
 		}
 
+		/* see schedule_dsq_reenq() */
+		smp_mb();
+
 		BUG_ON(dsq->id & SCX_DSQ_FLAG_BUILTIN);
 		reenq_user(rq, dsq, reenq_flags);
 	}

From 7203d77d6e04f83f7b78838eed099d9cac31700b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:50 -1000
Subject: [PATCH 053/134] sched_ext: Simplify task state handling

Task states (NONE, INIT, READY, ENABLED) were defined in a separate enum with
unshifted values and then shifted when stored in scx_entity.flags. Simplify by
defining them as pre-shifted values directly in scx_ent_flags and removing the
separate scx_task_state enum. This removes the need for shifting when
reading/writing state values.

scx_get_task_state() now returns the masked flags value directly.
scx_set_task_state() accepts the pre-shifted state value. scx_dump_task()
shifts down for display to maintain readable output.

No functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/ext.h | 28 ++++++++++++++++------------
 kernel/sched/ext.c        | 19 +++++++++----------
 2 files changed, 25 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index e77504faa0bc..e822b374b17f 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -93,7 +93,7 @@ struct scx_dispatch_q {
 	struct rcu_head		rcu;
 };
 
-/* scx_entity.flags */
+/* sched_ext_entity.flags */
 enum scx_ent_flags {
 	SCX_TASK_QUEUED		= 1 << 0, /* on ext runqueue */
 	SCX_TASK_IN_CUSTODY	= 1 << 1, /* in custody, needs ops.dequeue() when leaving */
@@ -101,21 +101,25 @@ enum scx_ent_flags {
 	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
 	SCX_TASK_SUB_INIT	= 1 << 4, /* task being initialized for a sub sched */
 
-	SCX_TASK_STATE_SHIFT	= 8,	  /* bit 8 and 9 are used to carry scx_task_state */
+	/*
+	 * Bits 8 and 9 are used to carry task state:
+	 *
+	 * NONE		ops.init_task() not called yet
+	 * INIT		ops.init_task() succeeded, but task can be cancelled
+	 * READY	fully initialized, but not in sched_ext
+	 * ENABLED	fully initialized and in sched_ext
+	 */
+	SCX_TASK_STATE_SHIFT	= 8,	  /* bits 8 and 9 are used to carry task state */
 	SCX_TASK_STATE_BITS	= 2,
 	SCX_TASK_STATE_MASK	= ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,
 
-	SCX_TASK_CURSOR		= 1 << 31, /* iteration cursor, not a task */
-};
+	SCX_TASK_NONE		= 0 << SCX_TASK_STATE_SHIFT,
+	SCX_TASK_INIT		= 1 << SCX_TASK_STATE_SHIFT,
+	SCX_TASK_READY		= 2 << SCX_TASK_STATE_SHIFT,
+	SCX_TASK_ENABLED	= 3 << SCX_TASK_STATE_SHIFT,
 
-/* scx_entity.flags & SCX_TASK_STATE_MASK */
-enum scx_task_state {
-	SCX_TASK_NONE,		/* ops.init_task() not called yet */
-	SCX_TASK_INIT,		/* ops.init_task() succeeded, but task can be cancelled */
-	SCX_TASK_READY,		/* fully initialized, but not in sched_ext */
-	SCX_TASK_ENABLED,	/* fully initialized and in sched_ext */
-
-	SCX_TASK_NR_STATES,
+	/* iteration cursor, not a task */
+	SCX_TASK_CURSOR		= 1 << 31,
 };
 
 /* scx_entity.dsq_flags */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ee756f1a70e1..f55e1603fc8c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3284,18 +3284,16 @@ static struct cgroup *tg_cgrp(struct task_group *tg)
 
 #endif	/* CONFIG_EXT_GROUP_SCHED */
 
-static enum scx_task_state scx_get_task_state(const struct task_struct *p)
+static u32 scx_get_task_state(const struct task_struct *p)
 {
-	return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
+	return p->scx.flags & SCX_TASK_STATE_MASK;
 }
 
-static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
+static void scx_set_task_state(struct task_struct *p, u32 state)
 {
-	enum scx_task_state prev_state = scx_get_task_state(p);
+	u32 prev_state = scx_get_task_state(p);
 	bool warn = false;
 
-	BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS));
-
 	switch (state) {
 	case SCX_TASK_NONE:
 		break;
@@ -3313,11 +3311,11 @@ static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
 		return;
 	}
 
-	WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
+	WARN_ONCE(warn, "sched_ext: Invalid task state transition 0x%x -> 0x%x for %s[%d]",
 		  prev_state, state, p->comm, p->pid);
 
 	p->scx.flags &= ~SCX_TASK_STATE_MASK;
-	p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
+	p->scx.flags |= state;
 }
 
 static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fork)
@@ -5794,7 +5792,8 @@ static void scx_dump_task(struct scx_sched *sch,
 		  own_marker, sch_id_buf,
 		  jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
 	dump_line(s, "      scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu",
-		  scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
+		  scx_get_task_state(p) >> SCX_TASK_STATE_SHIFT,
+		  p->scx.flags & ~SCX_TASK_STATE_MASK,
 		  p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
 		  ops_state >> SCX_OPSS_QSEQ_SHIFT);
 	dump_line(s, "      sticky/holding_cpu=%d/%d dsq_id=%s",
@@ -6558,7 +6557,7 @@ static struct scx_sched *find_parent_sched(struct cgroup *cgrp)
 
 static bool assert_task_ready_or_enabled(struct task_struct *p)
 {
-	enum scx_task_state state = scx_get_task_state(p);
+	u32 state = scx_get_task_state(p);
 
 	switch (state) {
 	case SCX_TASK_READY:

From ce897abc21b2d5e74981ff2b848f3a08a580d50a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 05:29:50 -1000
Subject: [PATCH 054/134] sched_ext: Add SCX_TASK_REENQ_REASON flags

SCX_ENQ_REENQ indicates that a task is being re-enqueued but doesn't tell the
BPF scheduler why. Add SCX_TASK_REENQ_REASON flags using bits 12-13 of
p->scx.flags to communicate the reason during ops.enqueue():

- NONE: Not being reenqueued
- KFUNC: Reenqueued by scx_bpf_dsq_reenq() and friends

More reasons will be added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/ext.h   | 15 +++++++++++++++
 kernel/sched/ext.c          | 25 ++++++++++++++++++++++---
 kernel/sched/ext_internal.h | 10 +++-------
 3 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index e822b374b17f..60a4f65d0174 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -118,6 +118,21 @@ enum scx_ent_flags {
 	SCX_TASK_READY		= 2 << SCX_TASK_STATE_SHIFT,
 	SCX_TASK_ENABLED	= 3 << SCX_TASK_STATE_SHIFT,
 
+	/*
+	 * Bits 12 and 13 are used to carry reenqueue reason. In addition to
+	 * %SCX_ENQ_REENQ flag, ops.enqueue() can also test for
+	 * %SCX_TASK_REENQ_REASON_NONE to distinguish reenqueues.
+	 *
+	 * NONE		not being reenqueued
+	 * KFUNC	reenqueued by scx_bpf_dsq_reenq() and friends
+	 */
+	SCX_TASK_REENQ_REASON_SHIFT = 12,
+	SCX_TASK_REENQ_REASON_BITS = 2,
+	SCX_TASK_REENQ_REASON_MASK = ((1 << SCX_TASK_REENQ_REASON_BITS) - 1) << SCX_TASK_REENQ_REASON_SHIFT,
+
+	SCX_TASK_REENQ_NONE	= 0 << SCX_TASK_REENQ_REASON_SHIFT,
+	SCX_TASK_REENQ_KFUNC	= 1 << SCX_TASK_REENQ_REASON_SHIFT,
+
 	/* iteration cursor, not a task */
 	SCX_TASK_CURSOR		= 1 << 31,
 };
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f55e1603fc8c..d5849ed4cd3e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3728,8 +3728,10 @@ static void process_ddsp_deferred_locals(struct rq *rq)
 	}
 }
 
-static bool task_should_reenq(struct task_struct *p, u64 reenq_flags)
+static bool task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
 {
+	*reason = SCX_TASK_REENQ_KFUNC;
+
 	if (reenq_flags & SCX_REENQ_ANY)
 		return true;
 	return false;
@@ -3751,6 +3753,7 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
 	list_for_each_entry_safe(p, n, &rq->scx.local_dsq.list,
 				 scx.dsq_list.node) {
 		struct scx_sched *task_sch = scx_task_sched(p);
+		u32 reason;
 
 		/*
 		 * If @p is being migrated, @p's current CPU may not agree with
@@ -3769,16 +3772,24 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
 		if (!scx_is_descendant(task_sch, sch))
 			continue;
 
-		if (!task_should_reenq(p, reenq_flags))
+		if (!task_should_reenq(p, reenq_flags, &reason))
 			continue;
 
 		dispatch_dequeue(rq, p);
+
+		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
+			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+		p->scx.flags |= reason;
+
 		list_add_tail(&p->scx.dsq_list.node, &tasks);
 	}
 
 	list_for_each_entry_safe(p, n, &tasks, scx.dsq_list.node) {
 		list_del_init(&p->scx.dsq_list.node);
+
 		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+
+		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
 		nr_enqueued++;
 	}
 
@@ -3832,12 +3843,13 @@ static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flag
 
 	while (likely(!READ_ONCE(sch->bypass_depth))) {
 		struct rq *task_rq;
+		u32 reason;
 
 		p = nldsq_cursor_next_task(&cursor, dsq);
 		if (!p)
 			break;
 
-		if (!task_should_reenq(p, reenq_flags))
+		if (!task_should_reenq(p, reenq_flags, &reason))
 			continue;
 
 		task_rq = task_rq(p);
@@ -3860,8 +3872,15 @@ static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flag
 		/* @p is on @dsq, its rq and @dsq are locked */
 		dispatch_dequeue_locked(p, dsq);
 		raw_spin_unlock(&dsq->lock);
+
+		if (WARN_ON_ONCE(p->scx.flags & SCX_TASK_REENQ_REASON_MASK))
+			p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+		p->scx.flags |= reason;
+
 		do_enqueue_task(task_rq, p, SCX_ENQ_REENQ, -1);
 
+		p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+
 		if (!(++nr_enqueued % SCX_TASK_ITER_BATCH)) {
 			raw_spin_rq_unlock(locked_rq);
 			locked_rq = NULL;
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index d9eda2e8701c..f8df73044515 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1080,13 +1080,9 @@ enum scx_enq_flags {
 	SCX_ENQ_PREEMPT		= 1LLU << 32,
 
 	/*
-	 * The task being enqueued was previously enqueued on the current CPU's
-	 * %SCX_DSQ_LOCAL, but was removed from it in a call to the
-	 * scx_bpf_reenqueue_local() kfunc. If scx_bpf_reenqueue_local() was
-	 * invoked in a ->cpu_release() callback, and the task is again
-	 * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the
-	 * task will not be scheduled on the CPU until at least the next invocation
-	 * of the ->cpu_acquire() callback.
+	 * The task being enqueued was previously enqueued on a DSQ, but was
+	 * removed and is being re-enqueued. See SCX_TASK_REENQ_* flags to find
+	 * out why a given task is being reenqueued.
 	 */
 	SCX_ENQ_REENQ		= 1LLU << 40,
 

From 28c4ef2b2e57cb13bf784251e4abbf942d37b4ce Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Sun, 8 Mar 2026 01:26:28 +0800
Subject: [PATCH 055/134] sched_ext: Fix scx_bpf_reenqueue_local() silently
 reenqueuing nothing

ffa7ae0724e4 ("sched_ext: Add reenq_flags plumbing to scx_bpf_dsq_reenq()")
introduced task_should_reenq() as a filter inside reenq_local(), requiring
SCX_REENQ_ANY to be set in order to match any task. scx_bpf_dsq_reenq()
handles this correctly by converting a bare reenq_flags=0 to SCX_REENQ_ANY,
but scx_bpf_reenqueue_local() was not updated and continued to call
reenq_local() with 0, causing it to silently reenqueue zero tasks.

Fix by passing SCX_REENQ_ANY directly.

Fixes: ffa7ae0724e4 ("sched_ext: Add reenq_flags plumbing to scx_bpf_dsq_reenq()")
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d5849ed4cd3e..f6bafcfe0b93 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8121,7 +8121,7 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
 	rq = cpu_rq(smp_processor_id());
 	lockdep_assert_rq_held(rq);
 
-	return reenq_local(sch, rq, 0);
+	return reenq_local(sch, rq, SCX_REENQ_ANY);
 }
 
 __bpf_kfunc_end_defs();

From 80a54b807d6c0b98e43522f102da61c953cfd502 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 21:42:12 -1000
Subject: [PATCH 056/134] Revert "sched_ext: Use READ_ONCE() for the read side
 of dsq->nr update"

This reverts commit 9adfcef334bf9c6ef68eaecfca5f45d18614efe0.

dsq->nr is protected by dsq->lock and reading while holding the lock doesn't
constitute a racy read.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: zhidao su <suzhidao@xiaomi.com>
---
 kernel/sched/ext.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index f6bafcfe0b93..d6d807337013 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1296,12 +1296,8 @@ static bool scx_dsq_priq_less(struct rb_node *node_a,
 
 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
 {
-	/*
-	 * scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE()
-	 * on the read side and WRITE_ONCE() on the write side to properly
-	 * annotate the concurrent lockless access and avoid KCSAN warnings.
-	 */
-	WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta);
+	/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
+	WRITE_ONCE(dsq->nr, dsq->nr + delta);
 }
 
 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)

From c90af06c80a33a28f0bdba3ba136439afad4dd38 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 16:45:14 -1000
Subject: [PATCH 057/134] tools/sched_ext/include: Remove dead sdt_task_defs.h
 guard from common.h
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The __has_include guard for sdt_task_defs.h is vestigial — the only
remaining content is the bpf_arena_common.h include which is available
unconditionally. Remove the dead guard.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
---
 tools/sched_ext/include/scx/common.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
index b3c6372bcf81..823251fc4715 100644
--- a/tools/sched_ext/include/scx/common.h
+++ b/tools/sched_ext/include/scx/common.h
@@ -74,10 +74,6 @@ typedef int64_t s64;
 #include "compat.h"
 #include "enums.h"
 
-/* not available when building kernel tools/sched_ext */
-#if __has_include(<lib/sdt_task_defs.h>)
 #include "bpf_arena_common.h"
-#include <lib/sdt_task_defs.h>
-#endif
 
 #endif	/* __SCHED_EXT_COMMON_H */

From 9c6437f7c2e848aea2469df3396f8365d06adbb0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 16:45:15 -1000
Subject: [PATCH 058/134] tools/sched_ext/include: Sync bpf_arena_common.bpf.h
 with scx repo

Sync the following changes from the scx repo:

- Guard __arena define with #ifndef to avoid redefinition when the
  attribute is already defined by another header.
- Add bpf_arena_reserve_pages() and bpf_arena_mapping_nr_pages() ksym
  declarations.
- Rename TEST to SCX_BPF_UNITTEST to avoid collision with generic TEST
  macros in other projects.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
---
 tools/sched_ext/include/scx/bpf_arena_common.bpf.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h
index 4366fb3c91ce..2043d66940ea 100644
--- a/tools/sched_ext/include/scx/bpf_arena_common.bpf.h
+++ b/tools/sched_ext/include/scx/bpf_arena_common.bpf.h
@@ -15,7 +15,9 @@
 #endif
 
 #if defined(__BPF_FEATURE_ADDR_SPACE_CAST) && !defined(BPF_ARENA_FORCE_ASM)
+#ifndef __arena
 #define __arena __attribute__((address_space(1)))
+#endif
 #define __arena_global __attribute__((address_space(1)))
 #define cast_kern(ptr) /* nop for bpf prog. emitted by LLVM */
 #define cast_user(ptr) /* nop for bpf prog. emitted by LLVM */
@@ -81,12 +83,13 @@
 void __arena* bpf_arena_alloc_pages(void *map, void __arena *addr, __u32 page_cnt,
 				    int node_id, __u64 flags) __ksym __weak;
 void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;
+int bpf_arena_reserve_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym __weak;
 
 /*
  * Note that cond_break can only be portably used in the body of a breakable
  * construct, whereas can_loop can be used anywhere.
  */
-#ifdef TEST
+#ifdef SCX_BPF_UNITTEST
 #define can_loop true
 #define __cond_break(expr) expr
 #else
@@ -165,7 +168,7 @@ void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym _
 	})
 #endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
 #endif /* __BPF_FEATURE_MAY_GOTO */
-#endif /* TEST */
+#endif /* SCX_BPF_UNITTEST */
 
 #define cond_break __cond_break(break)
 #define cond_break_label(label) __cond_break(goto label)
@@ -173,3 +176,4 @@ void bpf_arena_free_pages(void *map, void __arena *ptr, __u32 page_cnt) __ksym _
 
 void bpf_preempt_disable(void) __weak __ksym;
 void bpf_preempt_enable(void) __weak __ksym;
+ssize_t bpf_arena_mapping_nr_pages(void *p__map) __weak __ksym;

From 3691d380d5ca8c847c716327aad73a07307ec9c4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 16:45:16 -1000
Subject: [PATCH 059/134] tools/sched_ext/include: Add missing helpers to
 common.bpf.h

Sync several helpers from the scx repo:
- bpf_cgroup_acquire() ksym declaration
- __sink() macro for hiding values from verifier precision tracking
- ctzll() count-trailing-zeros implementation
- get_prandom_u64() helper
- scx_clock_task/pelt/virt/irq() clock helpers with get_current_rq()

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
---
 tools/sched_ext/include/scx/common.bpf.h | 277 +++++++++++++++++++++++
 1 file changed, 277 insertions(+)

diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index eba4d87345e0..a63a98a96b86 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -292,6 +292,50 @@ BPF_PROG(name, ##args)
 })
 #endif /* ARRAY_ELEM_PTR */
 
+/**
+ * __sink - Hide @expr's value from the compiler and BPF verifier
+ * @expr: The expression whose value should be opacified
+ *
+ * No-op at runtime. The empty inline assembly with a read-write constraint
+ * ("+g") has two effects at compile/verify time:
+ *
+ * 1. Compiler: treats @expr as both read and written, preventing dead-code
+ *    elimination and keeping @expr (and any side effects that produced it)
+ *    alive.
+ *
+ * 2. BPF verifier: forgets the precise value/range of @expr ("makes it
+ *    imprecise"). The verifier normally tracks exact ranges for every register
+ *    and stack slot. While useful, precision means each distinct value creates a
+ *    separate verifier state. Inside loops this leads to state explosion - each
+ *    iteration carries different precise values so states never merge and the
+ *    verifier explores every iteration individually.
+ *
+ * Example - preventing loop state explosion::
+ *
+ *     u32 nr_intersects = 0, nr_covered = 0;
+ *     __sink(nr_intersects);
+ *     __sink(nr_covered);
+ *     bpf_for(i, 0, nr_nodes) {
+ *         if (intersects(cpumask, node_mask[i]))
+ *             nr_intersects++;
+ *         if (covers(cpumask, node_mask[i]))
+ *             nr_covered++;
+ *     }
+ *
+ * Without __sink(), the verifier tracks every possible (nr_intersects,
+ * nr_covered) pair across iterations, causing "BPF program is too large". With
+ * __sink(), the values become unknown scalars so all iterations collapse into
+ * one reusable state.
+ *
+ * Example - keeping a reference alive::
+ *
+ *     struct task_struct *t = bpf_task_acquire(task);
+ *     __sink(t);
+ *
+ * Follows the convention from BPF selftests (bpf_misc.h).
+ */
+#define __sink(expr) asm volatile ("" : "+g"(expr))
+
 /*
  * BPF declarations and helpers
  */
@@ -337,6 +381,7 @@ void bpf_task_release(struct task_struct *p) __ksym;
 
 /* cgroup */
 struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym;
+struct cgroup *bpf_cgroup_acquire(struct cgroup *cgrp) __ksym;
 void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
 struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
 
@@ -742,6 +787,73 @@ static inline u64 __sqrt_u64(u64 x)
 	return r;
 }
 
+/*
+ * ctzll -- Counts trailing zeros in an unsigned long long. If the input value
+ * is zero, the return value is undefined.
+ */
+static inline int ctzll(u64 v)
+{
+#if (!defined(__BPF__) && defined(__SCX_TARGET_ARCH_x86)) || \
+	(defined(__BPF__) && defined(__clang_major__) && __clang_major__ >= 19)
+	/*
+	 * Use the ctz builtin when: (1) building for native x86, or
+	 * (2) building for BPF with clang >= 19 (BPF backend supports
+	 * the intrinsic from clang 19 onward; earlier versions hit
+	 * "unimplemented opcode" in the backend).
+	 */
+	return __builtin_ctzll(v);
+#else
+	/*
+	 * If neither the target architecture nor the toolchains support ctzll,
+	 * use software-based emulation. Let's use the De Bruijn sequence-based
+	 * approach to find LSB fastly. See the details of De Bruijn sequence:
+	 *
+	 * https://en.wikipedia.org/wiki/De_Bruijn_sequence
+	 * https://www.chessprogramming.org/BitScan#De_Bruijn_Multiplication
+	 */
+	const int lookup_table[64] = {
+		 0,  1, 48,  2, 57, 49, 28,  3, 61, 58, 50, 42, 38, 29, 17,  4,
+		62, 55, 59, 36, 53, 51, 43, 22, 45, 39, 33, 30, 24, 18, 12,  5,
+		63, 47, 56, 27, 60, 41, 37, 16, 54, 35, 52, 21, 44, 32, 23, 11,
+		46, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19,  9, 13,  8,  7,  6,
+	};
+	const u64 DEBRUIJN_CONSTANT = 0x03f79d71b4cb0a89ULL;
+	unsigned int index;
+	u64 lowest_bit;
+	const int *lt;
+
+	if (v == 0)
+		return -1;
+
+	/*
+	 * Isolate the least significant bit (LSB).
+	 * For example, if v = 0b...10100, then v & -v = 0b...00100
+	 */
+	lowest_bit = v & -v;
+
+	/*
+	 * Each isolated bit produces a unique 6-bit value, guaranteed by the
+	 * De Bruijn property. Calculate a unique index into the lookup table
+	 * using the magic constant and a right shift.
+	 *
+	 * Multiplying by the 64-bit constant "spreads out" that 1-bit into a
+	 * unique pattern in the top 6 bits. This uniqueness property is
+	 * exactly what a De Bruijn sequence guarantees: Every possible 6-bit
+	 * pattern (in top bits) occurs exactly once for each LSB position. So,
+	 * the constant 0x03f79d71b4cb0a89ULL is carefully chosen to be a
+	 * De Bruijn sequence, ensuring no collisions in the table index.
+	 */
+	index = (lowest_bit * DEBRUIJN_CONSTANT) >> 58;
+
+	/*
+	 * Lookup in a precomputed table. No collision is guaranteed by the
+	 * De Bruijn property.
+	 */
+	lt = MEMBER_VPTR(lookup_table, [index]);
+	return (lt)? *lt : -1;
+#endif
+}
+
 /*
  * Return a value proportionally scaled to the task's weight.
  */
@@ -759,6 +871,171 @@ static inline u64 scale_by_task_weight_inverse(const struct task_struct *p, u64
 }
 
 
+/*
+ * Get a random u64 from the kernel's pseudo-random generator.
+ */
+static inline u64 get_prandom_u64()
+{
+	return ((u64)bpf_get_prandom_u32() << 32) | bpf_get_prandom_u32();
+}
+
+/*
+ * Define the shadow structure to avoid a compilation error when
+ * vmlinux.h does not enable necessary kernel configs. The ___local
+ * suffix is a CO-RE convention that tells the loader to match this
+ * against the base struct rq in the kernel. The attribute
+ * preserve_access_index tells the compiler to generate a CO-RE
+ * relocation for these fields.
+ */
+struct rq___local {
+	/*
+	 * A monotonically increasing clock per CPU. It is rq->clock minus
+	 * cumulative IRQ time and hypervisor steal time. Unlike rq->clock,
+	 * it does not advance during IRQ processing or hypervisor preemption.
+	 * It does advance during idle (the idle task counts as a running task
+	 * for this purpose).
+	 */
+	u64		clock_task;
+	/*
+	 * Invariant version of clock_task scaled by CPU capacity and
+	 * frequency. For example, clock_pelt advances 2x slower on a CPU
+	 * with half the capacity.
+	 *
+	 * At idle exit, rq->clock_pelt jumps forward to resync with
+	 * clock_task. The kernel's rq_clock_pelt() corrects for this jump
+	 * by subtracting lost_idle_time, yielding a clock that appears
+	 * continuous across idle transitions. scx_clock_pelt() mirrors
+	 * rq_clock_pelt() by performing the same subtraction.
+	 */
+	u64		clock_pelt;
+	/*
+	 * Accumulates the magnitude of each clock_pelt jump at idle exit.
+	 * Subtracting this from clock_pelt gives rq_clock_pelt(): a
+	 * continuous, capacity-invariant clock suitable for both task
+	 * execution time stamping and cross-idle measurements.
+	 */
+	unsigned long	lost_idle_time;
+	/*
+	 * Shadow of paravirt_steal_clock() (the hypervisor's cumulative
+	 * stolen time counter). Stays frozen while the hypervisor preempts
+	 * the vCPU; catches up the next time update_rq_clock_task() is
+	 * called. The delta is the stolen time not yet subtracted from
+	 * clock_task.
+	 *
+	 * Unlike irqtime->total (a plain kernel-side field), the live stolen
+	 * time counter lives in hypervisor-specific shared memory and has no
+	 * kernel-side equivalent readable from BPF in a hypervisor-agnostic
+	 * way. This field is therefore the only portable BPF-accessible
+	 * approximation of cumulative steal time.
+	 *
+	 * Available only when CONFIG_PARAVIRT_TIME_ACCOUNTING is on.
+	 */
+	u64		prev_steal_time_rq;
+} __attribute__((preserve_access_index));
+
+extern struct rq runqueues __ksym;
+
+/*
+ * Define the shadow structure to avoid a compilation error when
+ * vmlinux.h does not enable necessary kernel configs.
+ */
+struct irqtime___local {
+	/*
+	 * Cumulative IRQ time counter for this CPU, in nanoseconds. Advances
+	 * immediately at the exit of every hardirq and non-ksoftirqd softirq
+	 * via irqtime_account_irq(). ksoftirqd time is counted as normal
+	 * task time and is NOT included. NMI time is also NOT included.
+	 *
+	 * The companion field irqtime->sync (struct u64_stats_sync) protects
+	 * against 64-bit tearing on 32-bit architectures. On 64-bit kernels,
+	 * u64_stats_sync is an empty struct and all seqcount operations are
+	 * no-ops, so a plain BPF_CORE_READ of this field is safe.
+	 *
+	 * Available only when CONFIG_IRQ_TIME_ACCOUNTING is on.
+	 */
+	u64		total;
+} __attribute__((preserve_access_index));
+
+/*
+ * cpu_irqtime is a per-CPU variable defined only when
+ * CONFIG_IRQ_TIME_ACCOUNTING is on. Declare it as __weak so the BPF
+ * loader sets its address to 0 (rather than failing) when the symbol
+ * is absent from the running kernel.
+ */
+extern struct irqtime___local cpu_irqtime __ksym __weak;
+
+static inline struct rq___local *get_current_rq(u32 cpu)
+{
+	/*
+	 * This is a workaround to get an rq pointer since we decided to
+	 * deprecate scx_bpf_cpu_rq().
+	 *
+	 * WARNING: The caller must hold the rq lock for @cpu. This is
+	 * guaranteed when called from scheduling callbacks (ops.running,
+	 * ops.stopping, ops.enqueue, ops.dequeue, ops.dispatch, etc.).
+	 * There is no runtime check available in BPF for kernel spinlock
+	 * state — correctness is enforced by calling context only.
+	 */
+	return (void *)bpf_per_cpu_ptr(&runqueues, cpu);
+}
+
+static inline u64 scx_clock_task(u32 cpu)
+{
+	struct rq___local *rq = get_current_rq(cpu);
+
+	/* Equivalent to the kernel's rq_clock_task(). */
+	return rq ? rq->clock_task : 0;
+}
+
+static inline u64 scx_clock_pelt(u32 cpu)
+{
+	struct rq___local *rq = get_current_rq(cpu);
+
+	/*
+	 * Equivalent to the kernel's rq_clock_pelt(): subtracts
+	 * lost_idle_time from clock_pelt to absorb the jump that occurs
+	 * when clock_pelt resyncs with clock_task at idle exit. The result
+	 * is a continuous, capacity-invariant clock safe for both task
+	 * execution time stamping and cross-idle measurements.
+	 */
+	return rq ? (rq->clock_pelt - rq->lost_idle_time) : 0;
+}
+
+static inline u64 scx_clock_virt(u32 cpu)
+{
+	struct rq___local *rq;
+
+	/*
+	 * Check field existence before calling get_current_rq() so we avoid
+	 * the per_cpu lookup entirely on kernels built without
+	 * CONFIG_PARAVIRT_TIME_ACCOUNTING.
+	 */
+	if (!bpf_core_field_exists(((struct rq___local *)0)->prev_steal_time_rq))
+		return 0;
+
+	/* Lagging shadow of the kernel's paravirt_steal_clock(). */
+	rq = get_current_rq(cpu);
+	return rq ? BPF_CORE_READ(rq, prev_steal_time_rq) : 0;
+}
+
+static inline u64 scx_clock_irq(u32 cpu)
+{
+	struct irqtime___local *irqt;
+
+	/*
+	 * bpf_core_type_exists() resolves at load time: if struct irqtime is
+	 * absent from kernel BTF (CONFIG_IRQ_TIME_ACCOUNTING off), the loader
+	 * patches this into an unconditional return 0, making the
+	 * bpf_per_cpu_ptr() call below dead code that the verifier never sees.
+	 */
+	if (!bpf_core_type_exists(struct irqtime___local))
+		return 0;
+
+	/* Equivalent to the kernel's irq_time_read(). */
+	irqt = bpf_per_cpu_ptr(&cpu_irqtime, cpu);
+	return irqt ? BPF_CORE_READ(irqt, total) : 0;
+}
+
 #include "compat.bpf.h"
 #include "enums.bpf.h"
 

From c9c8546cdee64d3f1a54fe09cc2b8c1a8ea80c6d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 16:45:17 -1000
Subject: [PATCH 060/134] tools/sched_ext/include: Add
 __COMPAT_HAS_scx_bpf_select_cpu_and macro

scx_bpf_select_cpu_and() is now an inline wrapper so
bpf_ksym_exists(scx_bpf_select_cpu_and) no longer works. Add
__COMPAT_HAS_scx_bpf_select_cpu_and macro that checks for either the
struct args type (new) or the compat ksym (old) to test availability.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
---
 tools/sched_ext/include/scx/compat.bpf.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 2d3985be7e2c..704728864d83 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -266,6 +266,14 @@ scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
 	}
 }
 
+/*
+ * scx_bpf_select_cpu_and() is now an inline wrapper. Use this instead of
+ * bpf_ksym_exists(scx_bpf_select_cpu_and) to test availability.
+ */
+#define __COMPAT_HAS_scx_bpf_select_cpu_and				\
+	(bpf_core_type_exists(struct scx_bpf_select_cpu_and_args) ||	\
+	 bpf_ksym_exists(scx_bpf_select_cpu_and___compat))
+
 /**
  * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ
  * @p: task_struct to insert

From 93ac9b150e2fba3a5e94e0b20d954a12e4b0907f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 16:45:18 -1000
Subject: [PATCH 061/134] tools/sched_ext/include: Add libbpf version guard for
 assoc_struct_ops

Extract the inline bpf_program__assoc_struct_ops() call in SCX_OPS_LOAD()
into a __scx_ops_assoc_prog() helper and wrap it with a libbpf >= 1.7
version guard. bpf_program__assoc_struct_ops() was added in libbpf 1.7;
the guard provides a no-op fallback for older versions. Add the
<bpf/libbpf.h> include needed by the helper, and fix "assumming" typo in
a nearby comment.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
---
 tools/sched_ext/include/scx/compat.h | 35 +++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 9b6df13b187b..50297d4b9533 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -8,6 +8,7 @@
 #define __SCX_COMPAT_H
 
 #include <bpf/btf.h>
+#include <bpf/libbpf.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -182,6 +183,31 @@ static inline long scx_hotplug_seq(void)
 	__skel; 								\
 })
 
+/*
+ * Associate non-struct_ops BPF programs with the scheduler's struct_ops map so
+ * that scx_prog_sched() can determine which scheduler a BPF program belongs
+ * to. Requires libbpf >= 1.7.
+ */
+#if LIBBPF_MAJOR_VERSION > 1 ||							\
+	(LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 7)
+static inline void __scx_ops_assoc_prog(struct bpf_program *prog,
+					struct bpf_map *map,
+					const char *ops_name)
+{
+	s32 err = bpf_program__assoc_struct_ops(prog, map, NULL);
+	if (err)
+		fprintf(stderr,
+			"ERROR: Failed to associate %s with %s: %d\n",
+			bpf_program__name(prog), ops_name, err);
+}
+#else
+static inline void __scx_ops_assoc_prog(struct bpf_program *prog,
+					struct bpf_map *map,
+					const char *ops_name)
+{
+}
+#endif
+
 #define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({		\
 	struct bpf_program *__prog;						\
 	UEI_SET_SIZE(__skel, __ops_name, __uei_name);				\
@@ -189,18 +215,15 @@ static inline long scx_hotplug_seq(void)
 	bpf_object__for_each_program(__prog, (__skel)->obj) {			\
 		if (bpf_program__type(__prog) == BPF_PROG_TYPE_STRUCT_OPS)	\
 			continue;						\
-		s32 err = bpf_program__assoc_struct_ops(__prog,			\
-					(__skel)->maps.__ops_name, NULL);	\
-		if (err)							\
-			fprintf(stderr, "ERROR: Failed to associate %s with %s: %d\n", \
-				bpf_program__name(__prog), #__ops_name, err);	\
+		__scx_ops_assoc_prog(__prog, (__skel)->maps.__ops_name,		\
+				     #__ops_name);				\
 	}									\
 })
 
 /*
  * New versions of bpftool now emit additional link placeholders for BPF maps,
  * and set up BPF skeleton in such a way that libbpf will auto-attach BPF maps
- * automatically, assumming libbpf is recent enough (v1.5+). Old libbpf will do
+ * automatically, assuming libbpf is recent enough (v1.5+). Old libbpf will do
  * nothing with those links and won't attempt to auto-attach maps.
  *
  * To maintain compatibility with older libbpf while avoiding trying to attach

From 0a0d3b8dd06b9df0bdc31427090a85e90ac0406b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 7 Mar 2026 16:45:19 -1000
Subject: [PATCH 062/134] tools/sched_ext/include: Regenerate
 enum_defs.autogen.h

Regenerate enum_defs.autogen.h from the current vmlinux.h to pick up
new SCX enums added in the for-7.1 cycle.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
---
 .../sched_ext/include/scx/enum_defs.autogen.h | 49 ++++++++++++++-----
 1 file changed, 37 insertions(+), 12 deletions(-)

diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h
index dcc945304760..78d34f0c29f0 100644
--- a/tools/sched_ext/include/scx/enum_defs.autogen.h
+++ b/tools/sched_ext/include/scx/enum_defs.autogen.h
@@ -14,7 +14,13 @@
 #define HAVE_SCX_EXIT_MSG_LEN
 #define HAVE_SCX_EXIT_DUMP_DFL_LEN
 #define HAVE_SCX_CPUPERF_ONE
-#define HAVE_SCX_OPS_TASK_ITER_BATCH
+#define HAVE_SCX_TASK_ITER_BATCH
+#define HAVE_SCX_BYPASS_HOST_NTH
+#define HAVE_SCX_BYPASS_LB_DFL_INTV_US
+#define HAVE_SCX_BYPASS_LB_DONOR_PCT
+#define HAVE_SCX_BYPASS_LB_MIN_DELTA_DIV
+#define HAVE_SCX_BYPASS_LB_BATCH
+#define HAVE_SCX_SUB_MAX_DEPTH
 #define HAVE_SCX_CPU_PREEMPT_RT
 #define HAVE_SCX_CPU_PREEMPT_DL
 #define HAVE_SCX_CPU_PREEMPT_STOP
@@ -27,6 +33,7 @@
 #define HAVE_SCX_DSQ_INVALID
 #define HAVE_SCX_DSQ_GLOBAL
 #define HAVE_SCX_DSQ_LOCAL
+#define HAVE_SCX_DSQ_BYPASS
 #define HAVE_SCX_DSQ_LOCAL_ON
 #define HAVE_SCX_DSQ_LOCAL_CPU_MASK
 #define HAVE_SCX_DSQ_ITER_REV
@@ -36,6 +43,10 @@
 #define HAVE___SCX_DSQ_ITER_ALL_FLAGS
 #define HAVE_SCX_DSQ_LNODE_ITER_CURSOR
 #define HAVE___SCX_DSQ_LNODE_PRIV_SHIFT
+#define HAVE_SCX_ENABLING
+#define HAVE_SCX_ENABLED
+#define HAVE_SCX_DISABLING
+#define HAVE_SCX_DISABLED
 #define HAVE_SCX_ENQ_WAKEUP
 #define HAVE_SCX_ENQ_HEAD
 #define HAVE_SCX_ENQ_CPU_SELECTED
@@ -45,22 +56,37 @@
 #define HAVE___SCX_ENQ_INTERNAL_MASK
 #define HAVE_SCX_ENQ_CLEAR_OPSS
 #define HAVE_SCX_ENQ_DSQ_PRIQ
+#define HAVE_SCX_ENQ_NESTED
 #define HAVE_SCX_TASK_DSQ_ON_PRIQ
 #define HAVE_SCX_TASK_QUEUED
+#define HAVE_SCX_TASK_IN_CUSTODY
 #define HAVE_SCX_TASK_RESET_RUNNABLE_AT
 #define HAVE_SCX_TASK_DEQD_FOR_SLEEP
+#define HAVE_SCX_TASK_SUB_INIT
 #define HAVE_SCX_TASK_STATE_SHIFT
 #define HAVE_SCX_TASK_STATE_BITS
 #define HAVE_SCX_TASK_STATE_MASK
+#define HAVE_SCX_TASK_NONE
+#define HAVE_SCX_TASK_INIT
+#define HAVE_SCX_TASK_READY
+#define HAVE_SCX_TASK_ENABLED
+#define HAVE_SCX_TASK_REENQ_REASON_SHIFT
+#define HAVE_SCX_TASK_REENQ_REASON_BITS
+#define HAVE_SCX_TASK_REENQ_REASON_MASK
+#define HAVE_SCX_TASK_REENQ_NONE
+#define HAVE_SCX_TASK_REENQ_KFUNC
 #define HAVE_SCX_TASK_CURSOR
 #define HAVE_SCX_ECODE_RSN_HOTPLUG
+#define HAVE_SCX_ECODE_RSN_CGROUP_OFFLINE
 #define HAVE_SCX_ECODE_ACT_RESTART
+#define HAVE_SCX_EFLAG_INITIALIZED
 #define HAVE_SCX_EXIT_NONE
 #define HAVE_SCX_EXIT_DONE
 #define HAVE_SCX_EXIT_UNREG
 #define HAVE_SCX_EXIT_UNREG_BPF
 #define HAVE_SCX_EXIT_UNREG_KERN
 #define HAVE_SCX_EXIT_SYSRQ
+#define HAVE_SCX_EXIT_PARENT
 #define HAVE_SCX_EXIT_ERROR
 #define HAVE_SCX_EXIT_ERROR_BPF
 #define HAVE_SCX_EXIT_ERROR_STALL
@@ -81,40 +107,39 @@
 #define HAVE_SCX_OPI_CPU_HOTPLUG_BEGIN
 #define HAVE_SCX_OPI_CPU_HOTPLUG_END
 #define HAVE_SCX_OPI_END
-#define HAVE_SCX_OPS_ENABLING
-#define HAVE_SCX_OPS_ENABLED
-#define HAVE_SCX_OPS_DISABLING
-#define HAVE_SCX_OPS_DISABLED
 #define HAVE_SCX_OPS_KEEP_BUILTIN_IDLE
 #define HAVE_SCX_OPS_ENQ_LAST
 #define HAVE_SCX_OPS_ENQ_EXITING
 #define HAVE_SCX_OPS_SWITCH_PARTIAL
 #define HAVE_SCX_OPS_ENQ_MIGRATION_DISABLED
 #define HAVE_SCX_OPS_ALLOW_QUEUED_WAKEUP
+#define HAVE_SCX_OPS_BUILTIN_IDLE_PER_NODE
 #define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT
 #define HAVE_SCX_OPS_ALL_FLAGS
+#define HAVE___SCX_OPS_INTERNAL_MASK
+#define HAVE_SCX_OPS_HAS_CPU_PREEMPT
 #define HAVE_SCX_OPSS_NONE
 #define HAVE_SCX_OPSS_QUEUEING
 #define HAVE_SCX_OPSS_QUEUED
 #define HAVE_SCX_OPSS_DISPATCHING
 #define HAVE_SCX_OPSS_QSEQ_SHIFT
 #define HAVE_SCX_PICK_IDLE_CORE
+#define HAVE_SCX_PICK_IDLE_IN_NODE
 #define HAVE_SCX_OPS_NAME_LEN
 #define HAVE_SCX_SLICE_DFL
+#define HAVE_SCX_SLICE_BYPASS
 #define HAVE_SCX_SLICE_INF
+#define HAVE_SCX_REENQ_ANY
+#define HAVE___SCX_REENQ_FILTER_MASK
+#define HAVE___SCX_REENQ_USER_MASK
 #define HAVE_SCX_RQ_ONLINE
 #define HAVE_SCX_RQ_CAN_STOP_TICK
-#define HAVE_SCX_RQ_BAL_PENDING
 #define HAVE_SCX_RQ_BAL_KEEP
-#define HAVE_SCX_RQ_BYPASSING
 #define HAVE_SCX_RQ_CLK_VALID
+#define HAVE_SCX_RQ_BAL_CB_PENDING
 #define HAVE_SCX_RQ_IN_WAKEUP
 #define HAVE_SCX_RQ_IN_BALANCE
-#define HAVE_SCX_TASK_NONE
-#define HAVE_SCX_TASK_INIT
-#define HAVE_SCX_TASK_READY
-#define HAVE_SCX_TASK_ENABLED
-#define HAVE_SCX_TASK_NR_STATES
+#define HAVE_SCX_SCHED_PCPU_BYPASSING
 #define HAVE_SCX_TG_ONLINE
 #define HAVE_SCX_TG_INITED
 #define HAVE_SCX_WAKE_FORK

From bec10581e92289bdc3eed17e90200900ddb00594 Mon Sep 17 00:00:00 2001
From: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
Date: Mon, 9 Mar 2026 10:28:46 +0800
Subject: [PATCH 063/134] sched_ext: remove SCX_OPS_HAS_CGROUP_WEIGHT

While running scx_flatcg, dmesg prints "SCX_OPS_HAS_CGROUP_WEIGHT is
deprecated and a noop", in code, SCX_OPS_HAS_CGROUP_WEIGHT has been
marked as DEPRECATED, and will be removed on 6.18. Now it's time to do it.

Signed-off-by: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c                              | 3 ---
 kernel/sched/ext_internal.h                     | 8 +-------
 tools/sched_ext/include/scx/enum_defs.autogen.h | 1 -
 tools/sched_ext/scx_flatcg.bpf.c                | 2 +-
 4 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b6a2d3b511e6..43fda1258903 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6282,9 +6282,6 @@ static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops)
 		return -EINVAL;
 	}
 
-	if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT)
-		pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n");
-
 	if (ops->cpu_acquire || ops->cpu_release)
 		pr_warn("ops->cpu_acquire/release() are deprecated, use sched_switch TP instead\n");
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index f8df73044515..bec4d22890b0 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -180,19 +180,13 @@ enum scx_ops_flags {
 	 */
 	SCX_OPS_BUILTIN_IDLE_PER_NODE	= 1LLU << 6,
 
-	/*
-	 * CPU cgroup support flags
-	 */
-	SCX_OPS_HAS_CGROUP_WEIGHT	= 1LLU << 16,	/* DEPRECATED, will be removed on 6.18 */
-
 	SCX_OPS_ALL_FLAGS		= SCX_OPS_KEEP_BUILTIN_IDLE |
 					  SCX_OPS_ENQ_LAST |
 					  SCX_OPS_ENQ_EXITING |
 					  SCX_OPS_ENQ_MIGRATION_DISABLED |
 					  SCX_OPS_ALLOW_QUEUED_WAKEUP |
 					  SCX_OPS_SWITCH_PARTIAL |
-					  SCX_OPS_BUILTIN_IDLE_PER_NODE |
-					  SCX_OPS_HAS_CGROUP_WEIGHT,
+					  SCX_OPS_BUILTIN_IDLE_PER_NODE,
 
 	/* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
 	__SCX_OPS_INTERNAL_MASK		= 0xffLLU << 56,
diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h
index 78d34f0c29f0..40b30dad8ede 100644
--- a/tools/sched_ext/include/scx/enum_defs.autogen.h
+++ b/tools/sched_ext/include/scx/enum_defs.autogen.h
@@ -114,7 +114,6 @@
 #define HAVE_SCX_OPS_ENQ_MIGRATION_DISABLED
 #define HAVE_SCX_OPS_ALLOW_QUEUED_WAKEUP
 #define HAVE_SCX_OPS_BUILTIN_IDLE_PER_NODE
-#define HAVE_SCX_OPS_HAS_CGROUP_WEIGHT
 #define HAVE_SCX_OPS_ALL_FLAGS
 #define HAVE___SCX_OPS_INTERNAL_MASK
 #define HAVE_SCX_OPS_HAS_CPU_PREEMPT
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 0e785cff0f24..a8a9234bb41e 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -960,5 +960,5 @@ SCX_OPS_DEFINE(flatcg_ops,
 	       .cgroup_move		= (void *)fcg_cgroup_move,
 	       .init			= (void *)fcg_init,
 	       .exit			= (void *)fcg_exit,
-	       .flags			= SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
+	       .flags			= SCX_OPS_ENQ_EXITING,
 	       .name			= "flatcg");

From b8840942644cd76438a192ff493c60ec2169f7ef Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Mar 2026 10:06:02 -1000
Subject: [PATCH 064/134] sched_ext: Replace system_unbound_wq with
 system_dfl_wq in scx_kobj_release()

c2a57380df9d ("sched: Replace use of system_unbound_wq with system_dfl_wq")
converted system_unbound_wq usages in ext.c but missed the queue_rcu_work()
call in scx_kobj_release() which was added later by the dynamic scx_sched
allocation conversion. Apply the same conversion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Marco Crivellari <marco.crivellari@suse.com>
---
 kernel/sched/ext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b35b98020f3b..07476355bfd5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4546,7 +4546,7 @@ static void scx_kobj_release(struct kobject *kobj)
 	struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
 
 	INIT_RCU_WORK(&sch->rcu_work, scx_sched_free_rcu_work);
-	queue_rcu_work(system_unbound_wq, &sch->rcu_work);
+	queue_rcu_work(system_dfl_wq, &sch->rcu_work);
 }
 
 static ssize_t scx_attr_ops_show(struct kobject *kobj,

From 98059335382dc5870207d6a0c1c9e7a004d627ad Mon Sep 17 00:00:00 2001
From: Philipp Hahn <phahn-oss@avm.de>
Date: Tue, 10 Mar 2026 12:48:42 +0100
Subject: [PATCH 065/134] sched: Prefer IS_ERR_OR_NULL over manual NULL check

Prefer using IS_ERR_OR_NULL() over using IS_ERR() and a manual NULL
check.

Change generated with coccinelle.

Signed-off-by: Philipp Hahn <phahn-oss@avm.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 07476355bfd5..93a230859dd7 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4529,7 +4529,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	do {
 		rhashtable_walk_start(&rht_iter);
 
-		while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
+		while (!IS_ERR_OR_NULL((dsq = rhashtable_walk_next(&rht_iter))))
 			destroy_dsq(sch, dsq->id);
 
 		rhashtable_walk_stop(&rht_iter);

From 7e92cf4354e9803cc7b8ed01f38371e0e89d2a0a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 10 Mar 2026 07:12:21 -1000
Subject: [PATCH 066/134] sched_ext: Fix sub_detach op check to test the
 parent's ops

sub_detach is the parent's op called to notify the parent that a child
is detaching. Test parent->ops.sub_detach instead of sch->ops.sub_detach.

Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support")
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 93a230859dd7..4fa0be4980d4 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5438,7 +5438,7 @@ static void scx_sub_disable(struct scx_sched *sch)
 	 */
 	wake_up_all(&scx_unlink_waitq);
 
-	if (sch->ops.sub_detach && sch->sub_attached) {
+	if (parent->ops.sub_detach && sch->sub_attached) {
 		struct scx_sub_detach_args sub_detach_args = {
 			.ops = &sch->ops,
 			.cgroup_path = sch->cgrp_path,

From b5bc043505fed4198158037938ead78557eb79ab Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 10 Mar 2026 07:12:21 -1000
Subject: [PATCH 067/134] sched_ext: Add scx_dump_lock and dump_disabled

Add a dedicated scx_dump_lock and per-sched dump_disabled flag so that
debug dumping can be safely disabled during sched teardown without
relying on scx_sched_lock. This is a prep for the next patch which
decouples the sysrq dump path from scx_sched_lock to resolve a lock
ordering issue.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 25 ++++++++++++++++++++++---
 kernel/sched/ext_internal.h |  1 +
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4fa0be4980d4..5cfac2c97bf3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -136,6 +136,8 @@ static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
 static struct scx_bstr_buf scx_exit_bstr_buf;
 
 /* ops debug dump */
+static DEFINE_RAW_SPINLOCK(scx_dump_lock);
+
 struct scx_dump_data {
 	s32			cpu;
 	bool			first;
@@ -5279,6 +5281,17 @@ static void scx_unlink_sched(struct scx_sched *sch)
 	refresh_watchdog();
 }
 
+/*
+ * Called to disable future dumps and wait for in-progress one while disabling
+ * @sch. Once @sch becomes empty during disable, there's no point in dumping it.
+ * This prevents calling dump ops on a dead sch.
+ */
+static void scx_disable_dump(struct scx_sched *sch)
+{
+	guard(raw_spinlock_irqsave)(&scx_dump_lock);
+	sch->dump_disabled = true;
+}
+
 #ifdef CONFIG_EXT_SUB_SCHED
 static DECLARE_WAIT_QUEUE_HEAD(scx_unlink_waitq);
 
@@ -5414,6 +5427,8 @@ static void scx_sub_disable(struct scx_sched *sch)
 	}
 	scx_task_iter_stop(&sti);
 
+	scx_disable_dump(sch);
+
 	scx_cgroup_unlock();
 	percpu_up_write(&scx_fork_rwsem);
 
@@ -5525,6 +5540,8 @@ static void scx_root_disable(struct scx_sched *sch)
 	}
 	scx_task_iter_stop(&sti);
 
+	scx_disable_dump(sch);
+
 	scx_cgroup_lock();
 	set_cgroup_sched(sch_cgroup(sch), NULL);
 	scx_cgroup_unlock();
@@ -5680,7 +5697,7 @@ static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
 
 #ifdef CONFIG_TRACEPOINTS
 	if (trace_sched_ext_dump_enabled()) {
-		/* protected by scx_dump_state()::dump_lock */
+		/* protected by scx_dump_lock */
 		static char line_buf[SCX_EXIT_MSG_LEN];
 
 		va_start(args, fmt);
@@ -5842,7 +5859,6 @@ static void scx_dump_task(struct scx_sched *sch,
 static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
 			   size_t dump_len, bool dump_all_tasks)
 {
-	static DEFINE_RAW_SPINLOCK(dump_lock);
 	static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
 	struct scx_dump_ctx dctx = {
 		.kind = ei->kind,
@@ -5856,7 +5872,10 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
 	char *buf;
 	int cpu;
 
-	guard(raw_spinlock_irqsave)(&dump_lock);
+	guard(raw_spinlock_irqsave)(&scx_dump_lock);
+
+	if (sch->dump_disabled)
+		return;
 
 	seq_buf_init(&s, ei->dump, dump_len);
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index bec4d22890b0..3623de2c30a1 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1003,6 +1003,7 @@ struct scx_sched {
 	atomic_t		bypass_dsp_enable_depth;
 
 	bool			aborting;
+	bool			dump_disabled;	/* protected by scx_dump_lock */
 	u32			dsp_max_batch;
 	s32			level;
 

From f4a6c506d11823e7123bc6573fbd8e432245acf4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 10 Mar 2026 07:12:21 -1000
Subject: [PATCH 068/134] sched_ext: Always bounce scx_disable() through
 irq_work

scx_disable() directly called kthread_queue_work() which can acquire
worker->lock, pi_lock and rq->__lock. This made scx_disable() unsafe to
call while holding locks that conflict with this chain - in particular,
scx_claim_exit() calls scx_disable() for each descendant while holding
scx_sched_lock, which nests inside rq->__lock in scx_bypass().

The error path (scx_vexit()) was already bouncing through irq_work to
avoid this issue. Generalize the pattern to all scx_disable() calls by
always going through irq_work. irq_work_queue() is lockless and safe to
call from any context, and the actual kthread_queue_work() call happens
in the irq_work handler outside any locks.

Rename error_irq_work to disable_irq_work to reflect the broader usage.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 12 ++++++------
 kernel/sched/ext_internal.h |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5cfac2c97bf3..bc6ce05bb98e 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4498,7 +4498,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 	struct scx_dispatch_q *dsq;
 	int cpu, node;
 
-	irq_work_sync(&sch->error_irq_work);
+	irq_work_sync(&sch->disable_irq_work);
 	kthread_destroy_worker(sch->helper);
 	timer_shutdown_sync(&sch->bypass_lb_timer);
 
@@ -5679,7 +5679,7 @@ static void scx_disable(struct scx_sched *sch, enum scx_exit_kind kind)
 {
 	guard(preempt)();
 	if (scx_claim_exit(sch, kind))
-		kthread_queue_work(sch->helper, &sch->disable_work);
+		irq_work_queue(&sch->disable_irq_work);
 }
 
 static void dump_newline(struct seq_buf *s)
@@ -6012,9 +6012,9 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
 		       trunc_marker, sizeof(trunc_marker));
 }
 
-static void scx_error_irq_workfn(struct irq_work *irq_work)
+static void scx_disable_irq_workfn(struct irq_work *irq_work)
 {
-	struct scx_sched *sch = container_of(irq_work, struct scx_sched, error_irq_work);
+	struct scx_sched *sch = container_of(irq_work, struct scx_sched, disable_irq_work);
 	struct scx_exit_info *ei = sch->exit_info;
 
 	if (ei->kind >= SCX_EXIT_ERROR)
@@ -6048,7 +6048,7 @@ static bool scx_vexit(struct scx_sched *sch,
 	ei->kind = kind;
 	ei->reason = scx_exit_reason(ei->kind);
 
-	irq_work_queue(&sch->error_irq_work);
+	irq_work_queue(&sch->disable_irq_work);
 	return true;
 }
 
@@ -6184,7 +6184,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 
 	sch->slice_dfl = SCX_SLICE_DFL;
 	atomic_set(&sch->exit_kind, SCX_EXIT_NONE);
-	init_irq_work(&sch->error_irq_work, scx_error_irq_workfn);
+	init_irq_work(&sch->disable_irq_work, scx_disable_irq_workfn);
 	kthread_init_work(&sch->disable_work, scx_disable_workfn);
 	timer_setup(&sch->bypass_lb_timer, scx_bypass_lb_timerfn, 0);
 	sch->ops = *ops;
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 3623de2c30a1..c78dadaadab8 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1042,7 +1042,7 @@ struct scx_sched {
 	struct kobject		kobj;
 
 	struct kthread_worker	*helper;
-	struct irq_work		error_irq_work;
+	struct irq_work		disable_irq_work;
 	struct kthread_work	disable_work;
 	struct timer_list	bypass_lb_timer;
 	struct rcu_work		rcu_work;

From 6b36c4c2935c54d6a103389fad2a2a9d25591501 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 10 Mar 2026 07:12:21 -1000
Subject: [PATCH 069/134] sched_ext: Fix scx_sched_lock / rq lock ordering

There are two sites that nest rq lock inside scx_sched_lock:

- scx_bypass() takes scx_sched_lock then rq lock per CPU to propagate
  per-cpu bypass flags and re-enqueue tasks.

- sysrq_handle_sched_ext_dump() takes scx_sched_lock to iterate all
  scheds, scx_dump_state() then takes rq lock per CPU for dump.

And scx_claim_exit() takes scx_sched_lock to propagate exits to
descendants. It can be reached from scx_tick(), BPF kfuncs, and many
other paths with rq lock already held, creating the reverse ordering:

  rq lock -> scx_sched_lock vs. scx_sched_lock -> rq lock

Fix by flipping scx_bypass() to take rq lock first, and dropping
scx_sched_lock from sysrq_handle_sched_ext_dump() as scx_sched_all is
already RCU-traversable and scx_dump_lock now prevents dumping a dead
sched. This makes the consistent ordering rq lock -> scx_sched_lock.

Reported-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Link: https://lore.kernel.org/r/20260309163025.2240221-1-yphbchou0911@gmail.com
Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support")
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index bc6ce05bb98e..efba05725139 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5097,8 +5097,8 @@ static void scx_bypass(struct scx_sched *sch, bool bypass)
 		struct rq *rq = cpu_rq(cpu);
 		struct task_struct *p, *n;
 
-		raw_spin_lock(&scx_sched_lock);
 		raw_spin_rq_lock(rq);
+		raw_spin_lock(&scx_sched_lock);
 
 		scx_for_each_descendant_pre(pos, sch) {
 			struct scx_sched_pcpu *pcpu = per_cpu_ptr(pos->pcpu, cpu);
@@ -7240,8 +7240,6 @@ static void sysrq_handle_sched_ext_dump(u8 key)
 	struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
 	struct scx_sched *sch;
 
-	guard(raw_spinlock_irqsave)(&scx_sched_lock);
-
 	list_for_each_entry_rcu(sch, &scx_sched_all, all)
 		scx_dump_state(sch, &ei, 0, false);
 }

From 6b4576b09714def33890e04ef49621bca3614bbf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 10 Mar 2026 07:12:21 -1000
Subject: [PATCH 070/134] sched_ext: Reject sub-sched attachment to a disabled
 parent

scx_claim_exit() propagates exits to descendants under scx_sched_lock.
A sub-sched being attached concurrently could be missed if it links
after the propagation. Check the parent's exit_kind in scx_link_sched()
under scx_sched_lock to interlock against scx_claim_exit() - either the
parent sees the child in its iteration or the child sees the parent's
non-NONE exit_kind and fails attachment.

Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support")
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index efba05725139..e7ab3647e35f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5247,6 +5247,17 @@ static s32 scx_link_sched(struct scx_sched *sch)
 		s32 ret;
 
 		if (parent) {
+			/*
+			 * scx_claim_exit() propagates exit_kind transition to
+			 * its sub-scheds while holding scx_sched_lock - either
+			 * we can see the parent's non-NONE exit_kind or the
+			 * parent can shoot us down.
+			 */
+			if (atomic_read(&parent->exit_kind) != SCX_EXIT_NONE) {
+				scx_error(sch, "parent disabled");
+				return -ENOENT;
+			}
+
 			ret = rhashtable_lookup_insert_fast(&scx_sched_hash,
 					&sch->hash_node, scx_sched_hash_params);
 			if (ret) {
@@ -5638,6 +5649,11 @@ static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind)
 	 * serialized, running them in separate threads allows parallelizing
 	 * ops.exit(), which can take arbitrarily long prolonging bypass mode.
 	 *
+	 * To guarantee forward progress, this propagation must be in-line so
+	 * that ->aborting is synchronously asserted for all sub-scheds. The
+	 * propagation is also the interlocking point against sub-sched
+	 * attachment. See scx_link_sched().
+	 *
 	 * This doesn't cause recursions as propagation only takes place for
 	 * non-propagation exits.
 	 */

From bd377af0970164a4d12479bf36049619201be2f0 Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Wed, 11 Mar 2026 19:34:07 +0800
Subject: [PATCH 071/134] sched_ext: Fix incomplete help text usage strings

Several demo schedulers and the selftest runner had usage strings
that omitted options which are actually supported:

- scx_central: add missing [-v]
- scx_pair: add missing [-v]
- scx_qmap: add missing [-S] and [-H]
- scx_userland: add missing [-v]
- scx_sdt: remove [-f] which no longer exists
- runner.c: add missing [-s], [-l], [-q]; drop [-h] which none of the
  other sched_ext tools list in their usage lines

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_central.c              | 2 +-
 tools/sched_ext/scx_pair.c                 | 2 +-
 tools/sched_ext/scx_qmap.c                 | 2 +-
 tools/sched_ext/scx_sdt.c                  | 2 +-
 tools/sched_ext/scx_userland.c             | 2 +-
 tools/testing/selftests/sched_ext/runner.c | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index 710fa03376e2..fd4c0eaa4326 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -21,7 +21,7 @@ const char help_fmt[] =
 "\n"
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
-"Usage: %s [-s SLICE_US] [-c CPU]\n"
+"Usage: %s [-s SLICE_US] [-c CPU] [-v]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -c CPU        Override the central CPU (default: 0)\n"
diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c
index 2e509391f3da..61fd86b44c40 100644
--- a/tools/sched_ext/scx_pair.c
+++ b/tools/sched_ext/scx_pair.c
@@ -21,7 +21,7 @@ const char help_fmt[] =
 "\n"
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
-"Usage: %s [-S STRIDE]\n"
+"Usage: %s [-S STRIDE] [-v]\n"
 "\n"
 "  -S STRIDE     Override CPU pair stride (default: nr_cpus_ids / 2)\n"
 "  -v            Print libbpf debug messages\n"
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 9252037284d3..5916bbe0d77f 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -21,7 +21,7 @@ const char help_fmt[] =
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
 "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-"       [-P] [-M] [-d PID] [-D LEN] [-p] [-v]\n"
+"       [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-v]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
diff --git a/tools/sched_ext/scx_sdt.c b/tools/sched_ext/scx_sdt.c
index a36405d8df30..bf664b2d3785 100644
--- a/tools/sched_ext/scx_sdt.c
+++ b/tools/sched_ext/scx_sdt.c
@@ -20,7 +20,7 @@ const char help_fmt[] =
 "\n"
 "Modified version of scx_simple that demonstrates arena-based data structures.\n"
 "\n"
-"Usage: %s [-f] [-v]\n"
+"Usage: %s [-v]\n"
 "\n"
 "  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";
diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c
index 3f2aba658b4a..616043c165e6 100644
--- a/tools/sched_ext/scx_userland.c
+++ b/tools/sched_ext/scx_userland.c
@@ -38,7 +38,7 @@ const char help_fmt[] =
 "\n"
 "Try to reduce `sysctl kernel.pid_max` if this program triggers OOMs.\n"
 "\n"
-"Usage: %s [-b BATCH]\n"
+"Usage: %s [-b BATCH] [-v]\n"
 "\n"
 "  -b BATCH      The number of tasks to batch when dispatching (default: 8)\n"
 "  -v            Print libbpf debug messages\n"
diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c
index 761c21f96404..90043fd74a60 100644
--- a/tools/testing/selftests/sched_ext/runner.c
+++ b/tools/testing/selftests/sched_ext/runner.c
@@ -18,7 +18,7 @@ const char help_fmt[] =
 "It's required for the testcases to be serial, as only a single host-wide sched_ext\n"
 "scheduler may be loaded at any given time."
 "\n"
-"Usage: %s [-t TEST] [-h]\n"
+"Usage: %s [-t TEST] [-s] [-l] [-q]\n"
 "\n"
 "  -t TEST       Only run tests whose name includes this string\n"
 "  -s            Include print output for skipped tests\n"

From 1d02346fec8d13b05e54296ddc6ae29b7e1067df Mon Sep 17 00:00:00 2001
From: David Carlier <devnexen@gmail.com>
Date: Fri, 13 Mar 2026 05:17:55 +0000
Subject: [PATCH 072/134] selftests/sched_ext: Add missing error check for
 exit__load()

exit__load(skel) was called without checking its return value.
Every other test in the suite wraps the load call with
SCX_FAIL_IF(). Add the missing check to be consistent with the
rest of the test suite.

Fixes: a5db7817af78 ("sched_ext: Add selftests")
Signed-off-by: David Carlier <devnexen@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c
index ee25824b1cbe..b987611789d1 100644
--- a/tools/testing/selftests/sched_ext/exit.c
+++ b/tools/testing/selftests/sched_ext/exit.c
@@ -33,7 +33,7 @@ static enum scx_test_status run(void *ctx)
 		skel = exit__open();
 		SCX_ENUM_INIT(skel);
 		skel->rodata->exit_point = tc;
-		exit__load(skel);
+		SCX_FAIL_IF(exit__load(skel), "Failed to load skel");
 		link = bpf_map__attach_struct_ops(skel->maps.exit_ops);
 		if (!link) {
 			SCX_ERR("Failed to attach scheduler");

From f1c1dd9cc1b610e44d16210f5b9bd5e697c7cf74 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Mar 2026 09:43:22 -1000
Subject: [PATCH 073/134] sched_ext: Split task_should_reenq() into local and
 user variants

Split task_should_reenq() into local_task_should_reenq() and
user_task_should_reenq(). The local variant takes reenq_flags by pointer.

No functional change. This prepares for SCX_ENQ_IMMED which will add
IMMED-specific logic to the local variant.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index e7ab3647e35f..774c012831e6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3725,13 +3725,10 @@ static void process_ddsp_deferred_locals(struct rq *rq)
 	}
 }
 
-static bool task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
+static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
 {
 	*reason = SCX_TASK_REENQ_KFUNC;
-
-	if (reenq_flags & SCX_REENQ_ANY)
-		return true;
-	return false;
+	return *reenq_flags & SCX_REENQ_ANY;
 }
 
 static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
@@ -3769,7 +3766,7 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
 		if (!scx_is_descendant(task_sch, sch))
 			continue;
 
-		if (!task_should_reenq(p, reenq_flags, &reason))
+		if (!local_task_should_reenq(p, &reenq_flags, &reason))
 			continue;
 
 		dispatch_dequeue(rq, p);
@@ -3826,6 +3823,12 @@ static void process_deferred_reenq_locals(struct rq *rq)
 	}
 }
 
+static bool user_task_should_reenq(struct task_struct *p, u64 reenq_flags, u32 *reason)
+{
+	*reason = SCX_TASK_REENQ_KFUNC;
+	return reenq_flags & SCX_REENQ_ANY;
+}
+
 static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flags)
 {
 	struct rq *locked_rq = rq;
@@ -3846,7 +3849,7 @@ static void reenq_user(struct rq *rq, struct scx_dispatch_q *dsq, u64 reenq_flag
 		if (!p)
 			break;
 
-		if (!task_should_reenq(p, reenq_flags, &reason))
+		if (!user_task_should_reenq(p, reenq_flags, &reason))
 			continue;
 
 		task_rq = task_rq(p);

From b5b38761b45a6c7d91760d212fda8b46df8c5362 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Mar 2026 09:43:22 -1000
Subject: [PATCH 074/134] sched_ext: Add scx_vet_enq_flags() and plumb dsq_id
 into preamble

Add scx_vet_enq_flags() stub and call it from scx_dsq_insert_preamble() and
scx_dsq_move(). Pass dsq_id into preamble so the vetting function can
validate flag and DSQ combinations.

No functional change. This prepares for SCX_ENQ_IMMED which will populate the
vetting function.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 774c012831e6..2f59265b9b57 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -7530,8 +7530,13 @@ void __init init_sched_ext_class(void)
 /********************************************************************************
  * Helpers that can be called from the BPF scheduler.
  */
+static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 enq_flags)
+{
+	return true;
+}
+
 static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
-				    u64 enq_flags)
+				    u64 dsq_id, u64 enq_flags)
 {
 	if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
 		return false;
@@ -7554,6 +7559,9 @@ static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p
 		return false;
 	}
 
+	if (!scx_vet_enq_flags(sch, dsq_id, enq_flags))
+		return false;
+
 	return true;
 }
 
@@ -7635,7 +7643,7 @@ __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
 	if (unlikely(!sch))
 		return false;
 
-	if (!scx_dsq_insert_preamble(sch, p, enq_flags))
+	if (!scx_dsq_insert_preamble(sch, p, dsq_id, enq_flags))
 		return false;
 
 	if (slice)
@@ -7661,7 +7669,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id,
 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
 				 u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags)
 {
-	if (!scx_dsq_insert_preamble(sch, p, enq_flags))
+	if (!scx_dsq_insert_preamble(sch, p, dsq_id, enq_flags))
 		return false;
 
 	if (slice)
@@ -7788,6 +7796,9 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	    !scx_kf_allowed(sch, SCX_KF_DISPATCH))
 		return false;
 
+	if (!scx_vet_enq_flags(sch, dsq_id, enq_flags))
+		return false;
+
 	/*
 	 * If the BPF scheduler keeps calling this function repeatedly, it can
 	 * cause similar live-lock conditions as consume_dispatch_q().

From 98d709cba3193f0bec54da4cd76ef499ea2f1ef7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Mar 2026 09:43:22 -1000
Subject: [PATCH 075/134] sched_ext: Implement SCX_ENQ_IMMED

Add SCX_ENQ_IMMED enqueue flag for local DSQ insertions. Once a task is
dispatched with IMMED, it either gets on the CPU immediately and stays on it,
or gets reenqueued back to the BPF scheduler. It will never linger on a local
DSQ behind other tasks or on a CPU taken by a higher-priority class.

rq_is_open() uses rq->next_class to determine whether the rq is available,
and wakeup_preempt_scx() triggers reenqueue when a higher-priority class task
arrives. These capture all higher class preemptions. Combined with reenqueue
points in the dispatch path, all cases where an IMMED task would not execute
immediately are covered.

SCX_TASK_IMMED persists in p->scx.flags until the next fresh enqueue, so the
guarantee survives SAVE/RESTORE cycles. If preempted while running,
put_prev_task_scx() reenqueues through ops.enqueue() with
SCX_TASK_REENQ_PREEMPTED instead of silently placing the task back on the
local DSQ.

This enables tighter scheduling latency control by preventing tasks from
piling up on local DSQs. It also enables opportunistic CPU sharing across
sub-schedulers - without this, a sub-scheduler can stuff the local DSQ of a
shared CPU, making it difficult for others to use.

v2: - Rewrite is_curr_done() as rq_is_open() using rq->next_class and
      implement wakeup_preempt_scx() to achieve complete coverage of all
      cases where IMMED tasks could get stranded.
    - Track IMMED persistently in p->scx.flags and reenqueue
      preempted-while-running tasks through ops.enqueue().
    - Bound deferred reenq cycles (SCX_REENQ_LOCAL_MAX_REPEAT).
    - Misc renames, documentation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/ext.h                |   5 +
 kernel/sched/ext.c                       | 271 +++++++++++++++++++++--
 kernel/sched/ext_internal.h              |  47 ++++
 kernel/sched/sched.h                     |   2 +
 tools/sched_ext/include/scx/compat.bpf.h |   5 +
 5 files changed, 311 insertions(+), 19 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 60a4f65d0174..602dc83cab36 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -100,6 +100,7 @@ enum scx_ent_flags {
 	SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
 	SCX_TASK_DEQD_FOR_SLEEP	= 1 << 3, /* last dequeue was for SLEEP */
 	SCX_TASK_SUB_INIT	= 1 << 4, /* task being initialized for a sub sched */
+	SCX_TASK_IMMED		= 1 << 5, /* task is on local DSQ with %SCX_ENQ_IMMED */
 
 	/*
 	 * Bits 8 and 9 are used to carry task state:
@@ -125,6 +126,8 @@ enum scx_ent_flags {
 	 *
 	 * NONE		not being reenqueued
 	 * KFUNC	reenqueued by scx_bpf_dsq_reenq() and friends
+	 * IMMED	reenqueued due to failed ENQ_IMMED
+	 * PREEMPTED	preempted while running
 	 */
 	SCX_TASK_REENQ_REASON_SHIFT = 12,
 	SCX_TASK_REENQ_REASON_BITS = 2,
@@ -132,6 +135,8 @@ enum scx_ent_flags {
 
 	SCX_TASK_REENQ_NONE	= 0 << SCX_TASK_REENQ_REASON_SHIFT,
 	SCX_TASK_REENQ_KFUNC	= 1 << SCX_TASK_REENQ_REASON_SHIFT,
+	SCX_TASK_REENQ_IMMED	= 2 << SCX_TASK_REENQ_REASON_SHIFT,
+	SCX_TASK_REENQ_PREEMPTED = 3 << SCX_TASK_REENQ_REASON_SHIFT,
 
 	/* iteration cursor, not a task */
 	SCX_TASK_CURSOR		= 1 << 31,
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2f59265b9b57..c75c35b67a18 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -406,6 +406,62 @@ static bool bypass_dsp_enabled(struct scx_sched *sch)
 	return unlikely(atomic_read(&sch->bypass_dsp_enable_depth));
 }
 
+/**
+ * rq_is_open - Is the rq available for immediate execution of an SCX task?
+ * @rq: rq to test
+ * @enq_flags: optional %SCX_ENQ_* of the task being enqueued
+ *
+ * Returns %true if @rq is currently open for executing an SCX task. After a
+ * %false return, @rq is guaranteed to invoke SCX dispatch path at least once
+ * before going to idle and not inserting a task into @rq's local DSQ after a
+ * %false return doesn't cause @rq to stall.
+ */
+static bool rq_is_open(struct rq *rq, u64 enq_flags)
+{
+	lockdep_assert_rq_held(rq);
+
+	/*
+	 * A higher-priority class task is either running or in the process of
+	 * waking up on @rq.
+	 */
+	if (sched_class_above(rq->next_class, &ext_sched_class))
+		return false;
+
+	/*
+	 * @rq is either in transition to or in idle and there is no
+	 * higher-priority class task waking up on it.
+	 */
+	if (sched_class_above(&ext_sched_class, rq->next_class))
+		return true;
+
+	/*
+	 * @rq is either picking, in transition to, or running an SCX task.
+	 */
+
+	/*
+	 * If we're in the dispatch path holding rq lock, $curr may or may not
+	 * be ready depending on whether the on-going dispatch decides to extend
+	 * $curr's slice. We say yes here and resolve it at the end of dispatch.
+	 * See balance_one().
+	 */
+	if (rq->scx.flags & SCX_RQ_IN_BALANCE)
+		return true;
+
+	/*
+	 * %SCX_ENQ_PREEMPT clears $curr's slice if on SCX and kicks dispatch,
+	 * so allow it to avoid spuriously triggering reenq on a combined
+	 * PREEMPT|IMMED insertion.
+	 */
+	if (enq_flags & SCX_ENQ_PREEMPT)
+		return true;
+
+	/*
+	 * @rq is either in transition to or running an SCX task and can't go
+	 * idle without another SCX dispatch cycle.
+	 */
+	return false;
+}
+
 /*
  * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
  * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
@@ -1220,6 +1276,16 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
 	}
 }
 
+static void schedule_reenq_local(struct rq *rq, u64 reenq_flags)
+{
+	struct scx_sched *root = rcu_dereference_sched(scx_root);
+
+	if (WARN_ON_ONCE(!root))
+		return;
+
+	schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags);
+}
+
 /**
  * touch_core_sched - Update timestamp used for core-sched task ordering
  * @rq: rq to read clock from, must be locked
@@ -1296,10 +1362,58 @@ static bool scx_dsq_priq_less(struct rb_node *node_a,
 	return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime);
 }
 
-static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
+static void dsq_inc_nr(struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags)
 {
 	/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
-	WRITE_ONCE(dsq->nr, dsq->nr + delta);
+	WRITE_ONCE(dsq->nr, dsq->nr + 1);
+
+	/*
+	 * Once @p reaches a local DSQ, it can only leave it by being dispatched
+	 * to the CPU or dequeued. In both cases, the only way @p can go back to
+	 * the BPF sched is through enqueueing. If being inserted into a local
+	 * DSQ with IMMED, persist the state until the next enqueueing event in
+	 * do_enqueue_task() so that we can maintain IMMED protection through
+	 * e.g. SAVE/RESTORE cycles and slice extensions.
+	 */
+	if (enq_flags & SCX_ENQ_IMMED) {
+		if (unlikely(dsq->id != SCX_DSQ_LOCAL)) {
+			WARN_ON_ONCE(!(enq_flags & SCX_ENQ_GDSQ_FALLBACK));
+			return;
+		}
+		p->scx.flags |= SCX_TASK_IMMED;
+	}
+
+	if (p->scx.flags & SCX_TASK_IMMED) {
+		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+
+		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL))
+			return;
+
+		rq->scx.nr_immed++;
+
+		/*
+		 * If @rq already had other tasks or the current task is not
+		 * done yet, @p can't go on the CPU immediately. Re-enqueue.
+		 */
+		if (unlikely(dsq->nr > 1 || !rq_is_open(rq, enq_flags)))
+			schedule_reenq_local(rq, 0);
+	}
+}
+
+static void dsq_dec_nr(struct scx_dispatch_q *dsq, struct task_struct *p)
+{
+	/* see dsq_inc_nr() */
+	WRITE_ONCE(dsq->nr, dsq->nr - 1);
+
+	if (p->scx.flags & SCX_TASK_IMMED) {
+		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+
+		if (WARN_ON_ONCE(dsq->id != SCX_DSQ_LOCAL) ||
+		    WARN_ON_ONCE(rq->scx.nr_immed <= 0))
+			return;
+
+		rq->scx.nr_immed--;
+	}
 }
 
 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
@@ -1458,7 +1572,7 @@ static void dispatch_enqueue(struct scx_sched *sch, struct rq *rq,
 	WRITE_ONCE(dsq->seq, dsq->seq + 1);
 	p->scx.dsq_seq = dsq->seq;
 
-	dsq_mod_nr(dsq, 1);
+	dsq_inc_nr(dsq, p, enq_flags);
 	p->scx.dsq = dsq;
 
 	/*
@@ -1512,7 +1626,7 @@ static void task_unlink_from_dsq(struct task_struct *p,
 	}
 
 	list_del_init(&p->scx.dsq_list.node);
-	dsq_mod_nr(dsq, -1);
+	dsq_dec_nr(dsq, p);
 
 	if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
 		struct task_struct *first_task;
@@ -1723,10 +1837,18 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 
 	WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
 
-	/* rq migration */
+	/* internal movements - rq migration / RESTORE */
 	if (sticky_cpu == cpu_of(rq))
 		goto local_norefill;
 
+	/*
+	 * Clear persistent TASK_IMMED for fresh enqueues, see dsq_inc_nr().
+	 * Note that exiting and migration-disabled tasks that skip
+	 * ops.enqueue() below will lose IMMED protection unless
+	 * %SCX_OPS_ENQ_EXITING / %SCX_OPS_ENQ_MIGRATION_DISABLED are set.
+	 */
+	p->scx.flags &= ~SCX_TASK_IMMED;
+
 	/*
 	 * If !scx_rq_online(), we already told the BPF scheduler that the CPU
 	 * is offline and are just running the hotplug path. Don't bother the
@@ -2032,6 +2154,30 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
 		return false;
 }
 
+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+	/*
+	 * Preemption between SCX tasks is implemented by resetting the victim
+	 * task's slice to 0 and triggering reschedule on the target CPU.
+	 * Nothing to do.
+	 */
+	if (p->sched_class == &ext_sched_class)
+		return;
+
+	/*
+	 * Getting preempted by a higher-priority class. Reenqueue IMMED tasks.
+	 * This captures all preemption cases including:
+	 *
+	 * - A SCX task is currently running.
+	 *
+	 * - @rq is waking from idle due to a SCX task waking to it.
+	 *
+	 * - A higher-priority wakes up while SCX dispatch is in progress.
+	 */
+	if (rq->scx.nr_immed)
+		schedule_reenq_local(rq, 0);
+}
+
 static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
 					 struct scx_dispatch_q *src_dsq,
 					 struct rq *dst_rq)
@@ -2049,7 +2195,7 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
 	else
 		list_add_tail(&p->scx.dsq_list.node, &dst_dsq->list);
 
-	dsq_mod_nr(dst_dsq, 1);
+	dsq_inc_nr(dst_dsq, p, enq_flags);
 	p->scx.dsq = dst_dsq;
 
 	local_dsq_post_enq(dst_dsq, p, enq_flags);
@@ -2257,6 +2403,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
 		    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
 			dst_dsq = find_global_dsq(sch, task_cpu(p));
 			dst_rq = src_rq;
+			enq_flags |= SCX_ENQ_GDSQ_FALLBACK;
 		}
 	} else {
 		/* no need to migrate if destination is a non-local DSQ */
@@ -2385,7 +2532,7 @@ static void dispatch_to_local_dsq(struct scx_sched *sch, struct rq *rq,
 	if (src_rq != dst_rq &&
 	    unlikely(!task_can_run_on_remote_rq(sch, p, dst_rq, true))) {
 		dispatch_enqueue(sch, rq, find_global_dsq(sch, task_cpu(p)), p,
-				 enq_flags | SCX_ENQ_CLEAR_OPSS);
+				 enq_flags | SCX_ENQ_CLEAR_OPSS | SCX_ENQ_GDSQ_FALLBACK);
 		return;
 	}
 
@@ -2738,6 +2885,19 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 	return false;
 
 has_tasks:
+	/*
+	 * @rq may have extra IMMED tasks without reenq scheduled:
+	 *
+	 * - rq_is_open() can't reliably tell when and how slice is going to be
+	 *   modified for $curr and allows IMMED tasks to be queued while
+	 *   dispatch is in progress.
+	 *
+	 * - A non-IMMED HEAD task can get queued in front of an IMMED task
+	 *   between the IMMED queueing and the subsequent scheduling event.
+	 */
+	if (unlikely(rq->scx.local_dsq.nr > 1 && rq->scx.nr_immed))
+		schedule_reenq_local(rq, 0);
+
 	rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
 	return true;
 }
@@ -2859,11 +3019,17 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 		 * If @p has slice left and is being put, @p is getting
 		 * preempted by a higher priority scheduler class or core-sched
 		 * forcing a different task. Leave it at the head of the local
-		 * DSQ.
+		 * DSQ unless it was an IMMED task. IMMED tasks should not
+		 * linger on a busy CPU, reenqueue them to the BPF scheduler.
 		 */
 		if (p->scx.slice && !scx_bypassing(sch, cpu_of(rq))) {
-			dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p,
-					 SCX_ENQ_HEAD);
+			if (p->scx.flags & SCX_TASK_IMMED) {
+				p->scx.flags |= SCX_TASK_REENQ_PREEMPTED;
+				do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
+				p->scx.flags &= ~SCX_TASK_REENQ_REASON_MASK;
+			} else {
+				dispatch_enqueue(sch, rq, &rq->scx.local_dsq, p, SCX_ENQ_HEAD);
+			}
 			goto switch_class;
 		}
 
@@ -3682,8 +3848,6 @@ static void switched_from_scx(struct rq *rq, struct task_struct *p)
 	scx_disable_task(scx_task_sched(p), p);
 }
 
-static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
-
 static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
 
 int scx_check_setscheduler(struct task_struct *p, int policy)
@@ -3725,9 +3889,45 @@ static void process_ddsp_deferred_locals(struct rq *rq)
 	}
 }
 
+/*
+ * Determine whether @p should be reenqueued from a local DSQ.
+ *
+ * @reenq_flags is mutable and accumulates state across the DSQ walk:
+ *
+ * - %SCX_REENQ_TSR_NOT_FIRST: Set after the first task is visited. "First"
+ *   tracks position in the DSQ list, not among IMMED tasks. A non-IMMED task at
+ *   the head consumes the first slot.
+ *
+ * - %SCX_REENQ_TSR_RQ_OPEN: Set by reenq_local() before the walk if
+ *   rq_is_open() is true.
+ *
+ * An IMMED task is kept (returns %false) only if it's the first task in the DSQ
+ * AND the current task is done — i.e. it will execute immediately. All other
+ * IMMED tasks are reenqueued. This means if a non-IMMED task sits at the head,
+ * every IMMED task behind it gets reenqueued.
+ *
+ * Reenqueued tasks go through ops.enqueue() with %SCX_ENQ_REENQ |
+ * %SCX_TASK_REENQ_IMMED. If the BPF scheduler dispatches back to the same local
+ * DSQ with %SCX_ENQ_IMMED while the CPU is still unavailable, this triggers
+ * another reenq cycle. Repetitions are bounded by %SCX_REENQ_LOCAL_MAX_REPEAT
+ * in process_deferred_reenq_locals().
+ */
 static bool local_task_should_reenq(struct task_struct *p, u64 *reenq_flags, u32 *reason)
 {
+	bool first;
+
+	first = !(*reenq_flags & SCX_REENQ_TSR_NOT_FIRST);
+	*reenq_flags |= SCX_REENQ_TSR_NOT_FIRST;
+
 	*reason = SCX_TASK_REENQ_KFUNC;
+
+	if ((p->scx.flags & SCX_TASK_IMMED) &&
+	    (!first || !(*reenq_flags & SCX_REENQ_TSR_RQ_OPEN))) {
+		__scx_add_event(scx_task_sched(p), SCX_EV_REENQ_IMMED, 1);
+		*reason = SCX_TASK_REENQ_IMMED;
+		return true;
+	}
+
 	return *reenq_flags & SCX_REENQ_ANY;
 }
 
@@ -3739,6 +3939,11 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
 
 	lockdep_assert_rq_held(rq);
 
+	if (WARN_ON_ONCE(reenq_flags & __SCX_REENQ_TSR_MASK))
+		reenq_flags &= ~__SCX_REENQ_TSR_MASK;
+	if (rq_is_open(rq, 0))
+		reenq_flags |= SCX_REENQ_TSR_RQ_OPEN;
+
 	/*
 	 * The BPF scheduler may choose to dispatch tasks back to
 	 * @rq->scx.local_dsq. Move all candidate tasks off to a private list
@@ -3792,11 +3997,14 @@ static u32 reenq_local(struct scx_sched *sch, struct rq *rq, u64 reenq_flags)
 
 static void process_deferred_reenq_locals(struct rq *rq)
 {
+	u64 seq = ++rq->scx.deferred_reenq_locals_seq;
+
 	lockdep_assert_rq_held(rq);
 
 	while (true) {
 		struct scx_sched *sch;
 		u64 reenq_flags;
+		bool skip = false;
 
 		scoped_guard (raw_spinlock, &rq->scx.deferred_reenq_lock) {
 			struct scx_deferred_reenq_local *drl =
@@ -3811,15 +4019,31 @@ static void process_deferred_reenq_locals(struct rq *rq)
 			sch_pcpu = container_of(drl, struct scx_sched_pcpu,
 						deferred_reenq_local);
 			sch = sch_pcpu->sch;
+
 			reenq_flags = drl->flags;
 			WRITE_ONCE(drl->flags, 0);
 			list_del_init(&drl->node);
+
+			if (likely(drl->seq != seq)) {
+				drl->seq = seq;
+				drl->cnt = 0;
+			} else {
+				if (unlikely(++drl->cnt > SCX_REENQ_LOCAL_MAX_REPEAT)) {
+					scx_error(sch, "SCX_ENQ_REENQ on SCX_DSQ_LOCAL repeated %u times",
+						  drl->cnt);
+					skip = true;
+				}
+
+				__scx_add_event(sch, SCX_EV_REENQ_LOCAL_REPEAT, 1);
+			}
 		}
 
-		/* see schedule_dsq_reenq() */
-		smp_mb();
+		if (!skip) {
+			/* see schedule_dsq_reenq() */
+			smp_mb();
 
-		reenq_local(sch, rq, reenq_flags);
+			reenq_local(sch, rq, reenq_flags);
+		}
 	}
 }
 
@@ -4208,10 +4432,6 @@ static void scx_cgroup_unlock(void) {}
 /*
  * Omitted operations:
  *
- * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
- *   isn't tied to the CPU at that point. Preemption is implemented by resetting
- *   the victim task's slice to 0 and triggering reschedule on the target CPU.
- *
  * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
  *
  * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
@@ -4580,6 +4800,8 @@ static ssize_t scx_attr_events_show(struct kobject *kobj,
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_IMMED);
+	at += scx_attr_event_show(buf, at, &events, SCX_EV_REENQ_LOCAL_REPEAT);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_REFILL_SLICE_DFL);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
 	at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
@@ -6019,6 +6241,8 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
 	scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
 	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
 	scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+	scx_dump_event(s, &events, SCX_EV_REENQ_IMMED);
+	scx_dump_event(s, &events, SCX_EV_REENQ_LOCAL_REPEAT);
 	scx_dump_event(s, &events, SCX_EV_REFILL_SLICE_DFL);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
 	scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
@@ -7532,6 +7756,13 @@ void __init init_sched_ext_class(void)
  */
 static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 enq_flags)
 {
+	if ((enq_flags & SCX_ENQ_IMMED) &&
+	    unlikely(dsq_id != SCX_DSQ_LOCAL &&
+		     (dsq_id & SCX_DSQ_LOCAL_ON) != SCX_DSQ_LOCAL_ON)) {
+		scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id);
+		return false;
+	}
+
 	return true;
 }
 
@@ -9101,6 +9332,8 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
 		scx_agg_event(events, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
 		scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
 		scx_agg_event(events, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+		scx_agg_event(events, e_cpu, SCX_EV_REENQ_IMMED);
+		scx_agg_event(events, e_cpu, SCX_EV_REENQ_LOCAL_REPEAT);
 		scx_agg_event(events, e_cpu, SCX_EV_REFILL_SLICE_DFL);
 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DURATION);
 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index c78dadaadab8..2ef855f7c861 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -31,6 +31,8 @@ enum scx_consts {
 	SCX_BYPASS_LB_MIN_DELTA_DIV	= 4,
 	SCX_BYPASS_LB_BATCH		= 256,
 
+	SCX_REENQ_LOCAL_MAX_REPEAT	= 256,
+
 	SCX_SUB_MAX_DEPTH		= 4,
 };
 
@@ -887,6 +889,24 @@ struct scx_event_stats {
 	 */
 	s64		SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
 
+	/*
+	 * The number of times a task, enqueued on a local DSQ with
+	 * SCX_ENQ_IMMED, was re-enqueued because the CPU was not available for
+	 * immediate execution.
+	 */
+	s64		SCX_EV_REENQ_IMMED;
+
+	/*
+	 * The number of times a reenq of local DSQ caused another reenq of
+	 * local DSQ. This can happen when %SCX_ENQ_IMMED races against a higher
+	 * priority class task even if the BPF scheduler always satisfies the
+	 * prerequisites for %SCX_ENQ_IMMED at the time of enqueue. However,
+	 * that scenario is very unlikely and this count going up regularly
+	 * indicates that the BPF scheduler is handling %SCX_ENQ_REENQ
+	 * incorrectly causing recursive reenqueues.
+	 */
+	s64		SCX_EV_REENQ_LOCAL_REPEAT;
+
 	/*
 	 * Total number of times a task's time slice was refilled with the
 	 * default value (SCX_SLICE_DFL).
@@ -951,6 +971,8 @@ struct scx_dsp_ctx {
 struct scx_deferred_reenq_local {
 	struct list_head	node;
 	u64			flags;
+	u64			seq;
+	u32			cnt;
 };
 
 struct scx_sched_pcpu {
@@ -1074,6 +1096,24 @@ enum scx_enq_flags {
 	 */
 	SCX_ENQ_PREEMPT		= 1LLU << 32,
 
+	/*
+	 * Only allowed on local DSQs. Guarantees that the task either gets
+	 * on the CPU immediately and stays on it, or gets reenqueued back
+	 * to the BPF scheduler. It will never linger on a local DSQ or be
+	 * silently put back after preemption.
+	 *
+	 * The protection persists until the next fresh enqueue - it
+	 * survives SAVE/RESTORE cycles, slice extensions and preemption.
+	 * If the task can't stay on the CPU for any reason, it gets
+	 * reenqueued back to the BPF scheduler.
+	 *
+	 * Exiting and migration-disabled tasks bypass ops.enqueue() and
+	 * are placed directly on a local DSQ without IMMED protection
+	 * unless %SCX_OPS_ENQ_EXITING and %SCX_OPS_ENQ_MIGRATION_DISABLED
+	 * are set respectively.
+	 */
+	SCX_ENQ_IMMED		= 1LLU << 33,
+
 	/*
 	 * The task being enqueued was previously enqueued on a DSQ, but was
 	 * removed and is being re-enqueued. See SCX_TASK_REENQ_* flags to find
@@ -1098,6 +1138,7 @@ enum scx_enq_flags {
 	SCX_ENQ_CLEAR_OPSS	= 1LLU << 56,
 	SCX_ENQ_DSQ_PRIQ	= 1LLU << 57,
 	SCX_ENQ_NESTED		= 1LLU << 58,
+	SCX_ENQ_GDSQ_FALLBACK	= 1LLU << 59,	/* fell back to global DSQ */
 };
 
 enum scx_deq_flags {
@@ -1127,6 +1168,12 @@ enum scx_reenq_flags {
 	__SCX_REENQ_FILTER_MASK	= 0xffffLLU,
 
 	__SCX_REENQ_USER_MASK	= SCX_REENQ_ANY,
+
+	/* bits 32-35 used by task_should_reenq() */
+	SCX_REENQ_TSR_RQ_OPEN	= 1LLU << 32,
+	SCX_REENQ_TSR_NOT_FIRST	= 1LLU << 33,
+
+	__SCX_REENQ_TSR_MASK	= 0xfLLU << 32,
 };
 
 enum scx_pick_idle_cpu_flags {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 60627119d0ab..5b93f6190d31 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -800,6 +800,7 @@ struct scx_rq {
 	u32			cpuperf_target;		/* [0, SCHED_CAPACITY_SCALE] */
 	bool			cpu_released;
 	u32			flags;
+	u32			nr_immed;		/* ENQ_IMMED tasks on local_dsq */
 	u64			clock;			/* current per-rq clock -- see scx_bpf_now() */
 	cpumask_var_t		cpus_to_kick;
 	cpumask_var_t		cpus_to_kick_if_idle;
@@ -810,6 +811,7 @@ struct scx_rq {
 	struct task_struct	*sub_dispatch_prev;
 
 	raw_spinlock_t		deferred_reenq_lock;
+	u64			deferred_reenq_locals_seq;
 	struct list_head	deferred_reenq_locals;	/* scheds requesting reenq of local DSQ */
 	struct list_head	deferred_reenq_users;	/* user DSQs requesting reenq */
 	struct balance_callback	deferred_bal_cb;
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 704728864d83..cba37432eec0 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -404,6 +404,11 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags)
 		scx_bpf_error("kernel too old to reenqueue foreign local or user DSQs");
 }
 
+/*
+ * v7.1: %SCX_ENQ_IMMED.
+ */
+#define SCX_ENQ_IMMED	__COMPAT_ENUM_OR_ZERO(enum scx_enq_flags, SCX_ENQ_IMMED)
+
 /*
  * Define sched_ext_ops. This may be expanded to define multiple variants for
  * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().

From da32a2986e5fb3c70562ad610918834696e87322 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Mar 2026 09:43:23 -1000
Subject: [PATCH 076/134] sched_ext: Plumb enq_flags through the consume path

Add enq_flags parameter to consume_dispatch_q() and consume_remote_task(),
passing it through to move_{local,remote}_task_to_local_dsq(). All callers
pass 0.

No functional change. This prepares for SCX_ENQ_IMMED support on the consume
path.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index c75c35b67a18..428b01cf02b0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2355,13 +2355,14 @@ static bool unlink_dsq_and_lock_src_rq(struct task_struct *p,
 		!WARN_ON_ONCE(src_rq != task_rq(p));
 }
 
-static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
+static bool consume_remote_task(struct rq *this_rq,
+				struct task_struct *p, u64 enq_flags,
 				struct scx_dispatch_q *dsq, struct rq *src_rq)
 {
 	raw_spin_rq_unlock(this_rq);
 
 	if (unlink_dsq_and_lock_src_rq(p, dsq, src_rq)) {
-		move_remote_task_to_local_dsq(p, 0, src_rq, this_rq);
+		move_remote_task_to_local_dsq(p, enq_flags, src_rq, this_rq);
 		return true;
 	} else {
 		raw_spin_rq_unlock(src_rq);
@@ -2441,7 +2442,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
 }
 
 static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
-			       struct scx_dispatch_q *dsq)
+			       struct scx_dispatch_q *dsq, u64 enq_flags)
 {
 	struct task_struct *p;
 retry:
@@ -2471,13 +2472,13 @@ retry:
 
 		if (rq == task_rq) {
 			task_unlink_from_dsq(p, dsq);
-			move_local_task_to_local_dsq(p, 0, dsq, rq);
+			move_local_task_to_local_dsq(p, enq_flags, dsq, rq);
 			raw_spin_unlock(&dsq->lock);
 			return true;
 		}
 
 		if (task_can_run_on_remote_rq(sch, p, rq, false)) {
-			if (likely(consume_remote_task(rq, p, dsq, task_rq)))
+			if (likely(consume_remote_task(rq, p, enq_flags, dsq, task_rq)))
 				return true;
 			goto retry;
 		}
@@ -2491,7 +2492,7 @@ static bool consume_global_dsq(struct scx_sched *sch, struct rq *rq)
 {
 	int node = cpu_to_node(cpu_of(rq));
 
-	return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq);
+	return consume_dispatch_q(sch, rq, &sch->pnode[node]->global_dsq, 0);
 }
 
 /**
@@ -2727,7 +2728,7 @@ scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 	if (bypass_dsp_enabled(sch)) {
 		/* if @sch is bypassing, only the bypass DSQs are active */
 		if (scx_bypassing(sch, cpu))
-			return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
+			return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);
 
 #ifdef CONFIG_EXT_SUB_SCHED
 		/*
@@ -2745,7 +2746,7 @@ scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 		struct scx_sched_pcpu *pcpu = per_cpu_ptr(sch->pcpu, cpu);
 
 		if (!(pcpu->bypass_host_seq++ % SCX_BYPASS_HOST_NTH) &&
-		    consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu))) {
+		    consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0)) {
 			__scx_add_event(sch, SCX_EV_SUB_BYPASS_DISPATCH, 1);
 			return true;
 		}
@@ -2817,7 +2818,7 @@ scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 	 * scheduler's ops.dispatch() doesn't yield any tasks.
 	 */
 	if (bypass_dsp_enabled(sch))
-		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu));
+		return consume_dispatch_q(sch, rq, bypass_dsq(sch, cpu), 0);
 
 	return false;
 }
@@ -8195,7 +8196,7 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux
 		return false;
 	}
 
-	if (consume_dispatch_q(sch, dspc->rq, dsq)) {
+	if (consume_dispatch_q(sch, dspc->rq, dsq, 0)) {
 		/*
 		 * A successfully consumed task can be dequeued before it starts
 		 * running while the CPU is trying to migrate other dispatched

From 860683763ebf4662cb72a312279334e02718308f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Mar 2026 09:43:23 -1000
Subject: [PATCH 077/134] sched_ext: Add enq_flags to
 scx_bpf_dsq_move_to_local()

scx_bpf_dsq_move_to_local() moves a task from a non-local DSQ to the
current CPU's local DSQ. This is an indirect way of dispatching to a local
DSQ and should support enq_flags like direct dispatches do - e.g.
SCX_ENQ_HEAD for head-of-queue insertion and SCX_ENQ_IMMED for immediate
execution guarantees.

Add scx_bpf_dsq_move_to_local___v2() with an enq_flags parameter. The
original becomes a v1 compat wrapper passing 0. The compat macro is updated
to a three-level chain: v2 (7.1+) -> v1 (current) -> scx_bpf_consume
(pre-rename). All in-tree BPF schedulers are updated to pass 0.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c                       | 21 ++++++++++++++++++---
 tools/sched_ext/include/scx/compat.bpf.h | 15 ++++++++++-----
 tools/sched_ext/scx_central.bpf.c        |  4 ++--
 tools/sched_ext/scx_cpu0.bpf.c           |  2 +-
 tools/sched_ext/scx_flatcg.bpf.c         |  6 +++---
 tools/sched_ext/scx_qmap.bpf.c           |  4 ++--
 tools/sched_ext/scx_sdt.bpf.c            |  2 +-
 tools/sched_ext/scx_simple.bpf.c         |  2 +-
 8 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 428b01cf02b0..1b014bdee824 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8160,9 +8160,11 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
  * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
  * @dsq_id: DSQ to move task from
  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
+ * @enq_flags: %SCX_ENQ_*
  *
  * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's
- * local DSQ for execution. Can only be called from ops.dispatch().
+ * local DSQ for execution with @enq_flags applied. Can only be called from
+ * ops.dispatch().
  *
  * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
  * before trying to move from the specified DSQ. It may also grab rq locks and
@@ -8171,7 +8173,8 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
  * Returns %true if a task has been moved, %false if there isn't any task to
  * move.
  */
-__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux)
+__bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags,
+						const struct bpf_prog_aux *aux)
 {
 	struct scx_dispatch_q *dsq;
 	struct scx_sched *sch;
@@ -8186,6 +8189,9 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux
 	if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
 		return false;
 
+	if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, enq_flags))
+		return false;
+
 	dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
 
 	flush_dispatch_buf(sch, dspc->rq);
@@ -8196,7 +8202,7 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux
 		return false;
 	}
 
-	if (consume_dispatch_q(sch, dspc->rq, dsq, 0)) {
+	if (consume_dispatch_q(sch, dspc->rq, dsq, enq_flags)) {
 		/*
 		 * A successfully consumed task can be dequeued before it starts
 		 * running while the CPU is trying to migrate other dispatched
@@ -8210,6 +8216,14 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux
 	}
 }
 
+/*
+ * COMPAT: ___v2 was introduced in v7.1. Remove this and ___v2 tag in the future.
+ */
+__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id, const struct bpf_prog_aux *aux)
+{
+	return scx_bpf_dsq_move_to_local___v2(dsq_id, 0, aux);
+}
+
 /**
  * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs
  * @it__iter: DSQ iterator in progress
@@ -8353,6 +8367,7 @@ BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS)
+BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index cba37432eec0..83b3425e63b2 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -28,8 +28,11 @@ struct cgroup *scx_bpf_task_cgroup___new(struct task_struct *p) __ksym __weak;
  *
  * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by
  * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()").
+ *
+ * v7.1: scx_bpf_dsq_move_to_local___v2() to add @enq_flags.
  */
-bool scx_bpf_dsq_move_to_local___new(u64 dsq_id) __ksym __weak;
+bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dsq_move_to_local___v1(u64 dsq_id) __ksym __weak;
 void scx_bpf_dsq_move_set_slice___new(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak;
 void scx_bpf_dsq_move_set_vtime___new(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak;
 bool scx_bpf_dsq_move___new(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
@@ -41,10 +44,12 @@ void scx_bpf_dispatch_from_dsq_set_vtime___old(struct bpf_iter_scx_dsq *it__iter
 bool scx_bpf_dispatch_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 bool scx_bpf_dispatch_vtime_from_dsq___old(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
 
-#define scx_bpf_dsq_move_to_local(dsq_id)					\
-	(bpf_ksym_exists(scx_bpf_dsq_move_to_local___new) ?			\
-	 scx_bpf_dsq_move_to_local___new((dsq_id)) :				\
-	 scx_bpf_consume___old((dsq_id)))
+#define scx_bpf_dsq_move_to_local(dsq_id, enq_flags)				\
+	(bpf_ksym_exists(scx_bpf_dsq_move_to_local___v2) ?			\
+	 scx_bpf_dsq_move_to_local___v2((dsq_id), (enq_flags)) :		\
+	 (bpf_ksym_exists(scx_bpf_dsq_move_to_local___v1) ?			\
+	  scx_bpf_dsq_move_to_local___v1((dsq_id)) :				\
+	  scx_bpf_consume___old((dsq_id))))
 
 #define scx_bpf_dsq_move_set_slice(it__iter, slice)				\
 	(bpf_ksym_exists(scx_bpf_dsq_move_set_slice___new) ?			\
diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
index 1c2376b75b5d..399e8d3f8bec 100644
--- a/tools/sched_ext/scx_central.bpf.c
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -214,13 +214,13 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev)
 		}
 
 		/* look for a task to run on the central CPU */
-		if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ_ID))
+		if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ_ID, 0))
 			return;
 		dispatch_to_cpu(central_cpu);
 	} else {
 		bool *gimme;
 
-		if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ_ID))
+		if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ_ID, 0))
 			return;
 
 		gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids);
diff --git a/tools/sched_ext/scx_cpu0.bpf.c b/tools/sched_ext/scx_cpu0.bpf.c
index 9b67ab11b04c..0b1a7ce879b0 100644
--- a/tools/sched_ext/scx_cpu0.bpf.c
+++ b/tools/sched_ext/scx_cpu0.bpf.c
@@ -66,7 +66,7 @@ void BPF_STRUCT_OPS(cpu0_enqueue, struct task_struct *p, u64 enq_flags)
 void BPF_STRUCT_OPS(cpu0_dispatch, s32 cpu, struct task_struct *prev)
 {
 	if (cpu == 0)
-		scx_bpf_dsq_move_to_local(DSQ_CPU0);
+		scx_bpf_dsq_move_to_local(DSQ_CPU0, 0);
 }
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(cpu0_init)
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index a8a9234bb41e..1351377f64d5 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -660,7 +660,7 @@ static bool try_pick_next_cgroup(u64 *cgidp)
 		goto out_free;
 	}
 
-	if (!scx_bpf_dsq_move_to_local(cgid)) {
+	if (!scx_bpf_dsq_move_to_local(cgid, 0)) {
 		bpf_cgroup_release(cgrp);
 		stat_inc(FCG_STAT_PNC_EMPTY);
 		goto out_stash;
@@ -740,7 +740,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
 		goto pick_next_cgroup;
 
 	if (time_before(now, cpuc->cur_at + cgrp_slice_ns)) {
-		if (scx_bpf_dsq_move_to_local(cpuc->cur_cgid)) {
+		if (scx_bpf_dsq_move_to_local(cpuc->cur_cgid, 0)) {
 			stat_inc(FCG_STAT_CNS_KEEP);
 			return;
 		}
@@ -780,7 +780,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
 pick_next_cgroup:
 	cpuc->cur_at = now;
 
-	if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ)) {
+	if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ, 0)) {
 		cpuc->cur_cgid = 0;
 		return;
 	}
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index a4a1b84fe359..6d34115cb8bd 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -395,7 +395,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 	if (dispatch_highpri(false))
 		return;
 
-	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
+	if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
 		return;
 
 	if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
@@ -460,7 +460,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 			if (!batch || !scx_bpf_dispatch_nr_slots()) {
 				if (dispatch_highpri(false))
 					return;
-				scx_bpf_dsq_move_to_local(SHARED_DSQ);
+				scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
 				return;
 			}
 			if (!cpuc->dsp_cnt)
diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c
index 31b09958e8d5..10248b71ef02 100644
--- a/tools/sched_ext/scx_sdt.bpf.c
+++ b/tools/sched_ext/scx_sdt.bpf.c
@@ -643,7 +643,7 @@ void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags)
 
 void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev)
 {
-	scx_bpf_dsq_move_to_local(SHARED_DSQ);
+	scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
 }
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p,
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
index b456bd7cae77..9ad6f0949987 100644
--- a/tools/sched_ext/scx_simple.bpf.c
+++ b/tools/sched_ext/scx_simple.bpf.c
@@ -89,7 +89,7 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
 
 void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
 {
-	scx_bpf_dsq_move_to_local(SHARED_DSQ);
+	scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
 }
 
 void BPF_STRUCT_OPS(simple_running, struct task_struct *p)

From 3229ac4a5ef5a838e82a784226432c92d3db90a8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Mar 2026 09:43:23 -1000
Subject: [PATCH 078/134] sched_ext: Add SCX_OPS_ALWAYS_ENQ_IMMED ops flag

SCX_ENQ_IMMED makes enqueue to local DSQs succeed only if the task can start
running immediately. Otherwise, the task is re-enqueued through ops.enqueue().
This provides tighter control but requires specifying the flag on every
insertion.

Add SCX_OPS_ALWAYS_ENQ_IMMED ops flag. When set, SCX_ENQ_IMMED is
automatically applied to all local DSQ enqueues including through
scx_bpf_dsq_move_to_local().

scx_qmap is updated with -I option to test the feature and -F option for
IMMED stress testing which forces every Nth enqueue to a busy local DSQ.

v2: - Cover scx_bpf_dsq_move_to_local() path (now has enq_flags via ___v2).
    - scx_qmap: Remove sched_switch and cpu_release handlers (superseded by
      kernel-side wakeup_preempt_scx()). Add -F for IMMED stress testing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c                   | 31 +++++++------
 kernel/sched/ext_internal.h          |  9 +++-
 tools/sched_ext/include/scx/compat.h |  1 +
 tools/sched_ext/scx_qmap.bpf.c       | 66 ++++++++++++----------------
 tools/sched_ext/scx_qmap.c           | 13 +++++-
 5 files changed, 65 insertions(+), 55 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 1b014bdee824..0ce205e3e999 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -7755,20 +7755,25 @@ void __init init_sched_ext_class(void)
 /********************************************************************************
  * Helpers that can be called from the BPF scheduler.
  */
-static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 enq_flags)
+static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags)
 {
-	if ((enq_flags & SCX_ENQ_IMMED) &&
-	    unlikely(dsq_id != SCX_DSQ_LOCAL &&
-		     (dsq_id & SCX_DSQ_LOCAL_ON) != SCX_DSQ_LOCAL_ON)) {
-		scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id);
-		return false;
+	bool is_local = dsq_id == SCX_DSQ_LOCAL ||
+		(dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON;
+
+	if (*enq_flags & SCX_ENQ_IMMED) {
+		if (unlikely(!is_local)) {
+			scx_error(sch, "SCX_ENQ_IMMED on a non-local DSQ 0x%llx", dsq_id);
+			return false;
+		}
+	} else if ((sch->ops.flags & SCX_OPS_ALWAYS_ENQ_IMMED) && is_local) {
+		*enq_flags |= SCX_ENQ_IMMED;
 	}
 
 	return true;
 }
 
 static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
-				    u64 dsq_id, u64 enq_flags)
+				    u64 dsq_id, u64 *enq_flags)
 {
 	if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
 		return false;
@@ -7780,8 +7785,8 @@ static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p
 		return false;
 	}
 
-	if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
-		scx_error(sch, "invalid enq_flags 0x%llx", enq_flags);
+	if (unlikely(*enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
+		scx_error(sch, "invalid enq_flags 0x%llx", *enq_flags);
 		return false;
 	}
 
@@ -7875,7 +7880,7 @@ __bpf_kfunc bool scx_bpf_dsq_insert___v2(struct task_struct *p, u64 dsq_id,
 	if (unlikely(!sch))
 		return false;
 
-	if (!scx_dsq_insert_preamble(sch, p, dsq_id, enq_flags))
+	if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags))
 		return false;
 
 	if (slice)
@@ -7901,7 +7906,7 @@ __bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id,
 static bool scx_dsq_insert_vtime(struct scx_sched *sch, struct task_struct *p,
 				 u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags)
 {
-	if (!scx_dsq_insert_preamble(sch, p, dsq_id, enq_flags))
+	if (!scx_dsq_insert_preamble(sch, p, dsq_id, &enq_flags))
 		return false;
 
 	if (slice)
@@ -8028,7 +8033,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	    !scx_kf_allowed(sch, SCX_KF_DISPATCH))
 		return false;
 
-	if (!scx_vet_enq_flags(sch, dsq_id, enq_flags))
+	if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags))
 		return false;
 
 	/*
@@ -8189,7 +8194,7 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags,
 	if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
 		return false;
 
-	if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, enq_flags))
+	if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags))
 		return false;
 
 	dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 2ef855f7c861..b4f36d8b9c1d 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -182,13 +182,20 @@ enum scx_ops_flags {
 	 */
 	SCX_OPS_BUILTIN_IDLE_PER_NODE	= 1LLU << 6,
 
+	/*
+	 * If set, %SCX_ENQ_IMMED is assumed to be set on all local DSQ
+	 * enqueues.
+	 */
+	SCX_OPS_ALWAYS_ENQ_IMMED	= 1LLU << 7,
+
 	SCX_OPS_ALL_FLAGS		= SCX_OPS_KEEP_BUILTIN_IDLE |
 					  SCX_OPS_ENQ_LAST |
 					  SCX_OPS_ENQ_EXITING |
 					  SCX_OPS_ENQ_MIGRATION_DISABLED |
 					  SCX_OPS_ALLOW_QUEUED_WAKEUP |
 					  SCX_OPS_SWITCH_PARTIAL |
-					  SCX_OPS_BUILTIN_IDLE_PER_NODE,
+					  SCX_OPS_BUILTIN_IDLE_PER_NODE |
+					  SCX_OPS_ALWAYS_ENQ_IMMED,
 
 	/* high 8 bits are internal, don't include in SCX_OPS_ALL_FLAGS */
 	__SCX_OPS_INTERNAL_MASK		= 0xffLLU << 56,
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 50297d4b9533..9e0c8f3161e8 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -116,6 +116,7 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
 #define SCX_OPS_ENQ_MIGRATION_DISABLED SCX_OPS_FLAG(SCX_OPS_ENQ_MIGRATION_DISABLED)
 #define SCX_OPS_ALLOW_QUEUED_WAKEUP SCX_OPS_FLAG(SCX_OPS_ALLOW_QUEUED_WAKEUP)
 #define SCX_OPS_BUILTIN_IDLE_PER_NODE SCX_OPS_FLAG(SCX_OPS_BUILTIN_IDLE_PER_NODE)
+#define SCX_OPS_ALWAYS_ENQ_IMMED SCX_OPS_FLAG(SCX_OPS_ALWAYS_ENQ_IMMED)
 
 #define SCX_PICK_IDLE_FLAG(name) __COMPAT_ENUM_OR_ZERO("scx_pick_idle_cpu_flags", #name)
 
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 6d34115cb8bd..f3587fb709c9 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -11,8 +11,6 @@
  *
  * - BPF-side queueing using PIDs.
  * - Sleepable per-task storage allocation using ops.prep_enable().
- * - Using ops.cpu_release() to handle a higher priority scheduling class taking
- *   the CPU away.
  * - Core-sched support.
  *
  * This scheduler is primarily for demonstration and testing of sched_ext
@@ -47,6 +45,8 @@ const volatile bool print_msgs;
 const volatile u64 sub_cgroup_id;
 const volatile s32 disallow_tgid;
 const volatile bool suppress_dump;
+const volatile bool always_enq_immed;
+const volatile u32 immed_stress_nth;
 
 u64 nr_highpri_queued;
 u32 test_error_cnt;
@@ -144,8 +144,10 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
 {
 	s32 cpu;
 
-	if (p->nr_cpus_allowed == 1 ||
-	    scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+	if (!always_enq_immed && p->nr_cpus_allowed == 1)
+		return prev_cpu;
+
+	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
 		return prev_cpu;
 
 	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
@@ -238,6 +240,22 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	 */
 	tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
 
+	/*
+	 * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
+	 * directly to prev_cpu's local DSQ even when busy to force dsq->nr > 1
+	 * and exercise the kernel IMMED reenqueue trigger paths.
+	 */
+	if (immed_stress_nth && !(enq_flags & SCX_ENQ_REENQ)) {
+		static u32 immed_stress_cnt;
+
+		if (!(++immed_stress_cnt % immed_stress_nth)) {
+			tctx->force_local = false;
+			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
+					   slice_ns, enq_flags);
+			return;
+		}
+	}
+
 	/*
 	 * If qmap_select_cpu() is telling us to or this is the last runnable
 	 * task on the CPU, enqueue locally.
@@ -558,40 +576,11 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
 	return task_qdist(a) > task_qdist(b);
 }
 
-SEC("tp_btf/sched_switch")
-int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
-	     struct task_struct *next, unsigned long prev_state)
-{
-	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
-		return 0;
-
-	/*
-	 * If @cpu is taken by a higher priority scheduling class, it is no
-	 * longer available for executing sched_ext tasks. As we don't want the
-	 * tasks in @cpu's local dsq to sit there until @cpu becomes available
-	 * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
-	 * handling in qmap_enqueue().
-	 */
-	switch (next->policy) {
-	case 1: /* SCHED_FIFO */
-	case 2: /* SCHED_RR */
-	case 6: /* SCHED_DEADLINE */
-		scx_bpf_reenqueue_local();
-
-		/* trigger re-enqueue on CPU0 just to exercise LOCAL_ON */
-		if (__COMPAT_has_generic_reenq())
-			scx_bpf_dsq_reenq(SCX_DSQ_LOCAL_ON | 0, 0);
-	}
-
-	return 0;
-}
-
-void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
-{
-	/* see qmap_sched_switch() to learn how to do this on newer kernels */
-	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
-		scx_bpf_reenqueue_local();
-}
+/*
+ * sched_switch tracepoint and cpu_release handlers are no longer needed.
+ * With SCX_OPS_ALWAYS_ENQ_IMMED, wakeup_preempt_scx() reenqueues IMMED
+ * tasks when a higher-priority scheduling class takes the CPU.
+ */
 
 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
 		   struct scx_init_task_args *args)
@@ -999,7 +988,6 @@ SCX_OPS_DEFINE(qmap_ops,
 	       .dispatch		= (void *)qmap_dispatch,
 	       .tick			= (void *)qmap_tick,
 	       .core_sched_before	= (void *)qmap_core_sched_before,
-	       .cpu_release		= (void *)qmap_cpu_release,
 	       .init_task		= (void *)qmap_init_task,
 	       .dump			= (void *)qmap_dump,
 	       .dump_cpu		= (void *)qmap_dump_cpu,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index 5916bbe0d77f..e7c89a2bc3d8 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -21,7 +21,7 @@ const char help_fmt[] =
 "See the top-level comment in .bpf.c for more details.\n"
 "\n"
 "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
-"       [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-v]\n"
+"       [-P] [-M] [-H] [-d PID] [-D LEN] [-S] [-p] [-I] [-F COUNT] [-v]\n"
 "\n"
 "  -s SLICE_US   Override slice duration\n"
 "  -e COUNT      Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -36,6 +36,8 @@ const char help_fmt[] =
 "  -D LEN        Set scx_exit_info.dump buffer length\n"
 "  -S            Suppress qmap-specific debug dump\n"
 "  -p            Switch only tasks on SCHED_EXT policy instead of all\n"
+"  -I            Turn on SCX_OPS_ALWAYS_ENQ_IMMED\n"
+"  -F COUNT      IMMED stress: force every COUNT'th enqueue to a busy local DSQ (use with -I)\n"
 "  -v            Print libbpf debug messages\n"
 "  -h            Display this help and exit\n";
 
@@ -68,7 +70,7 @@ int main(int argc, char **argv)
 
 	skel->rodata->slice_ns = __COMPAT_ENUM_OR_ZERO("scx_public_consts", "SCX_SLICE_DFL");
 
-	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:Spvh")) != -1) {
+	while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PMHc:d:D:SpIF:vh")) != -1) {
 		switch (opt) {
 		case 's':
 			skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -121,6 +123,13 @@ int main(int argc, char **argv)
 		case 'p':
 			skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
 			break;
+		case 'I':
+			skel->rodata->always_enq_immed = true;
+			skel->struct_ops.qmap_ops->flags |= SCX_OPS_ALWAYS_ENQ_IMMED;
+			break;
+		case 'F':
+			skel->rodata->immed_stress_nth = strtoul(optarg, NULL, 0);
+			break;
 		case 'v':
 			verbose = true;
 			break;

From 238eba8c210d02ec2908d9c9143db4edcea3bfa1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 13 Mar 2026 09:43:23 -1000
Subject: [PATCH 079/134] sched_ext: Use schedule_deferred_locked() in
 schedule_dsq_reenq()

schedule_dsq_reenq() always uses schedule_deferred() which falls back to
irq_work. However, callers like schedule_reenq_local() already hold the
target rq lock, and scx_bpf_dsq_reenq() may hold it via the ops callback.

Add a locked_rq parameter so schedule_dsq_reenq() can use
schedule_deferred_locked() when the target rq is already held. The locked
variant can use cheaper paths (balance callbacks, wakeup hooks) instead of
always bouncing through irq_work.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 0ce205e3e999..a234e57a4555 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1218,8 +1218,10 @@ static void schedule_deferred_locked(struct rq *rq)
 }
 
 static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq,
-			       u64 reenq_flags)
+			       u64 reenq_flags, struct rq *locked_rq)
 {
+	struct rq *rq;
+
 	/*
 	 * Allowing reenqueues doesn't make sense while bypassing. This also
 	 * blocks from new reenqueues to be scheduled on dead scheds.
@@ -1228,7 +1230,8 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
 		return;
 
 	if (dsq->id == SCX_DSQ_LOCAL) {
-		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
+		rq = container_of(dsq, struct rq, scx.local_dsq);
+
 		struct scx_sched_pcpu *sch_pcpu = per_cpu_ptr(sch->pcpu, cpu_of(rq));
 		struct scx_deferred_reenq_local *drl = &sch_pcpu->deferred_reenq_local;
 
@@ -1247,10 +1250,9 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
 				list_move_tail(&drl->node, &rq->scx.deferred_reenq_locals);
 			WRITE_ONCE(drl->flags, drl->flags | reenq_flags);
 		}
-
-		schedule_deferred(rq);
 	} else if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) {
-		struct rq *rq = this_rq();
+		rq = this_rq();
+
 		struct scx_dsq_pcpu *dsq_pcpu = per_cpu_ptr(dsq->pcpu, cpu_of(rq));
 		struct scx_deferred_reenq_user *dru = &dsq_pcpu->deferred_reenq_user;
 
@@ -1269,11 +1271,15 @@ static void schedule_dsq_reenq(struct scx_sched *sch, struct scx_dispatch_q *dsq
 				list_move_tail(&dru->node, &rq->scx.deferred_reenq_users);
 			WRITE_ONCE(dru->flags, dru->flags | reenq_flags);
 		}
-
-		schedule_deferred(rq);
 	} else {
 		scx_error(sch, "DSQ 0x%llx not allowed for reenq", dsq->id);
+		return;
 	}
+
+	if (rq == locked_rq)
+		schedule_deferred_locked(rq);
+	else
+		schedule_deferred(rq);
 }
 
 static void schedule_reenq_local(struct rq *rq, u64 reenq_flags)
@@ -1283,7 +1289,7 @@ static void schedule_reenq_local(struct rq *rq, u64 reenq_flags)
 	if (WARN_ON_ONCE(!root))
 		return;
 
-	schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags);
+	schedule_dsq_reenq(root, &rq->scx.local_dsq, reenq_flags, rq);
 }
 
 /**
@@ -8845,7 +8851,7 @@ __bpf_kfunc void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags,
 		reenq_flags |= SCX_REENQ_ANY;
 
 	dsq = find_dsq_for_dispatch(sch, this_rq(), dsq_id, smp_processor_id());
-	schedule_dsq_reenq(sch, dsq, reenq_flags);
+	schedule_dsq_reenq(sch, dsq, reenq_flags, scx_locked_rq());
 }
 
 /**

From 12b49dd15e4bf4e906759ac445797ba2213e52dd Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Sat, 14 Mar 2026 07:51:04 +0100
Subject: [PATCH 080/134] selftests/sched_ext: Update
 scx_bpf_dsq_move_to_local() in kselftests

After commit 860683763ebf ("sched_ext: Add enq_flags to
scx_bpf_dsq_move_to_local()") some of the kselftests are failing to
build:

 exit.bpf.c:44:34: error: too few arguments provided to function-like macro invocation
    44 |         scx_bpf_dsq_move_to_local(DSQ_ID);

Update the kselftests adding the new argument to
scx_bpf_dsq_move_to_local().

Fixes: 860683763ebf ("sched_ext: Add enq_flags to scx_bpf_dsq_move_to_local()")
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/dequeue.bpf.c          | 2 +-
 tools/testing/selftests/sched_ext/exit.bpf.c             | 2 +-
 tools/testing/selftests/sched_ext/maximal.bpf.c          | 2 +-
 tools/testing/selftests/sched_ext/numa.bpf.c             | 2 +-
 tools/testing/selftests/sched_ext/peek_dsq.bpf.c         | 8 ++++----
 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
index 597b88563d7d..624e2ccb0688 100644
--- a/tools/testing/selftests/sched_ext/dequeue.bpf.c
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -342,7 +342,7 @@ void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
 
 		bpf_task_release(p);
 	} else {
-		scx_bpf_dsq_move_to_local(SHARED_DSQ);
+		scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
 	}
 }
 
diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c
index 4bc36182d3ff..2e848820a44b 100644
--- a/tools/testing/selftests/sched_ext/exit.bpf.c
+++ b/tools/testing/selftests/sched_ext/exit.bpf.c
@@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
 	if (exit_point == EXIT_DISPATCH)
 		EXIT_CLEANLY();
 
-	scx_bpf_dsq_move_to_local(DSQ_ID);
+	scx_bpf_dsq_move_to_local(DSQ_ID, 0);
 }
 
 void BPF_STRUCT_OPS(exit_enable, struct task_struct *p)
diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c
index 01cf4f3da4e0..a3aabeb82e6b 100644
--- a/tools/testing/selftests/sched_ext/maximal.bpf.c
+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
@@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags)
 
 void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev)
 {
-	scx_bpf_dsq_move_to_local(DSQ_ID);
+	scx_bpf_dsq_move_to_local(DSQ_ID, 0);
 }
 
 void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags)
diff --git a/tools/testing/selftests/sched_ext/numa.bpf.c b/tools/testing/selftests/sched_ext/numa.bpf.c
index a79d86ed54a1..78cc49a7f9a6 100644
--- a/tools/testing/selftests/sched_ext/numa.bpf.c
+++ b/tools/testing/selftests/sched_ext/numa.bpf.c
@@ -68,7 +68,7 @@ void BPF_STRUCT_OPS(numa_dispatch, s32 cpu, struct task_struct *prev)
 {
 	int node = __COMPAT_scx_bpf_cpu_node(cpu);
 
-	scx_bpf_dsq_move_to_local(node);
+	scx_bpf_dsq_move_to_local(node, 0);
 }
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init)
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
index 784f2f6c1af9..96e3a336a8a6 100644
--- a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
+++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
@@ -95,7 +95,7 @@ static int scan_dsq_pool(void)
 			record_peek_result(task->pid);
 
 			/* Try to move this task to local */
-			if (!moved && scx_bpf_dsq_move_to_local(dsq_id) == 0) {
+			if (!moved && scx_bpf_dsq_move_to_local(dsq_id, 0) == 0) {
 				moved = 1;
 				break;
 			}
@@ -156,19 +156,19 @@ void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev)
 		dsq_peek_result2_pid = peek_result ? peek_result->pid : -1;
 
 		/* Now consume the task since we've peeked at it */
-		scx_bpf_dsq_move_to_local(test_dsq_id);
+		scx_bpf_dsq_move_to_local(test_dsq_id, 0);
 
 		/* Mark phase 1 as complete */
 		phase1_complete = 1;
 		bpf_printk("Phase 1 complete, starting phase 2 stress testing");
 	} else if (!phase1_complete) {
 		/* Still in phase 1, use real DSQ */
-		scx_bpf_dsq_move_to_local(real_dsq_id);
+		scx_bpf_dsq_move_to_local(real_dsq_id, 0);
 	} else {
 		/* Phase 2: Scan all DSQs in the pool and try to move a task */
 		if (!scan_dsq_pool()) {
 			/* No tasks found in DSQ pool, fall back to real DSQ */
-			scx_bpf_dsq_move_to_local(real_dsq_id);
+			scx_bpf_dsq_move_to_local(real_dsq_id, 0);
 		}
 	}
 }
diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
index bfcb96cd4954..7aa5dc6bfb93 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
@@ -53,7 +53,7 @@ ddsp:
 
 void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p)
 {
-	if (scx_bpf_dsq_move_to_local(VTIME_DSQ))
+	if (scx_bpf_dsq_move_to_local(VTIME_DSQ, 0))
 		consumed = true;
 }
 

From e36bc38ebfac95ecd088d4bc0ceb3ffcef2ebdfa Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Sat, 14 Mar 2026 09:39:34 +0800
Subject: [PATCH 081/134] sched_ext: Fix uninitialized ret in
 scx_alloc_and_add_sched()

Under CONFIG_EXT_SUB_SCHED, the kzalloc() and kstrdup() failure
paths jump to err_stop_helper without first setting ret. The
function then returns ERR_PTR(ret) with ret uninitialized, which
can produce ERR_PTR(0) (NULL), causing the caller's IS_ERR() check
to pass and leading to a NULL pointer dereference.

Set ret = -ENOMEM before each goto to fix the error path.

Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support")
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a234e57a4555..9202c6d7a771 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6444,13 +6444,17 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 
 #ifdef CONFIG_EXT_SUB_SCHED
 	char *buf = kzalloc(PATH_MAX, GFP_KERNEL);
-	if (!buf)
+	if (!buf) {
+		ret = -ENOMEM;
 		goto err_stop_helper;
+	}
 	cgroup_path(cgrp, buf, PATH_MAX);
 	sch->cgrp_path = kstrdup(buf, GFP_KERNEL);
 	kfree(buf);
-	if (!sch->cgrp_path)
+	if (!sch->cgrp_path) {
+		ret = -ENOMEM;
 		goto err_stop_helper;
+	}
 
 	sch->cgrp = cgrp;
 	INIT_LIST_HEAD(&sch->children);

From c959218c6533cf7e373cb5ccddb93f582ee5d47b Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Sat, 14 Mar 2026 12:20:51 +0800
Subject: [PATCH 082/134] sched_ext/selftests: Fix incorrect include guard
 comments

Fix two mismatched closing comments in header include guards:

- util.h: closing comment says __SCX_TEST_H__ but the guard is
  __SCX_TEST_UTIL_H__
- exit_test.h: closing comment has a spurious '#' character before
  the guard name

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/exit_test.h | 2 +-
 tools/testing/selftests/sched_ext/util.h      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h
index 94f0268b9cb8..2723e0fda801 100644
--- a/tools/testing/selftests/sched_ext/exit_test.h
+++ b/tools/testing/selftests/sched_ext/exit_test.h
@@ -17,4 +17,4 @@ enum exit_test_case {
 	NUM_EXITS,
 };
 
-#endif  // # __EXIT_TEST_H__
+#endif  // __EXIT_TEST_H__
diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h
index bc13dfec1267..681cec04b439 100644
--- a/tools/testing/selftests/sched_ext/util.h
+++ b/tools/testing/selftests/sched_ext/util.h
@@ -10,4 +10,4 @@
 long file_read_long(const char *path);
 int file_write_long(const char *path, long val);
 
-#endif // __SCX_TEST_H__
+#endif // __SCX_TEST_UTIL_H__

From 6712c4fefca0422851b71d1a58a32ea03f69310f Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Sun, 15 Mar 2026 16:24:40 +0800
Subject: [PATCH 083/134] sched_ext: Update demo schedulers and selftests to
 use scx_bpf_task_set_dsq_vtime()

Direct writes to p->scx.dsq_vtime are deprecated in favor of
scx_bpf_task_set_dsq_vtime(). Update scx_simple, scx_flatcg, and
select_cpu_vtime selftest to use the new kfunc with
scale_by_task_weight_inverse().

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_flatcg.bpf.c                     | 12 +++++++-----
 tools/sched_ext/scx_simple.bpf.c                     |  6 ++++--
 .../selftests/sched_ext/select_cpu_vtime.bpf.c       |  6 ++++--
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 1351377f64d5..6d3a028d7b59 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -551,9 +551,11 @@ void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
 	 * too much, determine the execution time by taking explicit timestamps
 	 * instead of depending on @p->scx.slice.
 	 */
-	if (!fifo_sched)
-		p->scx.dsq_vtime +=
-			(SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
+	if (!fifo_sched) {
+		u64 delta = scale_by_task_weight_inverse(p, SCX_SLICE_DFL - p->scx.slice);
+
+		scx_bpf_task_set_dsq_vtime(p, p->scx.dsq_vtime + delta);
+	}
 
 	taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
 	if (!taskc) {
@@ -822,7 +824,7 @@ s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
 	if (!(cgc = find_cgrp_ctx(args->cgroup)))
 		return -ENOENT;
 
-	p->scx.dsq_vtime = cgc->tvtime_now;
+	scx_bpf_task_set_dsq_vtime(p, cgc->tvtime_now);
 
 	return 0;
 }
@@ -924,7 +926,7 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
 		return;
 
 	delta = time_delta(p->scx.dsq_vtime, from_cgc->tvtime_now);
-	p->scx.dsq_vtime = to_cgc->tvtime_now + delta;
+	scx_bpf_task_set_dsq_vtime(p, to_cgc->tvtime_now + delta);
 }
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init)
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
index 9ad6f0949987..cc40552b2b5f 100644
--- a/tools/sched_ext/scx_simple.bpf.c
+++ b/tools/sched_ext/scx_simple.bpf.c
@@ -121,12 +121,14 @@ void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
 	 * too much, determine the execution time by taking explicit timestamps
 	 * instead of depending on @p->scx.slice.
 	 */
-	p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
+	u64 delta = scale_by_task_weight_inverse(p, SCX_SLICE_DFL - p->scx.slice);
+
+	scx_bpf_task_set_dsq_vtime(p, p->scx.dsq_vtime + delta);
 }
 
 void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
 {
-	p->scx.dsq_vtime = vtime_now;
+	scx_bpf_task_set_dsq_vtime(p, vtime_now);
 }
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
index 7aa5dc6bfb93..eec70d388cbf 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
@@ -66,12 +66,14 @@ void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p)
 void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p,
 		    bool runnable)
 {
-	p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
+	u64 delta = scale_by_task_weight_inverse(p, SCX_SLICE_DFL - p->scx.slice);
+
+	scx_bpf_task_set_dsq_vtime(p, p->scx.dsq_vtime + delta);
 }
 
 void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p)
 {
-	p->scx.dsq_vtime = vtime_now;
+	scx_bpf_task_set_dsq_vtime(p, vtime_now);
 }
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init)

From f96bc0fa92be8dc0ec97bbe5bec6d5df26f9585b Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Sun, 15 Mar 2026 16:24:41 +0800
Subject: [PATCH 084/134] sched_ext: Update selftests to drop
 ops.cpu_acquire/release()

ops.cpu_acquire/release() are deprecated by commit a3f5d4822253
("sched_ext: Allow scx_bpf_reenqueue_local() to be called from
anywhere") in favor of handling CPU preemption via the sched_switch
tracepoint.

In the maximal selftest, replace the cpu_acquire/release stubs with a
minimal sched_switch TP program. Attach all non-struct_ops programs
(including the new TP) via maximal__attach() after disabling auto-attach
for the maximal_ops struct_ops map, which is managed manually in run().

Apply the same fix to reload_loop, which also uses the maximal skeleton.

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/maximal.bpf.c | 15 ++++++---------
 tools/testing/selftests/sched_ext/maximal.c     |  3 +++
 tools/testing/selftests/sched_ext/reload_loop.c |  3 +++
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c
index a3aabeb82e6b..04a369078aac 100644
--- a/tools/testing/selftests/sched_ext/maximal.bpf.c
+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
@@ -67,13 +67,12 @@ void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p,
 void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle)
 {}
 
-void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu,
-		    struct scx_cpu_acquire_args *args)
-{}
-
-void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu,
-		    struct scx_cpu_release_args *args)
-{}
+SEC("tp_btf/sched_switch")
+int BPF_PROG(maximal_sched_switch, bool preempt, struct task_struct *prev,
+	     struct task_struct *next, unsigned int prev_state)
+{
+	return 0;
+}
 
 void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu)
 {}
@@ -150,8 +149,6 @@ struct sched_ext_ops maximal_ops = {
 	.set_weight		= (void *) maximal_set_weight,
 	.set_cpumask		= (void *) maximal_set_cpumask,
 	.update_idle		= (void *) maximal_update_idle,
-	.cpu_acquire		= (void *) maximal_cpu_acquire,
-	.cpu_release		= (void *) maximal_cpu_release,
 	.cpu_online		= (void *) maximal_cpu_online,
 	.cpu_offline		= (void *) maximal_cpu_offline,
 	.init_task		= (void *) maximal_init_task,
diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c
index c6be50a9941d..1dc369224670 100644
--- a/tools/testing/selftests/sched_ext/maximal.c
+++ b/tools/testing/selftests/sched_ext/maximal.c
@@ -19,6 +19,9 @@ static enum scx_test_status setup(void **ctx)
 	SCX_ENUM_INIT(skel);
 	SCX_FAIL_IF(maximal__load(skel), "Failed to load skel");
 
+	bpf_map__set_autoattach(skel->maps.maximal_ops, false);
+	SCX_FAIL_IF(maximal__attach(skel), "Failed to attach skel");
+
 	*ctx = skel;
 
 	return SCX_TEST_PASS;
diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c
index 308211d80436..49297b83d748 100644
--- a/tools/testing/selftests/sched_ext/reload_loop.c
+++ b/tools/testing/selftests/sched_ext/reload_loop.c
@@ -23,6 +23,9 @@ static enum scx_test_status setup(void **ctx)
 	SCX_ENUM_INIT(skel);
 	SCX_FAIL_IF(maximal__load(skel), "Failed to load skel");
 
+	bpf_map__set_autoattach(skel->maps.maximal_ops, false);
+	SCX_FAIL_IF(maximal__attach(skel), "Failed to attach skel");
+
 	return SCX_TEST_PASS;
 }
 

From 0c66b0da006415880444ec821e3cb66a89273865 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 15 Mar 2026 19:43:27 -1000
Subject: [PATCH 085/134] sched_ext: Fix cgroup double-put on sub-sched abort
 path

The abort path in scx_sub_enable_workfn() fell through to out_put_cgrp,
double-putting the cgroup ref already owned by sch->cgrp. It also skipped
kthread_flush_work() needed to flush the disable path.

Relocate the abort block above err_unlock_and_disable so it falls through to
err_disable.

Fixes: 337ec00b1d9c ("sched_ext: Implement cgroup sub-sched enabling and disabling")
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9202c6d7a771..2f70effcc4a6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -7050,6 +7050,13 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 	ret = 0;
 	goto out_unlock;
 
+out_put_cgrp:
+	cgroup_put(cgrp);
+out_unlock:
+	mutex_unlock(&scx_enable_mutex);
+	cmd->ret = ret;
+	return;
+
 abort:
 	put_task_struct(p);
 	scx_task_iter_stop(&sti);
@@ -7063,15 +7070,6 @@ abort:
 		}
 	}
 	scx_task_iter_stop(&sti);
-	scx_cgroup_unlock();
-	percpu_up_write(&scx_fork_rwsem);
-out_put_cgrp:
-	cgroup_put(cgrp);
-out_unlock:
-	mutex_unlock(&scx_enable_mutex);
-	cmd->ret = ret;
-	return;
-
 err_unlock_and_disable:
 	/* we'll soon enter disable path, keep bypass on */
 	scx_cgroup_unlock();

From 618a9db0158b1c51fd33822cf804f5a09f829837 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 15 Mar 2026 19:43:28 -1000
Subject: [PATCH 086/134] sched_ext: Use kobject_put() for
 kobject_init_and_add() failure in scx_alloc_and_add_sched()

kobject_init_and_add() failure requires kobject_put() for proper cleanup, but
the error paths were using kfree(sch) possibly leaking the kobject name. The
kset_create_and_add() failure was already using kobject_put() correctly.

Switch the kobject_init_and_add() error paths to use kobject_put(). As the
release path puts the cgroup ref, make scx_alloc_and_add_sched() always
consume @cgrp via a new err_put_cgrp label at the bottom of the error chain
and update scx_sub_enable_workfn() accordingly.

Fixes: 17108735b47d ("sched_ext: Use dynamic allocation for scx_sched")
Reported-by: David Carlier <devnexen@gmail.com>
Link: https://lore.kernel.org/r/20260314134457.46216-1-devnexen@gmail.com
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2f70effcc4a6..b942918fa364 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6353,6 +6353,10 @@ static struct scx_sched_pnode *alloc_pnode(struct scx_sched *sch, int node)
 	return pnode;
 }
 
+/*
+ * Allocate and initialize a new scx_sched. @cgrp's reference is always
+ * consumed whether the function succeeds or fails.
+ */
 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 						 struct cgroup *cgrp,
 						 struct scx_sched *parent)
@@ -6362,8 +6366,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;
 
 	sch = kzalloc_flex(*sch, ancestors, level);
-	if (!sch)
-		return ERR_PTR(-ENOMEM);
+	if (!sch) {
+		ret = -ENOMEM;
+		goto err_put_cgrp;
+	}
 
 	sch->exit_info = alloc_exit_info(ops->exit_dump_len);
 	if (!sch->exit_info) {
@@ -6468,8 +6474,8 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 		ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
 
 	if (ret < 0) {
-		kfree(sch->cgrp_path);
-		goto err_stop_helper;
+		kobject_put(&sch->kobj);
+		return ERR_PTR(ret);
 	}
 
 	if (ops->sub_attach) {
@@ -6479,11 +6485,12 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 			return ERR_PTR(-ENOMEM);
 		}
 	}
-
 #else	/* CONFIG_EXT_SUB_SCHED */
 	ret = kobject_init_and_add(&sch->kobj, &scx_ktype, NULL, "root");
-	if (ret < 0)
-		goto err_stop_helper;
+	if (ret < 0) {
+		kobject_put(&sch->kobj);
+		return ERR_PTR(ret);
+	}
 #endif	/* CONFIG_EXT_SUB_SCHED */
 	return sch;
 
@@ -6506,6 +6513,8 @@ err_free_ei:
 	free_exit_info(sch->exit_info);
 err_free_sch:
 	kfree(sch);
+err_put_cgrp:
+	cgroup_put(cgrp);
 	return ERR_PTR(ret);
 }
 
@@ -6577,6 +6586,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 {
 	struct scx_enable_cmd *cmd = container_of(work, struct scx_enable_cmd, work);
 	struct sched_ext_ops *ops = cmd->ops;
+	struct cgroup *cgrp = root_cgroup();
 	struct scx_sched *sch;
 	struct scx_task_iter sti;
 	struct task_struct *p;
@@ -6593,7 +6603,8 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_unlock;
 
-	sch = scx_alloc_and_add_sched(ops, root_cgroup(), NULL);
+	cgroup_get(cgrp);
+	sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
 	if (IS_ERR(sch)) {
 		ret = PTR_ERR(sch);
 		goto err_free_ksyncs;
@@ -6887,11 +6898,12 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 	kobject_get(&parent->kobj);
 	raw_spin_unlock_irq(&scx_sched_lock);
 
+	/* scx_alloc_and_add_sched() consumes @cgrp whether it succeeds or not */
 	sch = scx_alloc_and_add_sched(ops, cgrp, parent);
 	kobject_put(&parent->kobj);
 	if (IS_ERR(sch)) {
 		ret = PTR_ERR(sch);
-		goto out_put_cgrp;
+		goto out_unlock;
 	}
 
 	ret = scx_link_sched(sch);

From 2008fb257323cdb0870d070f1c599bc3fed4be9b Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Tue, 17 Mar 2026 01:49:27 +0800
Subject: [PATCH 087/134] sched_ext: Fix slab-out-of-bounds in
 scx_alloc_and_add_sched()

ancestors[] is a flexible array member that needs level + 1 slots to
hold all ancestors including self (indices 0..level), but kzalloc_flex()
only allocates `level` slots:

  sch = kzalloc_flex(*sch, ancestors, level);
  ...
  sch->ancestors[level] = sch;  /* one past the end */

For the root scheduler (level = 0), zero slots are allocated and
ancestors[0] is written immediately past the end of the object.

KASAN reports:

  BUG: KASAN: slab-out-of-bounds in scx_alloc_and_add_sched+0x1c17/0x1d10
  Write of size 8 at addr ffff888066b56538 by task scx_enable_help/667

  The buggy address is located 0 bytes to the right of
   allocated 1336-byte region [ffff888066b56000, ffff888066b56538)

Fix by passing level + 1 to kzalloc_flex().

Tested with vng + scx_lavd, KASAN no longer triggers.

Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support")
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b942918fa364..ab8150b8de57 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6365,7 +6365,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 	s32 level = parent ? parent->level + 1 : 0;
 	s32 node, cpu, ret, bypass_fail_cpu = nr_cpu_ids;
 
-	sch = kzalloc_flex(*sch, ancestors, level);
+	sch = kzalloc_flex(*sch, ancestors, level + 1);
 	if (!sch) {
 		ret = -ENOMEM;
 		goto err_put_cgrp;

From 2e5e5b3738ddda91b9a7ee9399efa5245c992233 Mon Sep 17 00:00:00 2001
From: zhidao su <suzhidao@xiaomi.com>
Date: Tue, 17 Mar 2026 15:52:09 +0800
Subject: [PATCH 088/134] sched_ext: Fix typos in comments

Fix five typos across three files:

- kernel/sched/ext.c: 'monotically' -> 'monotonically' (line 55)
- kernel/sched/ext.c: 'used by to check' -> 'used to check' (line 56)
- kernel/sched/ext.c: 'hardlockdup' -> 'hardlockup' (line 3881)
- kernel/sched/ext_idle.c: 'don't perfectly overlaps' ->
  'don't perfectly overlap' (line 371)
- tools/sched_ext/scx_flatcg.bpf.c: 'shaer' -> 'share' (line 21)

Signed-off-by: zhidao su <suzhidao@xiaomi.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c               | 6 +++---
 kernel/sched/ext_idle.c          | 2 +-
 tools/sched_ext/scx_flatcg.bpf.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index ab8150b8de57..94548ee9ad85 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -73,8 +73,8 @@ static struct scx_sched *scx_enabling_sub_sched;
 #endif	/* CONFIG_EXT_SUB_SCHED */
 
 /*
- * A monotically increasing sequence number that is incremented every time a
- * scheduler is enabled. This can be used by to check if any custom sched_ext
+ * A monotonically increasing sequence number that is incremented every time a
+ * scheduler is enabled. This can be used to check if any custom sched_ext
  * scheduler has ever been used in the system.
  */
 static atomic_long_t scx_enable_seq = ATOMIC_LONG_INIT(0);
@@ -4956,7 +4956,7 @@ void scx_softlockup(u32 dur_s)
  * a good state before taking more drastic actions.
  *
  * Returns %true if sched_ext is enabled and abort was initiated, which may
- * resolve the reported hardlockdup. %false if sched_ext is not enabled or
+ * resolve the reported hardlockup. %false if sched_ext is not enabled or
  * someone else already initiated abort.
  */
 bool scx_hardlockup(int cpu)
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 03be4d664267..c7e405262697 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -368,7 +368,7 @@ void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
 
 	/*
 	 * Enable NUMA optimization only when there are multiple NUMA domains
-	 * among the online CPUs and the NUMA domains don't perfectly overlaps
+	 * among the online CPUs and the NUMA domains don't perfectly overlap
 	 * with the LLC domains.
 	 *
 	 * If all CPUs belong to the same NUMA node and the same LLC domain,
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 6d3a028d7b59..2f6ff19ca9d5 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -18,7 +18,7 @@
  * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's
  * share in that competition is 100/(200+100) == 1/3. B's eventual share in the
  * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's
- * eventual shaer is the same at 1/6. D is only competing at the top level and
+ * eventual share is the same at 1/6. D is only competing at the top level and
  * its share is 200/(100+200) == 2/3.
  *
  * So, instead of hierarchically scheduling level-by-level, we can consider it

From f6689792ffc4bc226636a513f8b0ac7bd45c5091 Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Tue, 17 Mar 2026 23:13:11 +0800
Subject: [PATCH 089/134] selftests/sched_ext: Show failed test names in
 summary

When tests fail, the runner only printed the failure count, making
it hard to tell which tests failed without scrolling through output.

Track failed test names in an array and print them after the summary
so failures are immediately visible at the end of the run.

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/runner.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c
index 90043fd74a60..37ad56c3eb29 100644
--- a/tools/testing/selftests/sched_ext/runner.c
+++ b/tools/testing/selftests/sched_ext/runner.c
@@ -133,6 +133,7 @@ static bool test_valid(const struct scx_test *test)
 int main(int argc, char **argv)
 {
 	const char *filter = NULL;
+	const char *failed_tests[MAX_SCX_TESTS];
 	unsigned testnum = 0, i;
 	unsigned passed = 0, skipped = 0, failed = 0;
 	int opt;
@@ -201,7 +202,7 @@ int main(int argc, char **argv)
 			skipped++;
 			break;
 		case SCX_TEST_FAIL:
-			failed++;
+			failed_tests[failed++] = test->name;
 			break;
 		}
 	}
@@ -210,6 +211,11 @@ int main(int argc, char **argv)
 	printf("PASSED:  %u\n", passed);
 	printf("SKIPPED: %u\n", skipped);
 	printf("FAILED:  %u\n", failed);
+	if (failed > 0) {
+		printf("\nFailed tests:\n");
+		for (i = 0; i < failed; i++)
+			printf("  - %s\n", failed_tests[i]);
+	}
 
 	return 0;
 }

From 2197cecdb02c57b08340059452540fcf101fa30d Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Fri, 20 Mar 2026 18:28:31 +0100
Subject: [PATCH 090/134] sched_ext: idle: Prioritize idle SMT sibling

In the default built-in idle CPU selection policy, when @prev_cpu is
busy and no fully idle core is available, try to place the task on its
SMT sibling if that sibling is idle, before searching any other idle CPU
in the same LLC.

Migration to the sibling is cheap and keeps the task on the same core,
preserving L1 cache and reducing wakeup latency.

On large SMT systems this appears to consistently boost throughput by
roughly 2-3% on CPU-bound workloads (running a number of tasks equal to
the number of SMT cores).

Cc: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext_idle.c | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index c7e405262697..d9596427b5aa 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -424,18 +424,24 @@ static inline bool task_affinity_all(const struct task_struct *p)
  *   - prefer the last used CPU to take advantage of cached data (L1, L2) and
  *     branch prediction optimizations.
  *
- * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * 3. Prefer @prev_cpu's SMT sibling:
+ *   - if @prev_cpu is busy and no fully idle core is available, try to
+ *     place the task on an idle SMT sibling of @prev_cpu; keeping the
+ *     task on the same core makes migration cheaper, preserves L1 cache
+ *     locality and reduces wakeup latency.
+ *
+ * 4. Pick a CPU within the same LLC (Last-Level Cache):
  *   - if the above conditions aren't met, pick a CPU that shares the same
  *     LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
  *     cache locality.
  *
- * 4. Pick a CPU within the same NUMA node, if enabled:
+ * 5. Pick a CPU within the same NUMA node, if enabled:
  *   - choose a CPU from the same NUMA node, if the node cpumask is a
  *     subset of @cpus_allowed, to reduce memory access latency.
  *
- * 5. Pick any idle CPU within the @cpus_allowed domain.
+ * 6. Pick any idle CPU within the @cpus_allowed domain.
  *
- * Step 3 and 4 are performed only if the system has, respectively,
+ * Step 4 and 5 are performed only if the system has, respectively,
  * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
  * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs.
  *
@@ -616,6 +622,18 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
 		goto out_unlock;
 	}
 
+	/*
+	 * Use @prev_cpu's sibling if it's idle.
+	 */
+	if (sched_smt_active()) {
+		for_each_cpu_and(cpu, cpu_smt_mask(prev_cpu), allowed) {
+			if (cpu == prev_cpu)
+				continue;
+			if (scx_idle_test_and_clear_cpu(cpu))
+				goto out_unlock;
+		}
+	}
+
 	/*
 	 * Search for any idle CPU in the same LLC domain.
 	 */

From 7e226f036a71c032cd1eb985dd02bc8314e69adf Mon Sep 17 00:00:00 2001
From: zhidao su <suzhidao@xiaomi.com>
Date: Thu, 19 Mar 2026 13:30:25 +0800
Subject: [PATCH 091/134] sched_ext: Documentation: Document events sysfs file
 and module parameters

Two categories of sched_ext diagnostics are currently undocumented:

1. Per-scheduler events sysfs file
   Each active BPF scheduler exposes a set of diagnostic counters at
   /sys/kernel/sched_ext/<name>/events.  These counters are defined
   (with detailed comments) in kernel/sched/ext_internal.h but have
   no corresponding documentation in sched-ext.rst.  BPF scheduler
   developers must read kernel source to understand what each counter
   means.

   Add a description of the events file, an example of its output, and
   a brief explanation of every counter.

2. Module parameters
   kernel/sched/ext.c registers two parameters under the sched_ext.
   prefix (slice_bypass_us, bypass_lb_intv_us) via module_param_cb()
   with MODULE_PARM_DESC() strings, but sched-ext.rst makes no mention
   of them.  Users who need to tune bypass-mode behavior have no
   in-tree documentation to consult.

   Add a "Module Parameters" section documenting both knobs: their
   default values, valid ranges (taken from the set_*() validators in
   ext.c), and the note from the source that they are primarily for
   debugging.

No functional changes.

Signed-off-by: zhidao su <suzhidao@xiaomi.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/scheduler/sched-ext.rst | 68 +++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index f4f7d8f4f9e4..9e4dbabc03c0 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -93,6 +93,55 @@ scheduler has been loaded):
     # cat /sys/kernel/sched_ext/enable_seq
     1
 
+Each running scheduler also exposes a per-scheduler ``events`` file under
+``/sys/kernel/sched_ext/<scheduler-name>/events`` that tracks diagnostic
+counters. Each counter occupies one ``name value`` line:
+
+.. code-block:: none
+
+    # cat /sys/kernel/sched_ext/simple/events
+    SCX_EV_SELECT_CPU_FALLBACK 0
+    SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE 0
+    SCX_EV_DISPATCH_KEEP_LAST 123
+    SCX_EV_ENQ_SKIP_EXITING 0
+    SCX_EV_ENQ_SKIP_MIGRATION_DISABLED 0
+    SCX_EV_REENQ_IMMED 0
+    SCX_EV_REENQ_LOCAL_REPEAT 0
+    SCX_EV_REFILL_SLICE_DFL 456789
+    SCX_EV_BYPASS_DURATION 0
+    SCX_EV_BYPASS_DISPATCH 0
+    SCX_EV_BYPASS_ACTIVATE 0
+    SCX_EV_INSERT_NOT_OWNED 0
+    SCX_EV_SUB_BYPASS_DISPATCH 0
+
+The counters are described in ``kernel/sched/ext_internal.h``; briefly:
+
+* ``SCX_EV_SELECT_CPU_FALLBACK``: ops.select_cpu() returned a CPU unusable by
+  the task and the core scheduler silently picked a fallback CPU.
+* ``SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE``: a local-DSQ dispatch was redirected
+  to the global DSQ because the target CPU went offline.
+* ``SCX_EV_DISPATCH_KEEP_LAST``: a task continued running because no other
+  task was available (only when ``SCX_OPS_ENQ_LAST`` is not set).
+* ``SCX_EV_ENQ_SKIP_EXITING``: an exiting task was dispatched to the local DSQ
+  directly, bypassing ops.enqueue() (only when ``SCX_OPS_ENQ_EXITING`` is not set).
+* ``SCX_EV_ENQ_SKIP_MIGRATION_DISABLED``: a migration-disabled task was
+  dispatched to its local DSQ directly (only when
+  ``SCX_OPS_ENQ_MIGRATION_DISABLED`` is not set).
+* ``SCX_EV_REENQ_IMMED``: a task dispatched with ``SCX_ENQ_IMMED`` was
+  re-enqueued because the target CPU was not available for immediate execution.
+* ``SCX_EV_REENQ_LOCAL_REPEAT``: a reenqueue of the local DSQ triggered
+  another reenqueue; recurring counts indicate incorrect ``SCX_ENQ_REENQ``
+  handling in the BPF scheduler.
+* ``SCX_EV_REFILL_SLICE_DFL``: a task's time slice was refilled with the
+  default value (``SCX_SLICE_DFL``).
+* ``SCX_EV_BYPASS_DURATION``: total nanoseconds spent in bypass mode.
+* ``SCX_EV_BYPASS_DISPATCH``: number of tasks dispatched while in bypass mode.
+* ``SCX_EV_BYPASS_ACTIVATE``: number of times bypass mode was activated.
+* ``SCX_EV_INSERT_NOT_OWNED``: attempted to insert a task not owned by this
+  scheduler into a DSQ; such attempts are silently ignored.
+* ``SCX_EV_SUB_BYPASS_DISPATCH``: tasks dispatched from sub-scheduler bypass
+  DSQs (only relevant with ``CONFIG_EXT_SUB_SCHED``).
+
 ``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more
 detailed information:
 
@@ -441,6 +490,25 @@ Where to Look
     scheduling. Tasks with CPU affinity are direct-dispatched in FIFO order;
     all others are scheduled in user space by a simple vruntime scheduler.
 
+Module Parameters
+=================
+
+sched_ext exposes two module parameters under the ``sched_ext.`` prefix that
+control bypass-mode behaviour. These knobs are primarily for debugging; there
+is usually no reason to change them during normal operation. They can be read
+and written at runtime (mode 0600) via
+``/sys/module/sched_ext/parameters/``.
+
+``sched_ext.slice_bypass_us`` (default: 5000 µs)
+    The time slice assigned to all tasks when the scheduler is in bypass mode,
+    i.e. during BPF scheduler load, unload, and error recovery. Valid range is
+    100 µs to 100 ms.
+
+``sched_ext.bypass_lb_intv_us`` (default: 500000 µs)
+    The interval at which the bypass-mode load balancer redistributes tasks
+    across CPUs. Set to 0 to disable load balancing during bypass mode. Valid
+    range is 0 to 10 s.
+
 ABI Instability
 ===============
 

From 818dbedd043e94f270403c795fe7856bfa61e013 Mon Sep 17 00:00:00 2001
From: zhidao su <suzhidao@xiaomi.com>
Date: Thu, 19 Mar 2026 13:30:26 +0800
Subject: [PATCH 092/134] selftests/sched_ext: Return non-zero exit code on
 test failure

runner.c always returned 0 regardless of test results.  The kselftest
framework (tools/testing/selftests/kselftest/runner.sh) invokes the runner
binary and treats a non-zero exit code as a test failure; with the old
code, failed sched_ext tests were silently hidden from the parent harness
even though individual "not ok" TAP lines were emitted.

Return 1 when at least one test failed, 0 when all tests passed or were
skipped.

Signed-off-by: zhidao su <suzhidao@xiaomi.com>
Acked-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/runner.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c
index 37ad56c3eb29..4c68efa1512a 100644
--- a/tools/testing/selftests/sched_ext/runner.c
+++ b/tools/testing/selftests/sched_ext/runner.c
@@ -217,7 +217,7 @@ int main(int argc, char **argv)
 			printf("  - %s\n", failed_tests[i]);
 	}
 
-	return 0;
+	return failed > 0 ? 1 : 0;
 }
 
 void scx_test_register(struct scx_test *test)

From 068014daad8d07bc11b24c223bc2d2c331b458bd Mon Sep 17 00:00:00 2001
From: Ke Zhao <ke.zhao.kernel@gmail.com>
Date: Wed, 18 Mar 2026 16:53:49 +0800
Subject: [PATCH 093/134] tools/sched_ext: Update stale scx_ops_error() comment
 in fcg_cgroup_move()

The function scx_ops_error() was dropped, but the
comment here is left pointing to the old name.
Update to be consistent with current API.

Signed-off-by: Ke Zhao <ke.zhao.kernel@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_flatcg.bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
index 2f6ff19ca9d5..fec359581826 100644
--- a/tools/sched_ext/scx_flatcg.bpf.c
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -921,7 +921,7 @@ void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
 	struct fcg_cgrp_ctx *from_cgc, *to_cgc;
 	s64 delta;
 
-	/* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */
+	/* find_cgrp_ctx() triggers scx_bpf_error() on lookup failures */
 	if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to)))
 		return;
 

From e73b1d7210c02ff223e2786934d5a6e73eab1999 Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Sun, 22 Mar 2026 21:48:16 +0800
Subject: [PATCH 094/134] sched_ext: Fix build errors and unused label warning
 in non-cgroup configs

When building with SCHED_CLASS_EXT=y but CGROUPS=n, clang reports errors
for undeclared cgroup_put() and cgroup_get() calls, and a warning for the
unused err_stop_helper label.

EXT_SUB_SCHED is def_bool y depending only on SCHED_CLASS_EXT, but it
fundamentally requires cgroups (cgroup_path, cgroup_get, cgroup_put,
cgroup_id, etc.). Add the missing CGROUPS dependency to EXT_SUB_SCHED in
init/Kconfig.

Guard cgroup_put() and cgroup_get() in the common paths with:
  #if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)

Guard the err_stop_helper label with #ifdef CONFIG_EXT_SUB_SCHED since
all gotos targeting it are inside that same ifdef block.

Tested with both CGROUPS enabled and disabled.

Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202603210903.IrKhPd6k-lkp@intel.com/
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Acked-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 init/Kconfig       | 2 +-
 kernel/sched/ext.c | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index 06abd8e272cb..487a93e34be9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1178,7 +1178,7 @@ endif #CGROUP_SCHED
 
 config EXT_SUB_SCHED
         def_bool y
-        depends on SCHED_CLASS_EXT
+        depends on SCHED_CLASS_EXT && CGROUPS
 
 config SCHED_MM_CID
 	def_bool y
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 94548ee9ad85..2e7a1259bd7c 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6494,8 +6494,10 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops,
 #endif	/* CONFIG_EXT_SUB_SCHED */
 	return sch;
 
+#ifdef CONFIG_EXT_SUB_SCHED
 err_stop_helper:
 	kthread_destroy_worker(sch->helper);
+#endif
 err_free_pcpu:
 	for_each_possible_cpu(cpu) {
 		if (cpu == bypass_fail_cpu)
@@ -6514,7 +6516,9 @@ err_free_ei:
 err_free_sch:
 	kfree(sch);
 err_put_cgrp:
+#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
 	cgroup_put(cgrp);
+#endif
 	return ERR_PTR(ret);
 }
 
@@ -6603,7 +6607,9 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_unlock;
 
+#if defined(CONFIG_EXT_GROUP_SCHED) || defined(CONFIG_EXT_SUB_SCHED)
 	cgroup_get(cgrp);
+#endif
 	sch = scx_alloc_and_add_sched(ops, cgrp, NULL);
 	if (IS_ERR(sch)) {
 		ret = PTR_ERR(sch);

From 63f500c32a37d490ec623a3130e488cdb9bd6cf7 Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Sun, 22 Mar 2026 07:51:46 +0100
Subject: [PATCH 095/134] sched_ext: Guard cpu_smt_mask() with CONFIG_SCHED_SMT

Wrap cpu_smt_mask() usage with CONFIG_SCHED_SMT to avoid build failures
on kernels built without SMT support.

Fixes: 2197cecdb02c ("sched_ext: idle: Prioritize idle SMT sibling")
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202603221422.XIueJOE9-lkp@intel.com/
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext_idle.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index d9596427b5aa..857d8e902b44 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -622,6 +622,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
 		goto out_unlock;
 	}
 
+#ifdef CONFIG_SCHED_SMT
 	/*
 	 * Use @prev_cpu's sibling if it's idle.
 	 */
@@ -633,6 +634,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
 				goto out_unlock;
 		}
 	}
+#endif
 
 	/*
 	 * Search for any idle CPU in the same LLC domain.

From f03ffe53ab6ffc798ed8291090cebf19c6e5fa3b Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Sun, 22 Mar 2026 07:35:46 +0100
Subject: [PATCH 096/134] tools/sched_ext: Add compat handling for
 sub-scheduler ops

Extend SCX_OPS_OPEN() with compatibility handling for ops.sub_attach()
and ops.sub_detach(), allowing scx C schedulers with sub-scheduler
support to run on kernels both with and without its support.

Cc: Cheng-Yang Chou <yphbchou0911@gmail.com>
Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support")
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/include/scx/compat.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index 9e0c8f3161e8..039854c490d5 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -160,6 +160,7 @@ static inline long scx_hotplug_seq(void)
  * COMPAT:
  * - v6.17: ops.cgroup_set_bandwidth()
  * - v6.19: ops.cgroup_set_idle()
+ * - v7.1:  ops.sub_attach(), ops.sub_detach(), ops.sub_cgroup_id
  */
 #define SCX_OPS_OPEN(__ops_name, __scx_name) ({					\
 	struct __scx_name *__skel;						\
@@ -181,6 +182,21 @@ static inline long scx_hotplug_seq(void)
 		fprintf(stderr, "WARNING: kernel doesn't support ops.cgroup_set_idle()\n"); \
 		__skel->struct_ops.__ops_name->cgroup_set_idle = NULL;	\
 	}									\
+	if (__skel->struct_ops.__ops_name->sub_attach &&			\
+	    !__COMPAT_struct_has_field("sched_ext_ops", "sub_attach")) {	\
+		fprintf(stderr, "WARNING: kernel doesn't support ops.sub_attach()\n"); \
+		__skel->struct_ops.__ops_name->sub_attach = NULL;		\
+	}									\
+	if (__skel->struct_ops.__ops_name->sub_detach &&			\
+	    !__COMPAT_struct_has_field("sched_ext_ops", "sub_detach")) {	\
+		fprintf(stderr, "WARNING: kernel doesn't support ops.sub_detach()\n"); \
+		__skel->struct_ops.__ops_name->sub_detach = NULL;		\
+	}									\
+	if (__skel->struct_ops.__ops_name->sub_cgroup_id > 0 &&		\
+	    !__COMPAT_struct_has_field("sched_ext_ops", "sub_cgroup_id")) { \
+		fprintf(stderr, "WARNING: kernel doesn't support ops.sub_cgroup_id\n"); \
+		__skel->struct_ops.__ops_name->sub_cgroup_id = 0;		\
+	}									\
 	__skel; 								\
 })
 

From 76edc2761ab8bd27fe4c4b8b2fb71baefc4a31e8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 22 Mar 2026 10:33:08 -1000
Subject: [PATCH 097/134] sched_ext: Use irq_work_queue_on() in
 schedule_deferred()

schedule_deferred() uses irq_work_queue() which always queues on the
calling CPU. The deferred work can run from any CPU correctly, and the
_locked() path already processes remote rqs from the calling CPU. However,
when falling through to the irq_work path, queuing on the target CPU is
preferable as the work can run sooner via IPI delivery rather than waiting
for the calling CPU to re-enable IRQs.

Currently, only reenqueue operations use this path - either BPF-initiated
reenqueue targeting a remote rq, or IMMED reenqueue when the target CPU is
busy running userspace (not in balance or wakeup, so the _locked() fast
paths aren't available). Use irq_work_queue_on() to target the owning CPU.

This improves IMMED reenqueue latency when tasks are dispatched to
remote local DSQs. Testing on a 24-CPU AMD Ryzen 3900X with scx_qmap
-I -F 50 (ALWAYS_ENQ_IMMED, every 50th enqueue forced to prev_cpu's
local DSQ) under heavy mixed load (2x CPU oversubscription, yield and
context-switch pressure, SCHED_FIFO bursts, periodic fork storms, mixed
nice levels, C-states disabled), measuring local DSQ residence time
(insert to remove) over 5 x 120s runs (~1.2M tasks per set):

  >128us outliers:  71 -> 39  (-45%)
  >256us outliers:  59 -> 36  (-39%)

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
---
 kernel/sched/ext.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2e7a1259bd7c..72a07eb050a3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1164,10 +1164,18 @@ static void deferred_irq_workfn(struct irq_work *irq_work)
 static void schedule_deferred(struct rq *rq)
 {
 	/*
-	 * Queue an irq work. They are executed on IRQ re-enable which may take
-	 * a bit longer than the scheduler hook in schedule_deferred_locked().
+	 * This is the fallback when schedule_deferred_locked() can't use
+	 * the cheaper balance callback or wakeup hook paths (the target
+	 * CPU is not in balance or wakeup). Currently, this is primarily
+	 * hit by reenqueue operations targeting a remote CPU.
+	 *
+	 * Queue on the target CPU. The deferred work can run from any CPU
+	 * correctly - the _locked() path already processes remote rqs from
+	 * the calling CPU - but targeting the owning CPU allows IPI delivery
+	 * without waiting for the calling CPU to re-enable IRQs and is
+	 * cheaper as the reenqueue runs locally.
 	 */
-	irq_work_queue(&rq->scx.deferred_irq_work);
+	irq_work_queue_on(&rq->scx.deferred_irq_work, cpu_of(rq));
 }
 
 /**

From c50dcf533149e313a61d483769eb48682a1b0cdd Mon Sep 17 00:00:00 2001
From: zhidao su <suzhidao@xiaomi.com>
Date: Sun, 22 Mar 2026 15:35:33 +0800
Subject: [PATCH 098/134] selftests/sched_ext: Add tests for SCX_ENQ_IMMED and
 scx_bpf_dsq_reenq()

Add three selftests covering features introduced in v7.1:

- dsq_reenq: Verify scx_bpf_dsq_reenq() on user DSQs triggers
  ops.enqueue() with SCX_ENQ_REENQ and SCX_TASK_REENQ_KFUNC in
  p->scx.flags.

- enq_immed: Verify SCX_OPS_ALWAYS_ENQ_IMMED slow path where tasks
  dispatched to a busy CPU's local DSQ are re-enqueued through
  ops.enqueue() with SCX_TASK_REENQ_IMMED.

- consume_immed: Verify SCX_ENQ_IMMED via the consume path using
  scx_bpf_dsq_move_to_local___v2() with explicit SCX_ENQ_IMMED.

All three tests skip gracefully on kernels that predate the required
features by checking availability via __COMPAT_has_ksym() /
__COMPAT_read_enum() before loading.

Signed-off-by: zhidao su <suzhidao@xiaomi.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/Makefile    |   3 +
 .../selftests/sched_ext/consume_immed.bpf.c   |  88 +++++++++++++
 .../selftests/sched_ext/consume_immed.c       | 115 +++++++++++++++++
 .../selftests/sched_ext/dsq_reenq.bpf.c       | 120 ++++++++++++++++++
 tools/testing/selftests/sched_ext/dsq_reenq.c |  95 ++++++++++++++
 .../selftests/sched_ext/enq_immed.bpf.c       |  63 +++++++++
 tools/testing/selftests/sched_ext/enq_immed.c | 117 +++++++++++++++++
 7 files changed, 601 insertions(+)
 create mode 100644 tools/testing/selftests/sched_ext/consume_immed.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/consume_immed.c
 create mode 100644 tools/testing/selftests/sched_ext/dsq_reenq.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/dsq_reenq.c
 create mode 100644 tools/testing/selftests/sched_ext/enq_immed.bpf.c
 create mode 100644 tools/testing/selftests/sched_ext/enq_immed.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index a3bbe2c7911b..84e4f69b8833 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -162,8 +162,11 @@ endef
 all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog)))
 
 auto-test-targets :=			\
+	consume_immed			\
 	create_dsq			\
 	dequeue				\
+	dsq_reenq			\
+	enq_immed			\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/consume_immed.bpf.c b/tools/testing/selftests/sched_ext/consume_immed.bpf.c
new file mode 100644
index 000000000000..9c7808f5abe1
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/consume_immed.bpf.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Validate SCX_ENQ_IMMED semantics through the consume path.
+ *
+ * This is the orthogonal counterpart to enq_immed:
+ *
+ *   enq_immed:      SCX_ENQ_IMMED via scx_bpf_dsq_insert() to local DSQ
+ *                   with SCX_OPS_ALWAYS_ENQ_IMMED
+ *
+ *   consume_immed:  SCX_ENQ_IMMED via scx_bpf_dsq_move_to_local() with
+ *                   explicit SCX_ENQ_IMMED in enq_flags (requires v2 kfunc)
+ *
+ * Worker threads belonging to test_tgid are inserted into USER_DSQ.
+ * ops.dispatch() on CPU 0 consumes from USER_DSQ with SCX_ENQ_IMMED.
+ * With multiple workers competing for CPU 0, dsq->nr > 1 triggers the
+ * IMMED slow path (reenqueue with SCX_TASK_REENQ_IMMED).
+ *
+ * Requires scx_bpf_dsq_move_to_local___v2() (v7.1+) for enq_flags support.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+#define USER_DSQ	0
+
+/* Set by userspace to identify the test process group. */
+const volatile u32 test_tgid;
+
+/*
+ * SCX_TASK_REENQ_REASON_MASK and SCX_TASK_REENQ_IMMED are exported via
+ * vmlinux BTF as part of enum scx_ent_flags.
+ */
+
+u64 nr_consume_immed_reenq;
+
+void BPF_STRUCT_OPS(consume_immed_enqueue, struct task_struct *p,
+		    u64 enq_flags)
+{
+	if (enq_flags & SCX_ENQ_REENQ) {
+		u32 reason = p->scx.flags & SCX_TASK_REENQ_REASON_MASK;
+
+		if (reason == SCX_TASK_REENQ_IMMED)
+			__sync_fetch_and_add(&nr_consume_immed_reenq, 1);
+	}
+
+	if (p->tgid == (pid_t)test_tgid)
+		scx_bpf_dsq_insert(p, USER_DSQ, SCX_SLICE_DFL, enq_flags);
+	else
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
+				   enq_flags);
+}
+
+void BPF_STRUCT_OPS(consume_immed_dispatch, s32 cpu, struct task_struct *prev)
+{
+	if (cpu == 0)
+		scx_bpf_dsq_move_to_local(USER_DSQ, SCX_ENQ_IMMED);
+	else
+		scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL, 0);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(consume_immed_init)
+{
+	/*
+	 * scx_bpf_dsq_move_to_local___v2() adds the enq_flags parameter.
+	 * On older kernels the consume path cannot pass SCX_ENQ_IMMED.
+	 */
+	if (!bpf_ksym_exists(scx_bpf_dsq_move_to_local___v2)) {
+		scx_bpf_error("scx_bpf_dsq_move_to_local v2 not available");
+		return -EOPNOTSUPP;
+	}
+
+	return scx_bpf_create_dsq(USER_DSQ, -1);
+}
+
+void BPF_STRUCT_OPS(consume_immed_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(consume_immed_ops,
+	       .enqueue		= (void *)consume_immed_enqueue,
+	       .dispatch	= (void *)consume_immed_dispatch,
+	       .init		= (void *)consume_immed_init,
+	       .exit		= (void *)consume_immed_exit,
+	       .name		= "consume_immed")
diff --git a/tools/testing/selftests/sched_ext/consume_immed.c b/tools/testing/selftests/sched_ext/consume_immed.c
new file mode 100644
index 000000000000..7f9594cfa9cb
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/consume_immed.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace test for SCX_ENQ_IMMED via the consume path.
+ *
+ * Validates that scx_bpf_dsq_move_to_local(USER_DSQ, SCX_ENQ_IMMED) on
+ * a busy CPU triggers the IMMED slow path, re-enqueuing tasks through
+ * ops.enqueue() with SCX_TASK_REENQ_IMMED.
+ *
+ * Skipped on single-CPU systems where local DSQ contention cannot occur.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "consume_immed.bpf.skel.h"
+#include "scx_test.h"
+
+#define NUM_WORKERS		4
+#define TEST_DURATION_SEC	3
+
+static volatile bool stop_workers;
+
+static void *worker_fn(void *arg)
+{
+	while (!stop_workers) {
+		volatile unsigned long i;
+
+		for (i = 0; i < 100000UL; i++)
+			;
+		usleep(100);
+	}
+	return NULL;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct consume_immed *skel;
+
+	if (!__COMPAT_has_ksym("scx_bpf_dsq_move_to_local___v2")) {
+		fprintf(stderr,
+			"SKIP: scx_bpf_dsq_move_to_local v2 not available\n");
+		return SCX_TEST_SKIP;
+	}
+
+	skel = consume_immed__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+	SCX_ENUM_INIT(skel);
+
+	skel->rodata->test_tgid = (u32)getpid();
+
+	SCX_FAIL_IF(consume_immed__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct consume_immed *skel = ctx;
+	struct bpf_link *link;
+	pthread_t workers[NUM_WORKERS];
+	long nproc;
+	int i;
+	u64 reenq;
+
+	nproc = sysconf(_SC_NPROCESSORS_ONLN);
+	if (nproc <= 1) {
+		fprintf(stderr,
+			"SKIP: single CPU, consume IMMED slow path may not trigger\n");
+		return SCX_TEST_SKIP;
+	}
+
+	link = bpf_map__attach_struct_ops(skel->maps.consume_immed_ops);
+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+	stop_workers = false;
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(pthread_create(&workers[i], NULL, worker_fn, NULL),
+			    "Failed to create worker %d", i);
+	}
+
+	sleep(TEST_DURATION_SEC);
+
+	reenq = skel->bss->nr_consume_immed_reenq;
+
+	stop_workers = true;
+	for (i = 0; i < NUM_WORKERS; i++)
+		pthread_join(workers[i], NULL);
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+	SCX_GT(reenq, 0);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct consume_immed *skel = ctx;
+
+	consume_immed__destroy(skel);
+}
+
+struct scx_test consume_immed = {
+	.name		= "consume_immed",
+	.description	= "Verify SCX_ENQ_IMMED slow path via "
+			  "scx_bpf_dsq_move_to_local() consume path",
+	.setup		= setup,
+	.run		= run,
+	.cleanup	= cleanup,
+};
+REGISTER_SCX_TEST(&consume_immed)
diff --git a/tools/testing/selftests/sched_ext/dsq_reenq.bpf.c b/tools/testing/selftests/sched_ext/dsq_reenq.bpf.c
new file mode 100644
index 000000000000..750bb10508df
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dsq_reenq.bpf.c
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Validate scx_bpf_dsq_reenq() semantics on user DSQs.
+ *
+ * A BPF timer periodically calls scx_bpf_dsq_reenq() on a user DSQ,
+ * causing tasks to be re-enqueued through ops.enqueue() with SCX_ENQ_REENQ
+ * set and SCX_TASK_REENQ_KFUNC recorded in p->scx.flags.
+ *
+ * The test verifies:
+ *  - scx_bpf_dsq_reenq() triggers ops.enqueue() with SCX_ENQ_REENQ
+ *  - The reenqueue reason is SCX_TASK_REENQ_KFUNC (bit 12 set)
+ *  - Tasks are correctly re-dispatched after reenqueue
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+#define USER_DSQ	0
+
+/*
+ * SCX_TASK_REENQ_REASON_MASK and SCX_TASK_REENQ_KFUNC are exported via
+ * vmlinux BTF as part of enum scx_ent_flags.
+ */
+
+/* 5ms timer interval */
+#define REENQ_TIMER_NS		(5 * 1000 * 1000ULL)
+
+/*
+ * Number of times ops.enqueue() was called with SCX_ENQ_REENQ set and
+ * SCX_TASK_REENQ_KFUNC recorded in p->scx.flags.
+ */
+u64 nr_reenq_kfunc;
+
+struct reenq_timer_val {
+	struct bpf_timer timer;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, u32);
+	__type(value, struct reenq_timer_val);
+} reenq_timer SEC(".maps");
+
+/*
+ * Timer callback: reenqueue all tasks currently sitting on USER_DSQ back
+ * through ops.enqueue() with SCX_ENQ_REENQ | SCX_TASK_REENQ_KFUNC.
+ */
+static int reenq_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+	scx_bpf_dsq_reenq(USER_DSQ, 0);
+	bpf_timer_start(timer, REENQ_TIMER_NS, 0);
+	return 0;
+}
+
+void BPF_STRUCT_OPS(dsq_reenq_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	/*
+	 * If this is a kfunc-triggered reenqueue, verify that
+	 * SCX_TASK_REENQ_KFUNC is recorded in p->scx.flags.
+	 */
+	if (enq_flags & SCX_ENQ_REENQ) {
+		u32 reason = p->scx.flags & SCX_TASK_REENQ_REASON_MASK;
+
+		if (reason == SCX_TASK_REENQ_KFUNC)
+			__sync_fetch_and_add(&nr_reenq_kfunc, 1);
+	}
+
+	/*
+	 * Always dispatch to USER_DSQ so the timer can reenqueue tasks again
+	 * on the next tick.
+	 */
+	scx_bpf_dsq_insert(p, USER_DSQ, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(dsq_reenq_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_dsq_move_to_local(USER_DSQ, 0);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dsq_reenq_init)
+{
+	struct reenq_timer_val *tval;
+	u32 key = 0;
+	s32 ret;
+
+	ret = scx_bpf_create_dsq(USER_DSQ, -1);
+	if (ret)
+		return ret;
+
+	if (!__COMPAT_has_generic_reenq()) {
+		scx_bpf_error("scx_bpf_dsq_reenq() not available");
+		return -EOPNOTSUPP;
+	}
+
+	tval = bpf_map_lookup_elem(&reenq_timer, &key);
+	if (!tval)
+		return -ESRCH;
+
+	bpf_timer_init(&tval->timer, &reenq_timer, CLOCK_MONOTONIC);
+	bpf_timer_set_callback(&tval->timer, reenq_timerfn);
+
+	return bpf_timer_start(&tval->timer, REENQ_TIMER_NS, 0);
+}
+
+void BPF_STRUCT_OPS(dsq_reenq_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(dsq_reenq_ops,
+	       .enqueue		= (void *)dsq_reenq_enqueue,
+	       .dispatch	= (void *)dsq_reenq_dispatch,
+	       .init		= (void *)dsq_reenq_init,
+	       .exit		= (void *)dsq_reenq_exit,
+	       .timeout_ms	= 10000,
+	       .name		= "dsq_reenq")
diff --git a/tools/testing/selftests/sched_ext/dsq_reenq.c b/tools/testing/selftests/sched_ext/dsq_reenq.c
new file mode 100644
index 000000000000..b0d99f9c9a9a
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dsq_reenq.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace test for scx_bpf_dsq_reenq() semantics.
+ *
+ * Attaches the dsq_reenq BPF scheduler, runs workload threads that
+ * sleep and yield to keep tasks on USER_DSQ, waits for the BPF timer
+ * to fire several times, then verifies that at least one kfunc-triggered
+ * reenqueue was observed (ops.enqueue() called with SCX_ENQ_REENQ and
+ * SCX_TASK_REENQ_KFUNC in p->scx.flags).
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <unistd.h>
+#include <pthread.h>
+#include "dsq_reenq.bpf.skel.h"
+#include "scx_test.h"
+
+#define NUM_WORKERS	4
+#define TEST_DURATION_SEC 3
+
+static volatile bool stop_workers;
+static pthread_t workers[NUM_WORKERS];
+
+static void *worker_fn(void *arg)
+{
+	while (!stop_workers) {
+		usleep(500);
+		sched_yield();
+	}
+	return NULL;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct dsq_reenq *skel;
+
+	if (!__COMPAT_has_ksym("scx_bpf_dsq_reenq")) {
+		fprintf(stderr, "SKIP: scx_bpf_dsq_reenq() not available\n");
+		return SCX_TEST_SKIP;
+	}
+
+	skel = dsq_reenq__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+	SCX_ENUM_INIT(skel);
+	SCX_FAIL_IF(dsq_reenq__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct dsq_reenq *skel = ctx;
+	struct bpf_link *link;
+	int i;
+
+	link = bpf_map__attach_struct_ops(skel->maps.dsq_reenq_ops);
+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+	stop_workers = false;
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(pthread_create(&workers[i], NULL, worker_fn, NULL),
+			    "Failed to create worker %d", i);
+	}
+
+	sleep(TEST_DURATION_SEC);
+
+	stop_workers = true;
+	for (i = 0; i < NUM_WORKERS; i++)
+		pthread_join(workers[i], NULL);
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+	SCX_GT(skel->bss->nr_reenq_kfunc, 0);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct dsq_reenq *skel = ctx;
+
+	dsq_reenq__destroy(skel);
+}
+
+struct scx_test dsq_reenq = {
+	.name		= "dsq_reenq",
+	.description	= "Verify scx_bpf_dsq_reenq() triggers enqueue with "
+			  "SCX_ENQ_REENQ and SCX_TASK_REENQ_KFUNC reason",
+	.setup		= setup,
+	.run		= run,
+	.cleanup	= cleanup,
+};
+REGISTER_SCX_TEST(&dsq_reenq)
diff --git a/tools/testing/selftests/sched_ext/enq_immed.bpf.c b/tools/testing/selftests/sched_ext/enq_immed.bpf.c
new file mode 100644
index 000000000000..805dd0256218
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/enq_immed.bpf.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Validate SCX_ENQ_IMMED fast/slow path semantics via the direct insert path.
+ *
+ * With SCX_OPS_ALWAYS_ENQ_IMMED set, the kernel automatically adds
+ * SCX_ENQ_IMMED to every local DSQ dispatch.  When the target CPU's local
+ * DSQ already has tasks queued (dsq->nr > 1), the kernel re-enqueues the
+ * task through ops.enqueue() with SCX_ENQ_REENQ and SCX_TASK_REENQ_IMMED
+ * recorded in p->scx.flags (the "slow path").
+ *
+ * Worker threads are pinned to CPU 0 via SCX_DSQ_LOCAL_ON to guarantee
+ * local DSQ contention.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/* Set by userspace to identify the test process group. */
+const volatile u32 test_tgid;
+
+/*
+ * SCX_TASK_REENQ_REASON_MASK and SCX_TASK_REENQ_IMMED are exported via
+ * vmlinux BTF as part of enum scx_ent_flags.
+ */
+
+u64 nr_immed_reenq;
+
+void BPF_STRUCT_OPS(enq_immed_enqueue, struct task_struct *p, u64 enq_flags)
+{
+	if (enq_flags & SCX_ENQ_REENQ) {
+		u32 reason = p->scx.flags & SCX_TASK_REENQ_REASON_MASK;
+
+		if (reason == SCX_TASK_REENQ_IMMED)
+			__sync_fetch_and_add(&nr_immed_reenq, 1);
+	}
+
+	if (p->tgid == (pid_t)test_tgid)
+		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | 0, SCX_SLICE_DFL,
+				   enq_flags);
+	else
+		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
+				   enq_flags);
+}
+
+void BPF_STRUCT_OPS(enq_immed_dispatch, s32 cpu, struct task_struct *prev)
+{
+	scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL, 0);
+}
+
+void BPF_STRUCT_OPS(enq_immed_exit, struct scx_exit_info *ei)
+{
+	UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(enq_immed_ops,
+	       .enqueue		= (void *)enq_immed_enqueue,
+	       .dispatch	= (void *)enq_immed_dispatch,
+	       .exit		= (void *)enq_immed_exit,
+	       .flags		= SCX_OPS_ALWAYS_ENQ_IMMED,
+	       .name		= "enq_immed")
diff --git a/tools/testing/selftests/sched_ext/enq_immed.c b/tools/testing/selftests/sched_ext/enq_immed.c
new file mode 100644
index 000000000000..44681e41975d
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/enq_immed.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace test for SCX_ENQ_IMMED via the direct insert path.
+ *
+ * Validates that dispatching tasks to a busy CPU's local DSQ with
+ * SCX_OPS_ALWAYS_ENQ_IMMED triggers the IMMED slow path: the kernel
+ * re-enqueues the task through ops.enqueue() with SCX_TASK_REENQ_IMMED.
+ *
+ * Skipped on single-CPU systems where local DSQ contention cannot occur.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "enq_immed.bpf.skel.h"
+#include "scx_test.h"
+
+#define NUM_WORKERS		4
+#define TEST_DURATION_SEC	3
+
+static volatile bool stop_workers;
+
+static void *worker_fn(void *arg)
+{
+	while (!stop_workers) {
+		volatile unsigned long i;
+
+		for (i = 0; i < 100000UL; i++)
+			;
+		usleep(100);
+	}
+	return NULL;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+	struct enq_immed *skel;
+	u64 v;
+
+	if (!__COMPAT_read_enum("scx_ops_flags",
+				"SCX_OPS_ALWAYS_ENQ_IMMED", &v)) {
+		fprintf(stderr,
+			"SKIP: SCX_OPS_ALWAYS_ENQ_IMMED not available\n");
+		return SCX_TEST_SKIP;
+	}
+
+	skel = enq_immed__open();
+	SCX_FAIL_IF(!skel, "Failed to open");
+	SCX_ENUM_INIT(skel);
+
+	skel->rodata->test_tgid = (u32)getpid();
+
+	SCX_FAIL_IF(enq_immed__load(skel), "Failed to load skel");
+
+	*ctx = skel;
+	return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+	struct enq_immed *skel = ctx;
+	struct bpf_link *link;
+	pthread_t workers[NUM_WORKERS];
+	long nproc;
+	int i;
+	u64 reenq;
+
+	nproc = sysconf(_SC_NPROCESSORS_ONLN);
+	if (nproc <= 1) {
+		fprintf(stderr,
+			"SKIP: single CPU, IMMED slow path may not trigger\n");
+		return SCX_TEST_SKIP;
+	}
+
+	link = bpf_map__attach_struct_ops(skel->maps.enq_immed_ops);
+	SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+	stop_workers = false;
+	for (i = 0; i < NUM_WORKERS; i++) {
+		SCX_FAIL_IF(pthread_create(&workers[i], NULL, worker_fn, NULL),
+			    "Failed to create worker %d", i);
+	}
+
+	sleep(TEST_DURATION_SEC);
+
+	reenq = skel->bss->nr_immed_reenq;
+
+	stop_workers = true;
+	for (i = 0; i < NUM_WORKERS; i++)
+		pthread_join(workers[i], NULL);
+
+	bpf_link__destroy(link);
+
+	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+	SCX_GT(reenq, 0);
+
+	return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+	struct enq_immed *skel = ctx;
+
+	enq_immed__destroy(skel);
+}
+
+struct scx_test enq_immed = {
+	.name		= "enq_immed",
+	.description	= "Verify SCX_ENQ_IMMED slow path via direct insert "
+			  "with SCX_OPS_ALWAYS_ENQ_IMMED",
+	.setup		= setup,
+	.run		= run,
+	.cleanup	= cleanup,
+};
+REGISTER_SCX_TEST(&enq_immed)

From cb251eae7b0aec8a7924fb27bcb5b0388a3706bc Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Mon, 23 Mar 2026 23:17:33 +0800
Subject: [PATCH 099/134] tools/sched_ext: Add scx_bpf_sub_dispatch() compat
 wrapper

Add a transparent compatibility wrapper for the scx_bpf_sub_dispatch()
kfunc in compat.bpf.h. This allows BPF schedulers using the sub-sched
dispatch feature to build and run on older kernels that lack the kfunc.

To avoid requiring code changes in individual schedulers, the
transparent wrapper pattern is used instead of a __COMPAT prefix. The
kfunc is declared with a ___compat suffix, while the static inline
wrapper retains the original scx_bpf_sub_dispatch() name.

When the kfunc is unavailable, the wrapper safely falls back to
returning false. This is acceptable because the dispatch path cannot
do anything useful without underlying sub-sched support anyway.

Tested scx_qmap on v6.14 successfully.

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/include/scx/common.bpf.h |  1 -
 tools/sched_ext/include/scx/compat.bpf.h | 13 +++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index a63a98a96b86..19459dedde41 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -101,7 +101,6 @@ struct rq *scx_bpf_locked_rq(void) __ksym;
 struct task_struct *scx_bpf_cpu_curr(s32 cpu) __ksym __weak;
 u64 scx_bpf_now(void) __ksym __weak;
 void scx_bpf_events(struct scx_event_stats *events, size_t events__sz) __ksym __weak;
-bool scx_bpf_sub_dispatch(u64 cgroup_id) __ksym __weak;
 
 /*
  * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 83b3425e63b2..654b566bd94a 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -108,6 +108,19 @@ static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id)
 	return p;
 }
 
+/*
+ * v7.1: scx_bpf_sub_dispatch() for sub-sched dispatch. Preserve until
+ * we drop the compat layer for older kernels that lack the kfunc.
+ */
+bool scx_bpf_sub_dispatch___compat(u64 cgroup_id) __ksym __weak;
+
+static inline bool scx_bpf_sub_dispatch(u64 cgroup_id)
+{
+	if (bpf_ksym_exists(scx_bpf_sub_dispatch___compat))
+		return scx_bpf_sub_dispatch___compat(cgroup_id);
+	return false;
+}
+
 /**
  * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on
  * in a compatible way. We will preserve this __COMPAT helper until v6.16.

From 4624211bc633481523475d0586a47c0a31c91fa4 Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Mon, 23 Mar 2026 18:48:29 +0800
Subject: [PATCH 100/134] sched_ext: Fix invalid kobj cast in scx_uevent()

When scx_alloc_and_add_sched() creates the sub-scheduler kset, it sets
sch->kobj as the parent. Because sch->kobj.kset points to scx_kset,
registering this sub-kset triggers a KOBJ_ADD uevent. The uevent walk
finds scx_kset and calls scx_uevent() with the sub-kset's kobject.

scx_uevent() unconditionally uses container_of() to cast the incoming
kobject to struct scx_sched, producing a wild pointer when the kobject
belongs to the kset itself rather than a scheduler instance. Accessing
sch->ops.name through this pointer causes a KASAN slab-out-of-bounds
read:

  BUG: KASAN: slab-out-of-bounds in string+0x3b6/0x4c0
  Read of size 1 at addr ffff888004d04348 by task scx_enable_help/748

  Call Trace:
   string+0x3b6/0x4c0
   vsnprintf+0x3ec/0x1550
   add_uevent_var+0x160/0x3a0
   scx_uevent+0x22/0x30
   kobject_uevent_env+0x5dc/0x1730
   kset_register+0x192/0x280
   scx_alloc_and_add_sched+0x130d/0x1c60
   ...

Fix this by checking the kobject's ktype against scx_ktype before
performing the cast, and returning 0 for non-matching kobjects.

Tested with vng and scx_qmap without triggering any KASAN errors.

Fixes: ebeca1f930ea ("sched_ext: Introduce cgroup sub-sched support")
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 72a07eb050a3..2472231ec556 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4842,7 +4842,17 @@ static const struct kobj_type scx_ktype = {
 
 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
 {
-	const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj);
+	const struct scx_sched *sch;
+
+	/*
+	 * scx_uevent() can be reached by both scx_sched kobjects (scx_ktype)
+	 * and sub-scheduler kset kobjects (kset_ktype) through the parent
+	 * chain walk. Filter out the latter to avoid invalid casts.
+	 */
+	if (kobj->ktype != &scx_ktype)
+		return 0;
+
+	sch = container_of(kobj, struct scx_sched, kobj);
 
 	return add_uevent_var(env, "SCXOPS=%s", sch->ops.name);
 }

From 9edd04c4189e047d4b4f6efd1255e2a32cb167b8 Mon Sep 17 00:00:00 2001
From: zhidao su <suzhidao@xiaomi.com>
Date: Wed, 25 Mar 2026 02:47:18 +0800
Subject: [PATCH 101/134] docs: Raise minimum pahole version to 1.26 for
 KF_IMPLICIT_ARGS kfuncs

Since Linux 7.0, kfuncs annotated with KF_IMPLICIT_ARGS require pahole
v1.26 or later.  Without it, such kfuncs will have incorrect BTF
prototypes in vmlinux, causing BPF programs to fail to load with a
"func_proto incompatible with vmlinux" error.  Many sched_ext kfuncs
are affected (e.g. scx_bpf_create_dsq, scx_bpf_kick_cpu).

The root cause: scripts/Makefile.btf passes --btf_features=decl_tag_kfuncs
to pahole only when pahole >= 1.26.  Without that flag, pahole emits no
DECL_TAG BTF entries for __bpf_kfunc-annotated functions.  As a result,
resolve_btfids/main.c::collect_kfuncs() finds no bpf_kfunc DECL_TAGs,
short-circuits at line 1002, and btf2btf() never creates the _impl
variants or strips the implicit 'aux' argument from the visible proto.
The vmlinux BTF retains the 3-param prototype while BPF programs declare
the 2-param version, triggering the mismatch.

Raise the minimum version in the requirements table from 1.22 to 1.26
and add a note explaining the failure mode, so users understand why
their BPF programs fail on distributions shipping pahole v1.25 (e.g.
Ubuntu 24.04 LTS).

Suggested-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: zhidao su <suzhidao@xiaomi.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/process/changes.rst | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 6b373e193548..02068d72a101 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -38,7 +38,7 @@ bash                   4.2              bash --version
 binutils               2.30             ld -v
 flex                   2.5.35           flex --version
 bison                  2.0              bison --version
-pahole                 1.22             pahole --version
+pahole                 1.26             pahole --version
 util-linux             2.10o            mount --version
 kmod                   13               depmod -V
 e2fsprogs              1.41.4           e2fsck -V
@@ -145,6 +145,11 @@ Since Linux 5.2, if CONFIG_DEBUG_INFO_BTF is selected, the build system
 generates BTF (BPF Type Format) from DWARF in vmlinux, a bit later from kernel
 modules as well.  This requires pahole v1.22 or later.
 
+Since Linux 7.0, kfuncs annotated with KF_IMPLICIT_ARGS require pahole v1.26
+or later.  Without it, such kfuncs will have incorrect BTF prototypes in
+vmlinux, causing BPF programs to fail to load with a "func_proto incompatible
+with vmlinux" error.  Many sched_ext kfuncs are affected.
+
 It is found in the 'dwarves' or 'pahole' distro packages or from
 https://fedorapeople.org/~acme/dwarves/.
 

From 7ef26d62f3fd841fd18ab6fa93a6bc2de9d17dff Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Wed, 25 Mar 2026 03:14:04 +0800
Subject: [PATCH 102/134] selftests/sched_ext: Skip rt_stall on older kernels
 and list skipped tests

rt_stall tests the ext DL server which was introduced in commit
cd959a356205 ("sched_ext: Add a DL server for sched_ext tasks").
On older kernels that lack this feature, the test calls ksft_exit_fail()
internally which terminates the entire runner process, preventing
subsequent tests from running.

Add a guard in setup() that checks for the ext_server field in struct rq
via __COMPAT_struct_has_field() and returns SCX_TEST_SKIP if not present.

Also print the names of skipped tests in the results summary, mirroring
the existing behavior for failed tests.

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/rt_stall.c | 5 +++++
 tools/testing/selftests/sched_ext/runner.c   | 8 +++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
index 81ea9b4883e5..a5041fc2e44f 100644
--- a/tools/testing/selftests/sched_ext/rt_stall.c
+++ b/tools/testing/selftests/sched_ext/rt_stall.c
@@ -119,6 +119,11 @@ static enum scx_test_status setup(void **ctx)
 {
 	struct rt_stall *skel;
 
+	if (!__COMPAT_struct_has_field("rq", "ext_server")) {
+		fprintf(stderr, "SKIP: ext DL server not supported\n");
+		return SCX_TEST_SKIP;
+	}
+
 	skel = rt_stall__open();
 	SCX_FAIL_IF(!skel, "Failed to open");
 	SCX_ENUM_INIT(skel);
diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c
index 4c68efa1512a..d84f71eee049 100644
--- a/tools/testing/selftests/sched_ext/runner.c
+++ b/tools/testing/selftests/sched_ext/runner.c
@@ -134,6 +134,7 @@ int main(int argc, char **argv)
 {
 	const char *filter = NULL;
 	const char *failed_tests[MAX_SCX_TESTS];
+	const char *skipped_tests[MAX_SCX_TESTS];
 	unsigned testnum = 0, i;
 	unsigned passed = 0, skipped = 0, failed = 0;
 	int opt;
@@ -199,7 +200,7 @@ int main(int argc, char **argv)
 			passed++;
 			break;
 		case SCX_TEST_SKIP:
-			skipped++;
+			skipped_tests[skipped++] = test->name;
 			break;
 		case SCX_TEST_FAIL:
 			failed_tests[failed++] = test->name;
@@ -211,6 +212,11 @@ int main(int argc, char **argv)
 	printf("PASSED:  %u\n", passed);
 	printf("SKIPPED: %u\n", skipped);
 	printf("FAILED:  %u\n", failed);
+	if (skipped > 0) {
+		printf("\nSkipped tests:\n");
+		for (i = 0; i < skipped; i++)
+			printf("  - %s\n", skipped_tests[i]);
+	}
 	if (failed > 0) {
 		printf("\nFailed tests:\n");
 		for (i = 0; i < failed; i++)

From 60d4b17e886a90e707eaa06f8b07539f603cec5b Mon Sep 17 00:00:00 2001
From: Zqiang <qiang.zhang@linux.dev>
Date: Wed, 25 Mar 2026 11:11:00 +0800
Subject: [PATCH 103/134] sched_ext: Choose the right sch->ops.name to output
 in the print_scx_info()

The print_scx_info() always output scx_root structure's->ops.name,
but for built with CONFIG_EXT_SUB_SCHED=y kernels, the tasks may be
attach an sub scx_sched structure. this commit therefore use the
scx_task_sched_rcu() to correctly get scx_sched structure to output
ops.name, and drop state check.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Zqiang <qiang.zhang@linux.dev>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 2472231ec556..551bfb99157d 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -7680,14 +7680,18 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
  */
 void print_scx_info(const char *log_lvl, struct task_struct *p)
 {
-	struct scx_sched *sch = scx_root;
+	struct scx_sched *sch;
 	enum scx_enable_state state = scx_enable_state();
 	const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
 	char runnable_at_buf[22] = "?";
 	struct sched_class *class;
 	unsigned long runnable_at;
 
-	if (state == SCX_DISABLED)
+	guard(rcu)();
+
+	sch = scx_task_sched_rcu(p);
+
+	if (!sch)
 		return;
 
 	/*

From 6bf36c68b0a23afba108920d21c1c108f83371d6 Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Wed, 25 Mar 2026 12:51:48 +0800
Subject: [PATCH 104/134] tools/sched_ext: Regenerate autogen enum headers

Regenerate enum_defs.autogen.h, enums.autogen.h and enums.autogen.bpf.h
using the upstream scripts [1][2] to sync with recent kernel enum
additions.

[1] https://github.com/sched-ext/scx/blob/main/scripts/gen_enum_defs.py
[2] https://github.com/sched-ext/scx/blob/main/scripts/gen_enums.py

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/include/scx/enum_defs.autogen.h | 10 ++++++++++
 tools/sched_ext/include/scx/enums.autogen.bpf.h |  9 +++++++++
 tools/sched_ext/include/scx/enums.autogen.h     |  3 +++
 3 files changed, 22 insertions(+)

diff --git a/tools/sched_ext/include/scx/enum_defs.autogen.h b/tools/sched_ext/include/scx/enum_defs.autogen.h
index 40b30dad8ede..da4b459820fd 100644
--- a/tools/sched_ext/include/scx/enum_defs.autogen.h
+++ b/tools/sched_ext/include/scx/enum_defs.autogen.h
@@ -20,6 +20,7 @@
 #define HAVE_SCX_BYPASS_LB_DONOR_PCT
 #define HAVE_SCX_BYPASS_LB_MIN_DELTA_DIV
 #define HAVE_SCX_BYPASS_LB_BATCH
+#define HAVE_SCX_REENQ_LOCAL_MAX_REPEAT
 #define HAVE_SCX_SUB_MAX_DEPTH
 #define HAVE_SCX_CPU_PREEMPT_RT
 #define HAVE_SCX_CPU_PREEMPT_DL
@@ -51,18 +52,21 @@
 #define HAVE_SCX_ENQ_HEAD
 #define HAVE_SCX_ENQ_CPU_SELECTED
 #define HAVE_SCX_ENQ_PREEMPT
+#define HAVE_SCX_ENQ_IMMED
 #define HAVE_SCX_ENQ_REENQ
 #define HAVE_SCX_ENQ_LAST
 #define HAVE___SCX_ENQ_INTERNAL_MASK
 #define HAVE_SCX_ENQ_CLEAR_OPSS
 #define HAVE_SCX_ENQ_DSQ_PRIQ
 #define HAVE_SCX_ENQ_NESTED
+#define HAVE_SCX_ENQ_GDSQ_FALLBACK
 #define HAVE_SCX_TASK_DSQ_ON_PRIQ
 #define HAVE_SCX_TASK_QUEUED
 #define HAVE_SCX_TASK_IN_CUSTODY
 #define HAVE_SCX_TASK_RESET_RUNNABLE_AT
 #define HAVE_SCX_TASK_DEQD_FOR_SLEEP
 #define HAVE_SCX_TASK_SUB_INIT
+#define HAVE_SCX_TASK_IMMED
 #define HAVE_SCX_TASK_STATE_SHIFT
 #define HAVE_SCX_TASK_STATE_BITS
 #define HAVE_SCX_TASK_STATE_MASK
@@ -75,6 +79,8 @@
 #define HAVE_SCX_TASK_REENQ_REASON_MASK
 #define HAVE_SCX_TASK_REENQ_NONE
 #define HAVE_SCX_TASK_REENQ_KFUNC
+#define HAVE_SCX_TASK_REENQ_IMMED
+#define HAVE_SCX_TASK_REENQ_PREEMPTED
 #define HAVE_SCX_TASK_CURSOR
 #define HAVE_SCX_ECODE_RSN_HOTPLUG
 #define HAVE_SCX_ECODE_RSN_CGROUP_OFFLINE
@@ -114,6 +120,7 @@
 #define HAVE_SCX_OPS_ENQ_MIGRATION_DISABLED
 #define HAVE_SCX_OPS_ALLOW_QUEUED_WAKEUP
 #define HAVE_SCX_OPS_BUILTIN_IDLE_PER_NODE
+#define HAVE_SCX_OPS_ALWAYS_ENQ_IMMED
 #define HAVE_SCX_OPS_ALL_FLAGS
 #define HAVE___SCX_OPS_INTERNAL_MASK
 #define HAVE_SCX_OPS_HAS_CPU_PREEMPT
@@ -131,6 +138,9 @@
 #define HAVE_SCX_REENQ_ANY
 #define HAVE___SCX_REENQ_FILTER_MASK
 #define HAVE___SCX_REENQ_USER_MASK
+#define HAVE_SCX_REENQ_TSR_RQ_OPEN
+#define HAVE_SCX_REENQ_TSR_NOT_FIRST
+#define HAVE___SCX_REENQ_TSR_MASK
 #define HAVE_SCX_RQ_ONLINE
 #define HAVE_SCX_RQ_CAN_STOP_TICK
 #define HAVE_SCX_RQ_BAL_KEEP
diff --git a/tools/sched_ext/include/scx/enums.autogen.bpf.h b/tools/sched_ext/include/scx/enums.autogen.bpf.h
index 5da50f937684..dafccbb6b69d 100644
--- a/tools/sched_ext/include/scx/enums.autogen.bpf.h
+++ b/tools/sched_ext/include/scx/enums.autogen.bpf.h
@@ -67,6 +67,12 @@ const volatile u64 __SCX_TASK_RESET_RUNNABLE_AT __weak;
 const volatile u64 __SCX_TASK_DEQD_FOR_SLEEP __weak;
 #define SCX_TASK_DEQD_FOR_SLEEP __SCX_TASK_DEQD_FOR_SLEEP
 
+const volatile u64 __SCX_TASK_SUB_INIT __weak;
+#define SCX_TASK_SUB_INIT __SCX_TASK_SUB_INIT
+
+const volatile u64 __SCX_TASK_IMMED __weak;
+#define SCX_TASK_IMMED __SCX_TASK_IMMED
+
 const volatile u64 __SCX_TASK_STATE_SHIFT __weak;
 #define SCX_TASK_STATE_SHIFT __SCX_TASK_STATE_SHIFT
 
@@ -115,6 +121,9 @@ const volatile u64 __SCX_ENQ_HEAD __weak;
 const volatile u64 __SCX_ENQ_PREEMPT __weak;
 #define SCX_ENQ_PREEMPT __SCX_ENQ_PREEMPT
 
+const volatile u64 __SCX_ENQ_IMMED __weak;
+#define SCX_ENQ_IMMED __SCX_ENQ_IMMED
+
 const volatile u64 __SCX_ENQ_REENQ __weak;
 #define SCX_ENQ_REENQ __SCX_ENQ_REENQ
 
diff --git a/tools/sched_ext/include/scx/enums.autogen.h b/tools/sched_ext/include/scx/enums.autogen.h
index fc9a7a4d9dea..bbd4901f4fce 100644
--- a/tools/sched_ext/include/scx/enums.autogen.h
+++ b/tools/sched_ext/include/scx/enums.autogen.h
@@ -26,6 +26,8 @@
 	SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_QUEUED); \
 	SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_RESET_RUNNABLE_AT); \
 	SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_DEQD_FOR_SLEEP); \
+	SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_SUB_INIT); \
+	SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_IMMED); \
 	SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_STATE_SHIFT); \
 	SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_STATE_BITS); \
 	SCX_ENUM_SET(skel, scx_ent_flags, SCX_TASK_STATE_MASK); \
@@ -42,6 +44,7 @@
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_WAKEUP); \
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_HEAD); \
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_PREEMPT); \
+	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_IMMED); \
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_REENQ); \
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_LAST); \
 	SCX_ENUM_SET(skel, scx_enq_flags, SCX_ENQ_CLEAR_OPSS); \

From a313357a346839d40b3a4dec393c71bf30cbb34c Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Wed, 25 Mar 2026 22:21:00 +0100
Subject: [PATCH 105/134] sched_ext: Documentation: Clarify ops.dispatch() role
 in task lifecycle

ops.dispatch() is invoked when a CPU becomes available. This can occur
when a task voluntarily yields the CPU, exhausts its time slice, or is
preempted for other reasons.

If the task is still runnable, refilling its time slice in
ops.dispatch() (either by the BPF scheduler or the sched_ext core)
allows it to continue running without triggering ops.stopping().
However, this behavior is not clearly reflected in the current task
lifecycle diagram.

Update the diagram to better represent this interaction.

Fixes: 9465f44d2df2 ("sched_ext: Documentation: Clarify time slice handling in task lifecycle")
Cc: stable@vger.kernel.org # v6.17+
Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/scheduler/sched-ext.rst | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 9e4dbabc03c0..404b4e4c33f7 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -433,13 +433,15 @@ by a sched_ext scheduler:
                 ops.dequeue(); /* Exiting BPF scheduler */
             }
             ops.running();      /* Task starts running on its assigned CPU */
-            while (task->scx.slice > 0 && task is runnable)
-                ops.tick();     /* Called every 1/HZ seconds */
+
+            while task_is_runnable(p) {
+                while (task->scx.slice > 0 && task_is_runnable(p))
+                    ops.tick();     /* Called every 1/HZ seconds */
+
+                ops.dispatch();     /* task->scx.slice can be refilled */
+            }
+
             ops.stopping();     /* Task stops running (time slice expires or wait) */
-
-            /* Task's CPU becomes available */
-
-            ops.dispatch();     /* task->scx.slice can be refilled */
         }
 
         ops.quiescent();        /* Task releases its assigned CPU (wait) */

From 3d6379196d5fc9698b683ec40ffd9452d2183c4a Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Thu, 26 Mar 2026 09:29:03 +0800
Subject: [PATCH 106/134] sched_ext: Fix missing return after scx_error() in
 scx_dsq_move()

When scx_bpf_dsq_move[_vtime]() is called on a task that belongs to a
different scheduler, scx_error() is invoked to flag the violation.
scx_error() schedules an asynchronous scheduler teardown via irq_work
and returns immediately, so execution falls through and the DSQ move
proceeds on a cross-scheduler task regardless, potentially corrupting
DSQ state.

Add the missing return false so the function exits right after
reporting the error, consistent with the other early-exit checks in
the same function (e.g. scx_vet_enq_flags() failure at the top).

Fixes: bb4d9fd55158 ("sched_ext: scx_dsq_move() should validate the task belongs to the right scheduler")
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 551bfb99157d..a5d8871ac865 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8094,6 +8094,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	if (unlikely(!scx_task_on_sched(sch, p))) {
 		scx_error(sch, "scx_bpf_dsq_move[_vtime]() on %s[%d] but the task belongs to a different scheduler",
 			  p->comm, p->pid);
+		return false;
 	}
 
 	/*

From 3eb8f022919187e2fe786f677569d2bd5a0b1915 Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Thu, 26 Mar 2026 09:48:27 +0800
Subject: [PATCH 107/134] sched_ext: Fix missing SCX_EV_SUB_BYPASS_DISPATCH
 aggregation in scx_read_events()

025b1bd41965 introduced SCX_EV_SUB_BYPASS_DISPATCH to track scheduling
of bypassed descendant tasks, and correctly increments it per-CPU and
displays it in sysfs and dump output. However, scx_read_events() which
aggregates per-CPU counters into a summary was not updated to include
this event, causing it to always read as zero in sysfs, in debug dumps,
and via the scx_bpf_events() kfunc.

Add the missing scx_agg_event() call for SCX_EV_SUB_BYPASS_DISPATCH.

Fixes: 025b1bd41965 ("sched_ext: Implement hierarchical bypass mode")
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index a5d8871ac865..7043fb941130 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -9409,6 +9409,7 @@ static void scx_read_events(struct scx_sched *sch, struct scx_event_stats *event
 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_DISPATCH);
 		scx_agg_event(events, e_cpu, SCX_EV_BYPASS_ACTIVATE);
 		scx_agg_event(events, e_cpu, SCX_EV_INSERT_NOT_OWNED);
+		scx_agg_event(events, e_cpu, SCX_EV_SUB_BYPASS_DISPATCH);
 	}
 }
 

From f546c77038ab898726e7344255217fbec382b97f Mon Sep 17 00:00:00 2001
From: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
Date: Thu, 26 Mar 2026 10:51:55 +0800
Subject: [PATCH 108/134] tools/sched_ext: scx_pair: fix pair_ctx indexing for
 CPU pairs

scx_pair sizes pair_ctx to nr_cpu_ids / 2, so valid pair_ctx keys are
dense pair indexes in the range [0, nr_cpu_ids / 2).

However, the userspace setup code stores pair_id as the first CPU number
in each pair. On an 8-CPU system with "-S 1", that produces pair IDs
0, 2, 4 and 6 for pairs [0,1], [2,3], [4,5] and [6,7]. CPUs in the
latter half then look up pair_ctx with out-of-range keys and the BPF
scheduler aborts with:

EXIT: scx_bpf_error (scx_pair.bpf.c:328: failed to lookup pairc and
in_pair_mask for cpu[5])

Assign pair_id using a dense pair counter instead so that each CPU pair
maps to a valid pair_ctx entry. Besides, reject odd CPU configuration, as
scx_pair requires all CPUs to be paired.

Fixes: f0262b102c7c ("tools/sched_ext: add scx_pair scheduler")
Signed-off-by: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_pair.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c
index 61fd86b44c40..41b136d43a55 100644
--- a/tools/sched_ext/scx_pair.c
+++ b/tools/sched_ext/scx_pair.c
@@ -48,6 +48,7 @@ int main(int argc, char **argv)
 	struct bpf_link *link;
 	__u64 seq = 0, ecode;
 	__s32 stride, i, opt, outer_fd;
+	__u32 pair_id = 0;
 
 	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
@@ -82,6 +83,14 @@ restart:
 		scx_pair__destroy(skel);
 		return -1;
 	}
+
+	if (skel->rodata->nr_cpu_ids & 1) {
+		fprintf(stderr, "scx_pair requires an even CPU count, got %u\n",
+			skel->rodata->nr_cpu_ids);
+		scx_pair__destroy(skel);
+		return -1;
+	}
+
 	bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2);
 
 	/* Resize arrays so their element count is equal to cpu count. */
@@ -109,10 +118,11 @@ restart:
 
 		skel->rodata_pair_cpu->pair_cpu[i] = j;
 		skel->rodata_pair_cpu->pair_cpu[j] = i;
-		skel->rodata_pair_id->pair_id[i] = i;
-		skel->rodata_pair_id->pair_id[j] = i;
+		skel->rodata_pair_id->pair_id[i] = pair_id;
+		skel->rodata_pair_id->pair_id[j] = pair_id;
 		skel->rodata_in_pair_idx->in_pair_idx[i] = 0;
 		skel->rodata_in_pair_idx->in_pair_idx[j] = 1;
+		pair_id++;
 
 		printf("[%d, %d] ", i, j);
 	}

From ea70239320394266ec8ccf43ff3a6415e43b8163 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 26 Mar 2026 10:07:42 -1000
Subject: [PATCH 109/134] tools/sched_ext: Remove redundant SCX_ENQ_IMMED
 compat definition

compat.bpf.h defined a fallback SCX_ENQ_IMMED macro using
__COMPAT_ENUM_OR_ZERO(). After 6bf36c68b0a2 ("tools/sched_ext:
Regenerate autogen enum headers") added SCX_ENQ_IMMED to the autogen
headers, including both triggers -Wmacro-redefined warnings.

The autogen definition through const volatile __weak already resolves to
0 on older kernels, providing the same backward compatibility. Remove
the now-redundant compat fallback.

Fixes: 6bf36c68b0a2 ("tools/sched_ext: Regenerate autogen enum headers")
Link: https://lore.kernel.org/r/20260326100313.338388-1-zhaomzhao@126.com
Reported-by: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/include/scx/compat.bpf.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
index 654b566bd94a..8977b5a2caa1 100644
--- a/tools/sched_ext/include/scx/compat.bpf.h
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -422,11 +422,6 @@ static inline void scx_bpf_dsq_reenq(u64 dsq_id, u64 reenq_flags)
 		scx_bpf_error("kernel too old to reenqueue foreign local or user DSQs");
 }
 
-/*
- * v7.1: %SCX_ENQ_IMMED.
- */
-#define SCX_ENQ_IMMED	__COMPAT_ENUM_OR_ZERO(enum scx_enq_flags, SCX_ENQ_IMMED)
-
 /*
  * Define sched_ext_ops. This may be expanded to define multiple variants for
  * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().

From 1f51b42bf9784dd012b8f0c135140b047eb5c6bf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 26 Mar 2026 14:32:07 -1000
Subject: [PATCH 110/134] Revert "docs: Raise minimum pahole version to 1.26
 for KF_IMPLICIT_ARGS kfuncs"

This reverts commit 9edd04c4189e047d4b4f6efd1255e2a32cb167b8.

The doc-only change is insufficient. The actual fix requires bumping the
minimum pahole version to 1.27 with proper Kconfig and Makefile.btf
changes, and should be routed through the BPF tree.

Link: https://lore.kernel.org/all/e0ca748d-3204-4160-b37d-0f76cbac8c6c@linux.dev
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/process/changes.rst | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 02068d72a101..6b373e193548 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -38,7 +38,7 @@ bash                   4.2              bash --version
 binutils               2.30             ld -v
 flex                   2.5.35           flex --version
 bison                  2.0              bison --version
-pahole                 1.26             pahole --version
+pahole                 1.22             pahole --version
 util-linux             2.10o            mount --version
 kmod                   13               depmod -V
 e2fsprogs              1.41.4           e2fsck -V
@@ -145,11 +145,6 @@ Since Linux 5.2, if CONFIG_DEBUG_INFO_BTF is selected, the build system
 generates BTF (BPF Type Format) from DWARF in vmlinux, a bit later from kernel
 modules as well.  This requires pahole v1.22 or later.
 
-Since Linux 7.0, kfuncs annotated with KF_IMPLICIT_ARGS require pahole v1.26
-or later.  Without it, such kfuncs will have incorrect BTF prototypes in
-vmlinux, causing BPF programs to fail to load with a "func_proto incompatible
-with vmlinux" error.  Many sched_ext kfuncs are affected.
-
 It is found in the 'dwarves' or 'pahole' distro packages or from
 https://fedorapeople.org/~acme/dwarves/.
 

From c6f99d0ecc900ceea27eeaba6c431ae97e7b5599 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 26 Mar 2026 14:45:37 -1000
Subject: [PATCH 111/134] Revert "selftests/sched_ext: Add tests for
 SCX_ENQ_IMMED and scx_bpf_dsq_reenq()"

This reverts commit c50dcf533149.

The tests are superficial, likely AI-generated slop, and flaky. They
don't add actual value and just churn the selftests.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/Makefile    |   3 -
 .../selftests/sched_ext/consume_immed.bpf.c   |  88 -------------
 .../selftests/sched_ext/consume_immed.c       | 115 -----------------
 .../selftests/sched_ext/dsq_reenq.bpf.c       | 120 ------------------
 tools/testing/selftests/sched_ext/dsq_reenq.c |  95 --------------
 .../selftests/sched_ext/enq_immed.bpf.c       |  63 ---------
 tools/testing/selftests/sched_ext/enq_immed.c | 117 -----------------
 7 files changed, 601 deletions(-)
 delete mode 100644 tools/testing/selftests/sched_ext/consume_immed.bpf.c
 delete mode 100644 tools/testing/selftests/sched_ext/consume_immed.c
 delete mode 100644 tools/testing/selftests/sched_ext/dsq_reenq.bpf.c
 delete mode 100644 tools/testing/selftests/sched_ext/dsq_reenq.c
 delete mode 100644 tools/testing/selftests/sched_ext/enq_immed.bpf.c
 delete mode 100644 tools/testing/selftests/sched_ext/enq_immed.c

diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 84e4f69b8833..a3bbe2c7911b 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -162,11 +162,8 @@ endef
 all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog)))
 
 auto-test-targets :=			\
-	consume_immed			\
 	create_dsq			\
 	dequeue				\
-	dsq_reenq			\
-	enq_immed			\
 	enq_last_no_enq_fails		\
 	ddsp_bogus_dsq_fail		\
 	ddsp_vtimelocal_fail		\
diff --git a/tools/testing/selftests/sched_ext/consume_immed.bpf.c b/tools/testing/selftests/sched_ext/consume_immed.bpf.c
deleted file mode 100644
index 9c7808f5abe1..000000000000
--- a/tools/testing/selftests/sched_ext/consume_immed.bpf.c
+++ /dev/null
@@ -1,88 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Validate SCX_ENQ_IMMED semantics through the consume path.
- *
- * This is the orthogonal counterpart to enq_immed:
- *
- *   enq_immed:      SCX_ENQ_IMMED via scx_bpf_dsq_insert() to local DSQ
- *                   with SCX_OPS_ALWAYS_ENQ_IMMED
- *
- *   consume_immed:  SCX_ENQ_IMMED via scx_bpf_dsq_move_to_local() with
- *                   explicit SCX_ENQ_IMMED in enq_flags (requires v2 kfunc)
- *
- * Worker threads belonging to test_tgid are inserted into USER_DSQ.
- * ops.dispatch() on CPU 0 consumes from USER_DSQ with SCX_ENQ_IMMED.
- * With multiple workers competing for CPU 0, dsq->nr > 1 triggers the
- * IMMED slow path (reenqueue with SCX_TASK_REENQ_IMMED).
- *
- * Requires scx_bpf_dsq_move_to_local___v2() (v7.1+) for enq_flags support.
- */
-
-#include <scx/common.bpf.h>
-
-char _license[] SEC("license") = "GPL";
-
-UEI_DEFINE(uei);
-
-#define USER_DSQ	0
-
-/* Set by userspace to identify the test process group. */
-const volatile u32 test_tgid;
-
-/*
- * SCX_TASK_REENQ_REASON_MASK and SCX_TASK_REENQ_IMMED are exported via
- * vmlinux BTF as part of enum scx_ent_flags.
- */
-
-u64 nr_consume_immed_reenq;
-
-void BPF_STRUCT_OPS(consume_immed_enqueue, struct task_struct *p,
-		    u64 enq_flags)
-{
-	if (enq_flags & SCX_ENQ_REENQ) {
-		u32 reason = p->scx.flags & SCX_TASK_REENQ_REASON_MASK;
-
-		if (reason == SCX_TASK_REENQ_IMMED)
-			__sync_fetch_and_add(&nr_consume_immed_reenq, 1);
-	}
-
-	if (p->tgid == (pid_t)test_tgid)
-		scx_bpf_dsq_insert(p, USER_DSQ, SCX_SLICE_DFL, enq_flags);
-	else
-		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
-				   enq_flags);
-}
-
-void BPF_STRUCT_OPS(consume_immed_dispatch, s32 cpu, struct task_struct *prev)
-{
-	if (cpu == 0)
-		scx_bpf_dsq_move_to_local(USER_DSQ, SCX_ENQ_IMMED);
-	else
-		scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL, 0);
-}
-
-s32 BPF_STRUCT_OPS_SLEEPABLE(consume_immed_init)
-{
-	/*
-	 * scx_bpf_dsq_move_to_local___v2() adds the enq_flags parameter.
-	 * On older kernels the consume path cannot pass SCX_ENQ_IMMED.
-	 */
-	if (!bpf_ksym_exists(scx_bpf_dsq_move_to_local___v2)) {
-		scx_bpf_error("scx_bpf_dsq_move_to_local v2 not available");
-		return -EOPNOTSUPP;
-	}
-
-	return scx_bpf_create_dsq(USER_DSQ, -1);
-}
-
-void BPF_STRUCT_OPS(consume_immed_exit, struct scx_exit_info *ei)
-{
-	UEI_RECORD(uei, ei);
-}
-
-SCX_OPS_DEFINE(consume_immed_ops,
-	       .enqueue		= (void *)consume_immed_enqueue,
-	       .dispatch	= (void *)consume_immed_dispatch,
-	       .init		= (void *)consume_immed_init,
-	       .exit		= (void *)consume_immed_exit,
-	       .name		= "consume_immed")
diff --git a/tools/testing/selftests/sched_ext/consume_immed.c b/tools/testing/selftests/sched_ext/consume_immed.c
deleted file mode 100644
index 7f9594cfa9cb..000000000000
--- a/tools/testing/selftests/sched_ext/consume_immed.c
+++ /dev/null
@@ -1,115 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Userspace test for SCX_ENQ_IMMED via the consume path.
- *
- * Validates that scx_bpf_dsq_move_to_local(USER_DSQ, SCX_ENQ_IMMED) on
- * a busy CPU triggers the IMMED slow path, re-enqueuing tasks through
- * ops.enqueue() with SCX_TASK_REENQ_IMMED.
- *
- * Skipped on single-CPU systems where local DSQ contention cannot occur.
- */
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <bpf/bpf.h>
-#include <scx/common.h>
-#include "consume_immed.bpf.skel.h"
-#include "scx_test.h"
-
-#define NUM_WORKERS		4
-#define TEST_DURATION_SEC	3
-
-static volatile bool stop_workers;
-
-static void *worker_fn(void *arg)
-{
-	while (!stop_workers) {
-		volatile unsigned long i;
-
-		for (i = 0; i < 100000UL; i++)
-			;
-		usleep(100);
-	}
-	return NULL;
-}
-
-static enum scx_test_status setup(void **ctx)
-{
-	struct consume_immed *skel;
-
-	if (!__COMPAT_has_ksym("scx_bpf_dsq_move_to_local___v2")) {
-		fprintf(stderr,
-			"SKIP: scx_bpf_dsq_move_to_local v2 not available\n");
-		return SCX_TEST_SKIP;
-	}
-
-	skel = consume_immed__open();
-	SCX_FAIL_IF(!skel, "Failed to open");
-	SCX_ENUM_INIT(skel);
-
-	skel->rodata->test_tgid = (u32)getpid();
-
-	SCX_FAIL_IF(consume_immed__load(skel), "Failed to load skel");
-
-	*ctx = skel;
-	return SCX_TEST_PASS;
-}
-
-static enum scx_test_status run(void *ctx)
-{
-	struct consume_immed *skel = ctx;
-	struct bpf_link *link;
-	pthread_t workers[NUM_WORKERS];
-	long nproc;
-	int i;
-	u64 reenq;
-
-	nproc = sysconf(_SC_NPROCESSORS_ONLN);
-	if (nproc <= 1) {
-		fprintf(stderr,
-			"SKIP: single CPU, consume IMMED slow path may not trigger\n");
-		return SCX_TEST_SKIP;
-	}
-
-	link = bpf_map__attach_struct_ops(skel->maps.consume_immed_ops);
-	SCX_FAIL_IF(!link, "Failed to attach scheduler");
-
-	stop_workers = false;
-	for (i = 0; i < NUM_WORKERS; i++) {
-		SCX_FAIL_IF(pthread_create(&workers[i], NULL, worker_fn, NULL),
-			    "Failed to create worker %d", i);
-	}
-
-	sleep(TEST_DURATION_SEC);
-
-	reenq = skel->bss->nr_consume_immed_reenq;
-
-	stop_workers = true;
-	for (i = 0; i < NUM_WORKERS; i++)
-		pthread_join(workers[i], NULL);
-
-	bpf_link__destroy(link);
-
-	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
-	SCX_GT(reenq, 0);
-
-	return SCX_TEST_PASS;
-}
-
-static void cleanup(void *ctx)
-{
-	struct consume_immed *skel = ctx;
-
-	consume_immed__destroy(skel);
-}
-
-struct scx_test consume_immed = {
-	.name		= "consume_immed",
-	.description	= "Verify SCX_ENQ_IMMED slow path via "
-			  "scx_bpf_dsq_move_to_local() consume path",
-	.setup		= setup,
-	.run		= run,
-	.cleanup	= cleanup,
-};
-REGISTER_SCX_TEST(&consume_immed)
diff --git a/tools/testing/selftests/sched_ext/dsq_reenq.bpf.c b/tools/testing/selftests/sched_ext/dsq_reenq.bpf.c
deleted file mode 100644
index 750bb10508df..000000000000
--- a/tools/testing/selftests/sched_ext/dsq_reenq.bpf.c
+++ /dev/null
@@ -1,120 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Validate scx_bpf_dsq_reenq() semantics on user DSQs.
- *
- * A BPF timer periodically calls scx_bpf_dsq_reenq() on a user DSQ,
- * causing tasks to be re-enqueued through ops.enqueue() with SCX_ENQ_REENQ
- * set and SCX_TASK_REENQ_KFUNC recorded in p->scx.flags.
- *
- * The test verifies:
- *  - scx_bpf_dsq_reenq() triggers ops.enqueue() with SCX_ENQ_REENQ
- *  - The reenqueue reason is SCX_TASK_REENQ_KFUNC (bit 12 set)
- *  - Tasks are correctly re-dispatched after reenqueue
- */
-
-#include <scx/common.bpf.h>
-
-char _license[] SEC("license") = "GPL";
-
-UEI_DEFINE(uei);
-
-#define USER_DSQ	0
-
-/*
- * SCX_TASK_REENQ_REASON_MASK and SCX_TASK_REENQ_KFUNC are exported via
- * vmlinux BTF as part of enum scx_ent_flags.
- */
-
-/* 5ms timer interval */
-#define REENQ_TIMER_NS		(5 * 1000 * 1000ULL)
-
-/*
- * Number of times ops.enqueue() was called with SCX_ENQ_REENQ set and
- * SCX_TASK_REENQ_KFUNC recorded in p->scx.flags.
- */
-u64 nr_reenq_kfunc;
-
-struct reenq_timer_val {
-	struct bpf_timer timer;
-};
-
-struct {
-	__uint(type, BPF_MAP_TYPE_ARRAY);
-	__uint(max_entries, 1);
-	__type(key, u32);
-	__type(value, struct reenq_timer_val);
-} reenq_timer SEC(".maps");
-
-/*
- * Timer callback: reenqueue all tasks currently sitting on USER_DSQ back
- * through ops.enqueue() with SCX_ENQ_REENQ | SCX_TASK_REENQ_KFUNC.
- */
-static int reenq_timerfn(void *map, int *key, struct bpf_timer *timer)
-{
-	scx_bpf_dsq_reenq(USER_DSQ, 0);
-	bpf_timer_start(timer, REENQ_TIMER_NS, 0);
-	return 0;
-}
-
-void BPF_STRUCT_OPS(dsq_reenq_enqueue, struct task_struct *p, u64 enq_flags)
-{
-	/*
-	 * If this is a kfunc-triggered reenqueue, verify that
-	 * SCX_TASK_REENQ_KFUNC is recorded in p->scx.flags.
-	 */
-	if (enq_flags & SCX_ENQ_REENQ) {
-		u32 reason = p->scx.flags & SCX_TASK_REENQ_REASON_MASK;
-
-		if (reason == SCX_TASK_REENQ_KFUNC)
-			__sync_fetch_and_add(&nr_reenq_kfunc, 1);
-	}
-
-	/*
-	 * Always dispatch to USER_DSQ so the timer can reenqueue tasks again
-	 * on the next tick.
-	 */
-	scx_bpf_dsq_insert(p, USER_DSQ, SCX_SLICE_DFL, enq_flags);
-}
-
-void BPF_STRUCT_OPS(dsq_reenq_dispatch, s32 cpu, struct task_struct *prev)
-{
-	scx_bpf_dsq_move_to_local(USER_DSQ, 0);
-}
-
-s32 BPF_STRUCT_OPS_SLEEPABLE(dsq_reenq_init)
-{
-	struct reenq_timer_val *tval;
-	u32 key = 0;
-	s32 ret;
-
-	ret = scx_bpf_create_dsq(USER_DSQ, -1);
-	if (ret)
-		return ret;
-
-	if (!__COMPAT_has_generic_reenq()) {
-		scx_bpf_error("scx_bpf_dsq_reenq() not available");
-		return -EOPNOTSUPP;
-	}
-
-	tval = bpf_map_lookup_elem(&reenq_timer, &key);
-	if (!tval)
-		return -ESRCH;
-
-	bpf_timer_init(&tval->timer, &reenq_timer, CLOCK_MONOTONIC);
-	bpf_timer_set_callback(&tval->timer, reenq_timerfn);
-
-	return bpf_timer_start(&tval->timer, REENQ_TIMER_NS, 0);
-}
-
-void BPF_STRUCT_OPS(dsq_reenq_exit, struct scx_exit_info *ei)
-{
-	UEI_RECORD(uei, ei);
-}
-
-SCX_OPS_DEFINE(dsq_reenq_ops,
-	       .enqueue		= (void *)dsq_reenq_enqueue,
-	       .dispatch	= (void *)dsq_reenq_dispatch,
-	       .init		= (void *)dsq_reenq_init,
-	       .exit		= (void *)dsq_reenq_exit,
-	       .timeout_ms	= 10000,
-	       .name		= "dsq_reenq")
diff --git a/tools/testing/selftests/sched_ext/dsq_reenq.c b/tools/testing/selftests/sched_ext/dsq_reenq.c
deleted file mode 100644
index b0d99f9c9a9a..000000000000
--- a/tools/testing/selftests/sched_ext/dsq_reenq.c
+++ /dev/null
@@ -1,95 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Userspace test for scx_bpf_dsq_reenq() semantics.
- *
- * Attaches the dsq_reenq BPF scheduler, runs workload threads that
- * sleep and yield to keep tasks on USER_DSQ, waits for the BPF timer
- * to fire several times, then verifies that at least one kfunc-triggered
- * reenqueue was observed (ops.enqueue() called with SCX_ENQ_REENQ and
- * SCX_TASK_REENQ_KFUNC in p->scx.flags).
- */
-#include <bpf/bpf.h>
-#include <scx/common.h>
-#include <unistd.h>
-#include <pthread.h>
-#include "dsq_reenq.bpf.skel.h"
-#include "scx_test.h"
-
-#define NUM_WORKERS	4
-#define TEST_DURATION_SEC 3
-
-static volatile bool stop_workers;
-static pthread_t workers[NUM_WORKERS];
-
-static void *worker_fn(void *arg)
-{
-	while (!stop_workers) {
-		usleep(500);
-		sched_yield();
-	}
-	return NULL;
-}
-
-static enum scx_test_status setup(void **ctx)
-{
-	struct dsq_reenq *skel;
-
-	if (!__COMPAT_has_ksym("scx_bpf_dsq_reenq")) {
-		fprintf(stderr, "SKIP: scx_bpf_dsq_reenq() not available\n");
-		return SCX_TEST_SKIP;
-	}
-
-	skel = dsq_reenq__open();
-	SCX_FAIL_IF(!skel, "Failed to open");
-	SCX_ENUM_INIT(skel);
-	SCX_FAIL_IF(dsq_reenq__load(skel), "Failed to load skel");
-
-	*ctx = skel;
-	return SCX_TEST_PASS;
-}
-
-static enum scx_test_status run(void *ctx)
-{
-	struct dsq_reenq *skel = ctx;
-	struct bpf_link *link;
-	int i;
-
-	link = bpf_map__attach_struct_ops(skel->maps.dsq_reenq_ops);
-	SCX_FAIL_IF(!link, "Failed to attach scheduler");
-
-	stop_workers = false;
-	for (i = 0; i < NUM_WORKERS; i++) {
-		SCX_FAIL_IF(pthread_create(&workers[i], NULL, worker_fn, NULL),
-			    "Failed to create worker %d", i);
-	}
-
-	sleep(TEST_DURATION_SEC);
-
-	stop_workers = true;
-	for (i = 0; i < NUM_WORKERS; i++)
-		pthread_join(workers[i], NULL);
-
-	bpf_link__destroy(link);
-
-	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
-	SCX_GT(skel->bss->nr_reenq_kfunc, 0);
-
-	return SCX_TEST_PASS;
-}
-
-static void cleanup(void *ctx)
-{
-	struct dsq_reenq *skel = ctx;
-
-	dsq_reenq__destroy(skel);
-}
-
-struct scx_test dsq_reenq = {
-	.name		= "dsq_reenq",
-	.description	= "Verify scx_bpf_dsq_reenq() triggers enqueue with "
-			  "SCX_ENQ_REENQ and SCX_TASK_REENQ_KFUNC reason",
-	.setup		= setup,
-	.run		= run,
-	.cleanup	= cleanup,
-};
-REGISTER_SCX_TEST(&dsq_reenq)
diff --git a/tools/testing/selftests/sched_ext/enq_immed.bpf.c b/tools/testing/selftests/sched_ext/enq_immed.bpf.c
deleted file mode 100644
index 805dd0256218..000000000000
--- a/tools/testing/selftests/sched_ext/enq_immed.bpf.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Validate SCX_ENQ_IMMED fast/slow path semantics via the direct insert path.
- *
- * With SCX_OPS_ALWAYS_ENQ_IMMED set, the kernel automatically adds
- * SCX_ENQ_IMMED to every local DSQ dispatch.  When the target CPU's local
- * DSQ already has tasks queued (dsq->nr > 1), the kernel re-enqueues the
- * task through ops.enqueue() with SCX_ENQ_REENQ and SCX_TASK_REENQ_IMMED
- * recorded in p->scx.flags (the "slow path").
- *
- * Worker threads are pinned to CPU 0 via SCX_DSQ_LOCAL_ON to guarantee
- * local DSQ contention.
- */
-
-#include <scx/common.bpf.h>
-
-char _license[] SEC("license") = "GPL";
-
-UEI_DEFINE(uei);
-
-/* Set by userspace to identify the test process group. */
-const volatile u32 test_tgid;
-
-/*
- * SCX_TASK_REENQ_REASON_MASK and SCX_TASK_REENQ_IMMED are exported via
- * vmlinux BTF as part of enum scx_ent_flags.
- */
-
-u64 nr_immed_reenq;
-
-void BPF_STRUCT_OPS(enq_immed_enqueue, struct task_struct *p, u64 enq_flags)
-{
-	if (enq_flags & SCX_ENQ_REENQ) {
-		u32 reason = p->scx.flags & SCX_TASK_REENQ_REASON_MASK;
-
-		if (reason == SCX_TASK_REENQ_IMMED)
-			__sync_fetch_and_add(&nr_immed_reenq, 1);
-	}
-
-	if (p->tgid == (pid_t)test_tgid)
-		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | 0, SCX_SLICE_DFL,
-				   enq_flags);
-	else
-		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL,
-				   enq_flags);
-}
-
-void BPF_STRUCT_OPS(enq_immed_dispatch, s32 cpu, struct task_struct *prev)
-{
-	scx_bpf_dsq_move_to_local(SCX_DSQ_GLOBAL, 0);
-}
-
-void BPF_STRUCT_OPS(enq_immed_exit, struct scx_exit_info *ei)
-{
-	UEI_RECORD(uei, ei);
-}
-
-SCX_OPS_DEFINE(enq_immed_ops,
-	       .enqueue		= (void *)enq_immed_enqueue,
-	       .dispatch	= (void *)enq_immed_dispatch,
-	       .exit		= (void *)enq_immed_exit,
-	       .flags		= SCX_OPS_ALWAYS_ENQ_IMMED,
-	       .name		= "enq_immed")
diff --git a/tools/testing/selftests/sched_ext/enq_immed.c b/tools/testing/selftests/sched_ext/enq_immed.c
deleted file mode 100644
index 44681e41975d..000000000000
--- a/tools/testing/selftests/sched_ext/enq_immed.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Userspace test for SCX_ENQ_IMMED via the direct insert path.
- *
- * Validates that dispatching tasks to a busy CPU's local DSQ with
- * SCX_OPS_ALWAYS_ENQ_IMMED triggers the IMMED slow path: the kernel
- * re-enqueues the task through ops.enqueue() with SCX_TASK_REENQ_IMMED.
- *
- * Skipped on single-CPU systems where local DSQ contention cannot occur.
- */
-#define _GNU_SOURCE
-#include <stdio.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <bpf/bpf.h>
-#include <scx/common.h>
-#include "enq_immed.bpf.skel.h"
-#include "scx_test.h"
-
-#define NUM_WORKERS		4
-#define TEST_DURATION_SEC	3
-
-static volatile bool stop_workers;
-
-static void *worker_fn(void *arg)
-{
-	while (!stop_workers) {
-		volatile unsigned long i;
-
-		for (i = 0; i < 100000UL; i++)
-			;
-		usleep(100);
-	}
-	return NULL;
-}
-
-static enum scx_test_status setup(void **ctx)
-{
-	struct enq_immed *skel;
-	u64 v;
-
-	if (!__COMPAT_read_enum("scx_ops_flags",
-				"SCX_OPS_ALWAYS_ENQ_IMMED", &v)) {
-		fprintf(stderr,
-			"SKIP: SCX_OPS_ALWAYS_ENQ_IMMED not available\n");
-		return SCX_TEST_SKIP;
-	}
-
-	skel = enq_immed__open();
-	SCX_FAIL_IF(!skel, "Failed to open");
-	SCX_ENUM_INIT(skel);
-
-	skel->rodata->test_tgid = (u32)getpid();
-
-	SCX_FAIL_IF(enq_immed__load(skel), "Failed to load skel");
-
-	*ctx = skel;
-	return SCX_TEST_PASS;
-}
-
-static enum scx_test_status run(void *ctx)
-{
-	struct enq_immed *skel = ctx;
-	struct bpf_link *link;
-	pthread_t workers[NUM_WORKERS];
-	long nproc;
-	int i;
-	u64 reenq;
-
-	nproc = sysconf(_SC_NPROCESSORS_ONLN);
-	if (nproc <= 1) {
-		fprintf(stderr,
-			"SKIP: single CPU, IMMED slow path may not trigger\n");
-		return SCX_TEST_SKIP;
-	}
-
-	link = bpf_map__attach_struct_ops(skel->maps.enq_immed_ops);
-	SCX_FAIL_IF(!link, "Failed to attach scheduler");
-
-	stop_workers = false;
-	for (i = 0; i < NUM_WORKERS; i++) {
-		SCX_FAIL_IF(pthread_create(&workers[i], NULL, worker_fn, NULL),
-			    "Failed to create worker %d", i);
-	}
-
-	sleep(TEST_DURATION_SEC);
-
-	reenq = skel->bss->nr_immed_reenq;
-
-	stop_workers = true;
-	for (i = 0; i < NUM_WORKERS; i++)
-		pthread_join(workers[i], NULL);
-
-	bpf_link__destroy(link);
-
-	SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
-	SCX_GT(reenq, 0);
-
-	return SCX_TEST_PASS;
-}
-
-static void cleanup(void *ctx)
-{
-	struct enq_immed *skel = ctx;
-
-	enq_immed__destroy(skel);
-}
-
-struct scx_test enq_immed = {
-	.name		= "enq_immed",
-	.description	= "Verify SCX_ENQ_IMMED slow path via direct insert "
-			  "with SCX_OPS_ALWAYS_ENQ_IMMED",
-	.setup		= setup,
-	.run		= run,
-	.cleanup	= cleanup,
-};
-REGISTER_SCX_TEST(&enq_immed)

From d6edb15ad92cb61386c46662a5ae245c7feac5f0 Mon Sep 17 00:00:00 2001
From: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
Date: Fri, 27 Mar 2026 14:17:57 +0800
Subject: [PATCH 112/134] scx_central: Defer timer start to central dispatch to
 fix init error

scx_central currently assumes that ops.init() runs on the selected
central CPU and aborts otherwise. This is no longer true, as ops.init()
is invoked from the scx_enable_helper thread, which can run on any
CPU.

As a result, sched_setaffinity() from userspace doesn't work, causing
scx_central to fail when loading with:

[ 1985.319942] sched_ext: central: scx_central.bpf.c:314: init from non-central CPU
[ 1985.320317]    scx_exit+0xa3/0xd0
[ 1985.320535]    scx_bpf_error_bstr+0xbd/0x220
[ 1985.320840]    bpf_prog_3a445a8163fa8149_central_init+0x103/0x1ba
[ 1985.321073]    bpf__sched_ext_ops_init+0x40/0xa8
[ 1985.321286]    scx_root_enable_workfn+0x507/0x1650
[ 1985.321461]    kthread_worker_fn+0x260/0x940
[ 1985.321745]    kthread+0x303/0x3e0
[ 1985.321901]    ret_from_fork+0x589/0x7d0
[ 1985.322065]    ret_from_fork_asm+0x1a/0x30

DEBUG DUMP
===================================================================

central: root
scx_enable_help[134] triggered exit kind 1025:
  scx_bpf_error (scx_central.bpf.c:314: init from non-central CPU)

Fix this by:
- Defer bpf_timer_start() to the first dispatch on the central CPU.
- Initialize the BPF timer in central_init() and kick the central CPU
to guarantee entering the dispatch path on the central CPU immediately.
- Remove the unnecessary sched_setaffinity() call in userspace.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_central.bpf.c | 62 +++++++++++++++++++++----------
 tools/sched_ext/scx_central.c     | 24 ------------
 2 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
index 399e8d3f8bec..4efcce099bd5 100644
--- a/tools/sched_ext/scx_central.bpf.c
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -60,6 +60,7 @@ const volatile u32 nr_cpu_ids = 1;	/* !0 for veristat, set during init */
 const volatile u64 slice_ns;
 
 bool timer_pinned = true;
+bool timer_started;
 u64 nr_total, nr_locals, nr_queued, nr_lost_pids;
 u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries;
 u64 nr_overflows;
@@ -179,9 +180,47 @@ static bool dispatch_to_cpu(s32 cpu)
 	return false;
 }
 
+static void start_central_timer(void)
+{
+	struct bpf_timer *timer;
+	u32 key = 0;
+	int ret;
+
+	if (likely(timer_started))
+		return;
+
+	timer = bpf_map_lookup_elem(&central_timer, &key);
+	if (!timer) {
+		scx_bpf_error("failed to lookup central timer");
+		return;
+	}
+
+	ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN);
+	/*
+	 * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a
+	 * kernel which doesn't have it, bpf_timer_start() will return -EINVAL.
+	 * Retry without the PIN. This would be the perfect use case for
+	 * bpf_core_enum_value_exists() but the enum type doesn't have a name
+	 * and can't be used with bpf_core_enum_value_exists(). Oh well...
+	 */
+	if (ret == -EINVAL) {
+		timer_pinned = false;
+		ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0);
+	}
+
+	if (ret) {
+		scx_bpf_error("bpf_timer_start failed (%d)", ret);
+		return;
+	}
+
+	timer_started = true;
+}
+
 void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev)
 {
 	if (cpu == central_cpu) {
+		start_central_timer();
+
 		/* dispatch for all other CPUs first */
 		__sync_fetch_and_add(&nr_dispatches, 1);
 
@@ -310,29 +349,12 @@ int BPF_STRUCT_OPS_SLEEPABLE(central_init)
 	if (!timer)
 		return -ESRCH;
 
-	if (bpf_get_smp_processor_id() != central_cpu) {
-		scx_bpf_error("init from non-central CPU");
-		return -EINVAL;
-	}
-
 	bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC);
 	bpf_timer_set_callback(timer, central_timerfn);
 
-	ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN);
-	/*
-	 * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a
-	 * kernel which doesn't have it, bpf_timer_start() will return -EINVAL.
-	 * Retry without the PIN. This would be the perfect use case for
-	 * bpf_core_enum_value_exists() but the enum type doesn't have a name
-	 * and can't be used with bpf_core_enum_value_exists(). Oh well...
-	 */
-	if (ret == -EINVAL) {
-		timer_pinned = false;
-		ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0);
-	}
-	if (ret)
-		scx_bpf_error("bpf_timer_start failed (%d)", ret);
-	return ret;
+	scx_bpf_kick_cpu(central_cpu, 0);
+
+	return 0;
 }
 
 void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei)
diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
index fd4c0eaa4326..4a72df39500d 100644
--- a/tools/sched_ext/scx_central.c
+++ b/tools/sched_ext/scx_central.c
@@ -5,7 +5,6 @@
  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
  */
 #define _GNU_SOURCE
-#include <sched.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <inttypes.h>
@@ -49,8 +48,6 @@ int main(int argc, char **argv)
 	struct bpf_link *link;
 	__u64 seq = 0, ecode;
 	__s32 opt;
-	cpu_set_t *cpuset;
-	size_t cpuset_size;
 
 	libbpf_set_print(libbpf_print_fn);
 	signal(SIGINT, sigint_handler);
@@ -96,27 +93,6 @@ restart:
 
 	SCX_OPS_LOAD(skel, central_ops, scx_central, uei);
 
-	/*
-	 * Affinitize the loading thread to the central CPU, as:
-	 * - That's where the BPF timer is first invoked in the BPF program.
-	 * - We probably don't want this user space component to take up a core
-	 *   from a task that would benefit from avoiding preemption on one of
-	 *   the tickless cores.
-	 *
-	 * Until BPF supports pinning the timer, it's not guaranteed that it
-	 * will always be invoked on the central CPU. In practice, this
-	 * suffices the majority of the time.
-	 */
-	cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids);
-	SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
-	cpuset_size = CPU_ALLOC_SIZE(skel->rodata->nr_cpu_ids);
-	CPU_ZERO_S(cpuset_size, cpuset);
-	CPU_SET_S(skel->rodata->central_cpu, cpuset_size, cpuset);
-	SCX_BUG_ON(sched_setaffinity(0, cpuset_size, cpuset),
-		   "Failed to affinitize to central CPU %d (max %d)",
-		   skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1);
-	CPU_FREE(cpuset);
-
 	link = SCX_OPS_ATTACH(skel, central_ops, scx_central);
 
 	if (!skel->data->timer_pinned)

From 238aa43f0b77ea7db7f306e010f688c77439ee62 Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Fri, 27 Mar 2026 17:50:39 +0800
Subject: [PATCH 113/134] sched_ext: Document why built-in DSQs are unsupported
 sources in scx_bpf_dsq_move_to_local()

Add a comment explaining the design intent behind rejecting built-in DSQs
(%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) as sources. Local DSQs support
reenqueueing but the BPF scheduler cannot directly iterate or move tasks
from them. %SCX_DSQ_GLOBAL is similar but also doesn't support
reenqueueing because it maps to multiple per-node DSQs, making the scope
difficult to define.

Also annotate @dsq_id to make clear it must be a user-created DSQ.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7043fb941130..bfe923b7ffe0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8212,7 +8212,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
 
 /**
  * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ
- * @dsq_id: DSQ to move task from
+ * @dsq_id: DSQ to move task from. Must be a user-created DSQ
  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
  * @enq_flags: %SCX_ENQ_*
  *
@@ -8220,6 +8220,14 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
  * local DSQ for execution with @enq_flags applied. Can only be called from
  * ops.dispatch().
  *
+ * Built-in DSQs (%SCX_DSQ_GLOBAL and %SCX_DSQ_LOCAL*) are not supported as
+ * sources. Local DSQs support reenqueueing (a task can be picked up for
+ * execution, dequeued for property changes, or reenqueued), but the BPF
+ * scheduler cannot directly iterate or move tasks from them. %SCX_DSQ_GLOBAL
+ * is similar but also doesn't support reenqueueing, as it maps to multiple
+ * per-node DSQs making the scope difficult to define; this may change in the
+ * future.
+ *
  * This function flushes the in-flight dispatches from scx_bpf_dsq_insert()
  * before trying to move from the specified DSQ. It may also grab rq locks and
  * thus can't be called under any BPF locks.

From b905ee77d5f557a83a485b4146210f54f13365fc Mon Sep 17 00:00:00 2001
From: Samuele Mariotti <smariotti@disroot.org>
Date: Thu, 2 Apr 2026 19:00:25 +0200
Subject: [PATCH 114/134] sched_ext: Fix missing warning in
 scx_set_task_state() default case

In scx_set_task_state(), the default case was setting the
warn flag, but then returning immediately. This is problematic
because the only purpose of the warn flag is to trigger
WARN_ONCE, but the early return prevented it from ever firing,
leaving invalid task states undetected and untraced.

To fix this, a WARN_ONCE call is now added directly in the
default case.

The fix addresses two aspects:

 - Guarantees the invalid task states are properly logged
   and traced.

 - Provides a distinct warning message
   ("sched_ext: Invalid task state") specifically for
   states outside the defined scx_task_state enum values,
   making it easier to distinguish from other transition
   warnings.

This ensures proper detection and reporting of invalid states.

Signed-off-by: Samuele Mariotti <smariotti@disroot.org>
Signed-off-by: Paolo Valente <paolo.valente@unimore.it>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9628c64e5592..0253887e63c0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3538,7 +3538,8 @@ static void scx_set_task_state(struct task_struct *p, u32 state)
 		warn = prev_state != SCX_TASK_READY;
 		break;
 	default:
-		warn = true;
+		WARN_ONCE(1, "sched_ext: Invalid task state %d -> %d for %s[%d]",
+			  prev_state, state, p->comm, p->pid);
 		return;
 	}
 

From a3c3fb2f86f8a1f266747622037f90eab58186ad Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Tue, 31 Mar 2026 17:18:33 +0800
Subject: [PATCH 115/134] tools/sched_ext: Fix off-by-one in scx_sdt payload
 zeroing

scx_alloc_free_idx() zeroes the payload of a freed arena allocation
one word at a time. The loop bound was alloc->pool.elem_size / 8, but
elem_size includes sizeof(struct sdt_data) (the 8-byte union sdt_id
header). This caused the loop to write one extra u64 past the
allocation, corrupting the tid field of the adjacent pool element.

Fix the loop bound to (elem_size - sizeof(struct sdt_data)) / 8 so
only the payload portion is zeroed.

Test plan:
- Add a temporary sanity check in scx_task_free() before the free call:

  if (mval->data->tid.idx != mval->tid.idx)
      scx_bpf_error("tid corruption: arena=%d storage=%d",
                    mval->data->tid.idx, (int)mval->tid.idx);

- stress-ng --fork 100 -t 10 & sudo ./build/bin/scx_sdt

Without this fix, running scx_sdt under fork-heavy load triggers the
corruption error. With the fix applied, the same workload completes
without error.

Fixes: 36929ebd17ae ("tools/sched_ext: add arena based scheduler")
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_sdt.bpf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c
index 10248b71ef02..a1e33e6c412b 100644
--- a/tools/sched_ext/scx_sdt.bpf.c
+++ b/tools/sched_ext/scx_sdt.bpf.c
@@ -317,7 +317,8 @@ int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
 		};
 
 		/* Zero out one word at a time. */
-		for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) {
+		for (i = zero; i < (alloc->pool.elem_size - sizeof(struct sdt_data)) / 8
+		     && can_loop; i++) {
 			data->payload[i] = 0;
 		}
 	}

From a4f61f0a1afdb3c07025b91379f5c46dd89eb817 Mon Sep 17 00:00:00 2001
From: Andrea Righi <arighi@nvidia.com>
Date: Mon, 6 Apr 2026 13:47:55 +0200
Subject: [PATCH 116/134] sched_ext: Documentation: Add ops.dequeue() to task
 lifecycle

Document ops.dequeue() in the sched_ext task lifecycle now that its
semantics are well-defined.

Also update the pseudo-code to use task_is_runnable() consistently and
clarify the case where ops.dispatch() does not refill the time slice.

Signed-off-by: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/scheduler/sched-ext.rst | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 404b4e4c33f7..9f03650abfeb 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -422,23 +422,29 @@ by a sched_ext scheduler:
 
         ops.runnable();         /* Task becomes ready to run */
 
-        while (task is runnable) {
+        while (task_is_runnable(task)) {
             if (task is not in a DSQ && task->scx.slice == 0) {
                 ops.enqueue();  /* Task can be added to a DSQ */
 
-                /* Any usable CPU becomes available */
-
-                ops.dispatch(); /* Task is moved to a local DSQ */
-
-                ops.dequeue(); /* Exiting BPF scheduler */
+                /* Task property change (i.e., affinity, nice, etc.)? */
+                if (sched_change(task)) {
+                    ops.dequeue(); /* Exiting BPF scheduler custody */
+                    continue;
+                }
             }
+
+            /* Any usable CPU becomes available */
+
+            ops.dispatch();     /* Task is moved to a local DSQ */
+            ops.dequeue();      /* Exiting BPF scheduler custody */
+
             ops.running();      /* Task starts running on its assigned CPU */
 
-            while task_is_runnable(p) {
-                while (task->scx.slice > 0 && task_is_runnable(p))
-                    ops.tick();     /* Called every 1/HZ seconds */
+            while (task_is_runnable(task) && task->scx.slice > 0) {
+                ops.tick();     /* Called every 1/HZ seconds */
 
-                ops.dispatch();     /* task->scx.slice can be refilled */
+                if (task->scx.slice == 0)
+                    ops.dispatch(); /* task->scx.slice can be refilled */
             }
 
             ops.stopping();     /* Task stops running (time slice expires or wait) */

From 393754191b85b3f76d9cc44dda5209ef23337e8a Mon Sep 17 00:00:00 2001
From: fangqiurong <fangqiurong@kylinos.cn>
Date: Tue, 7 Apr 2026 17:34:05 +0800
Subject: [PATCH 117/134] sched_ext: Documentation: Fix scx_bpf_move_to_local
 kfunc name

The correct kfunc name is scx_bpf_dsq_move_to_local(), not
scx_bpf_move_to_local(). Fix the two references in the
Scheduling Cycle section.

Signed-off-by: fangqiurong <fangqiurong@kylinos.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/scheduler/sched-ext.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index 9f03650abfeb..ec594ae8086d 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -375,9 +375,9 @@ The following briefly shows how a waking task is scheduled and executed.
      rather than performing them immediately. There can be up to
      ``ops.dispatch_max_batch`` pending tasks.
 
-   * ``scx_bpf_move_to_local()`` moves a task from the specified non-local
+   * ``scx_bpf_dsq_move_to_local()`` moves a task from the specified non-local
      DSQ to the dispatching DSQ. This function cannot be called with any BPF
-     locks held. ``scx_bpf_move_to_local()`` flushes the pending insertions
+     locks held. ``scx_bpf_dsq_move_to_local()`` flushes the pending insertions
      tasks before trying to move from the specified DSQ.
 
 4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ,

From ff1befcb168395481fd6a28d8036b707cb7e7a13 Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Wed, 8 Apr 2026 07:57:15 +0800
Subject: [PATCH 118/134] selftests/sched_ext: Improve runner error reporting
 for invalid arguments

Report an error for './runner foo' (positional arg instead of -t) and
for './runner -t foo' when the filter matches no tests. Previously both
cases produced no error output.

Pre-scan the test list before the main loop so the error is reported
immediately, avoiding spurious SKIP output from '-s' when no tests
match.

Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/runner.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c
index d84f71eee049..c264807caa91 100644
--- a/tools/testing/selftests/sched_ext/runner.c
+++ b/tools/testing/selftests/sched_ext/runner.c
@@ -164,6 +164,26 @@ int main(int argc, char **argv)
 		}
 	}
 
+	if (optind < argc) {
+		fprintf(stderr, "Unexpected argument '%s'. Use -t to filter tests.\n",
+			argv[optind]);
+		return 1;
+	}
+
+	if (filter) {
+		for (i = 0; i < __scx_num_tests; i++) {
+			if (!should_skip_test(&__scx_tests[i], filter))
+				break;
+		}
+		if (i == __scx_num_tests) {
+			fprintf(stderr, "No tests matched filter '%s'\n", filter);
+			fprintf(stderr, "Available tests (use -l to list):\n");
+			for (i = 0; i < __scx_num_tests; i++)
+				fprintf(stderr, "  %s\n", __scx_tests[i].name);
+			return 1;
+		}
+	}
+
 	for (i = 0; i < __scx_num_tests; i++) {
 		enum scx_test_status status;
 		struct scx_test *test = &__scx_tests[i];

From 71ba9a5cb125998a875e3f008cbb28b028b609aa Mon Sep 17 00:00:00 2001
From: Kuba Piecuch <jpiecuch@google.com>
Date: Thu, 9 Apr 2026 16:57:44 +0000
Subject: [PATCH 119/134] sched_ext: Documentation: improve accuracy of task
 lifecycle pseudo-code

* Add ops.quiescent() and ops.runnable() to the sched_change path.
  When a queued task has one of its scheduling properties changed
  (e.g. nice, affinity), it goes through dequeue() -> quiescent() ->
  (property change callback, e.g. ops.set_weight()) -> runnable() ->
  enqueue().

* Change && to || in ops.enqueue() condition. We want to enqueue tasks
  that have a non-zero slice and are not in any DSQ.

* Call ops.dispatch() and ops.dequeue() only for tasks that have had
  ops.enqueue() called. This is to account for tasks direct-dispatched
  from ops.select_cpu().

* Add a note explaining that the pseudo-code provides a simplified view
  of the task lifecycle and list some examples of cases that the
  pseudo-code does not account for.

Fixes: a4f61f0a1afd ("sched_ext: Documentation: Add ops.dequeue() to task lifecycle")
Signed-off-by: Kuba Piecuch <jpiecuch@google.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/scheduler/sched-ext.rst | 45 ++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst
index ec594ae8086d..03d595d178ea 100644
--- a/Documentation/scheduler/sched-ext.rst
+++ b/Documentation/scheduler/sched-ext.rst
@@ -408,8 +408,8 @@ for more information.
 Task Lifecycle
 --------------
 
-The following pseudo-code summarizes the entire lifecycle of a task managed
-by a sched_ext scheduler:
+The following pseudo-code presents a rough overview of the entire lifecycle
+of a task managed by a sched_ext scheduler:
 
 .. code-block:: c
 
@@ -423,21 +423,26 @@ by a sched_ext scheduler:
         ops.runnable();         /* Task becomes ready to run */
 
         while (task_is_runnable(task)) {
-            if (task is not in a DSQ && task->scx.slice == 0) {
+            if (task is not in a DSQ || task->scx.slice == 0) {
                 ops.enqueue();  /* Task can be added to a DSQ */
 
                 /* Task property change (i.e., affinity, nice, etc.)? */
                 if (sched_change(task)) {
                     ops.dequeue(); /* Exiting BPF scheduler custody */
+                    ops.quiescent();
+
+                    /* Property change callback, e.g. ops.set_weight() */
+
+                    ops.runnable();
                     continue;
                 }
+
+                /* Any usable CPU becomes available */
+
+                ops.dispatch();     /* Task is moved to a local DSQ */
+                ops.dequeue();      /* Exiting BPF scheduler custody */
             }
 
-            /* Any usable CPU becomes available */
-
-            ops.dispatch();     /* Task is moved to a local DSQ */
-            ops.dequeue();      /* Exiting BPF scheduler custody */
-
             ops.running();      /* Task starts running on its assigned CPU */
 
             while (task_is_runnable(task) && task->scx.slice > 0) {
@@ -456,6 +461,30 @@ by a sched_ext scheduler:
     ops.disable();              /* Disable BPF scheduling for the task */
     ops.exit_task();            /* Task is destroyed */
 
+Note that the above pseudo-code does not cover all possible state transitions
+and edge cases, to name a few examples:
+
+* ``ops.dispatch()`` may fail to move the task to a local DSQ due to a racing
+  property change on that task, in which case ``ops.dispatch()`` will be
+  retried.
+
+* The task may be direct-dispatched to a local DSQ from ``ops.enqueue()``,
+  in which case ``ops.dispatch()`` and ``ops.dequeue()`` are skipped and we go
+  straight to ``ops.running()``.
+
+* Property changes may occur at virtually any point during the task's lifecycle,
+  not just when the task is queued and waiting to be dispatched. For example,
+  changing a property of a running task will lead to the callback sequence
+  ``ops.stopping()`` -> ``ops.quiescent()`` -> (property change callback) ->
+  ``ops.runnable()`` -> ``ops.running()``.
+
+* A sched_ext task can be preempted by a task from a higher-priority scheduling
+  class, in which case it will exit the tick-dispatch loop even though it is runnable
+  and has a non-zero slice.
+
+See the "Scheduling Cycle" section for a more detailed description of how
+a freshly woken up task gets on a CPU.
+
 Where to Look
 =============
 

From dcd47f27c01e795395379025525bfd47a99a91e1 Mon Sep 17 00:00:00 2001
From: fangqiurong <fangqiurong@kylinos.cn>
Date: Fri, 10 Apr 2026 14:47:53 +0800
Subject: [PATCH 120/134] selftests/sched_ext: Fix wrong DSQ ID in peek_dsq
 error message

The error path after scx_bpf_create_dsq(real_dsq_id, ...) was reporting
test_dsq_id instead of real_dsq_id in the error message, which would
mislead debugging.

Signed-off-by: fangqiurong <fangqiurong@kylinos.cn>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/testing/selftests/sched_ext/peek_dsq.bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
index 96e3a336a8a6..7f23fb17b1e0 100644
--- a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
+++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
@@ -197,7 +197,7 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init)
 	}
 	err = scx_bpf_create_dsq(real_dsq_id, -1);
 	if (err) {
-		scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err);
+		scx_bpf_error("Failed to create DSQ %d: %d", real_dsq_id, err);
 		return err;
 	}
 

From 9b5501d3c95924198914f3df920faae1594e68d5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 121/134] sched_ext: Drop TRACING access to select_cpu kfuncs

The select_cpu kfuncs - scx_bpf_select_cpu_dfl(), scx_bpf_select_cpu_and()
and __scx_bpf_select_cpu_and() - take task_rq_lock() internally. Exposing
them via scx_kfunc_set_idle to BPF_PROG_TYPE_TRACING is unsafe: arbitrary
tracing contexts (kprobes, tracepoints, fentry, LSM) may run with @p's
pi_lock state unknown.

Move them out of scx_kfunc_ids_idle into a new scx_kfunc_ids_select_cpu
set registered only for STRUCT_OPS and SYSCALL.

Extracted from a larger verifier-time kfunc context filter patch
originally written by Juntong Deng.

Original-patch-by: Juntong Deng <juntong.deng@outlook.com>
Cc: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext_idle.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index ecf7e09b54ae..cd88aee47bd8 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -1469,9 +1469,6 @@ BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU)
-BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_idle)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
@@ -1479,13 +1476,33 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
 	.set			= &scx_kfunc_ids_idle,
 };
 
+/*
+ * The select_cpu kfuncs internally call task_rq_lock() when invoked from an
+ * rq-unlocked context, and thus cannot be safely called from arbitrary tracing
+ * contexts where @p's pi_lock state is unknown. Keep them out of
+ * BPF_PROG_TYPE_TRACING by registering them in their own set which is exposed
+ * only to STRUCT_OPS and SYSCALL programs.
+ */
+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
+	.owner			= THIS_MODULE,
+	.set			= &scx_kfunc_ids_select_cpu,
+};
+
 int scx_idle_init(void)
 {
 	int ret;
 
 	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
 	      register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
-	      register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
+	      register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) ||
+	      register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
+	      register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu);
 
 	return ret;
 }

From a37e134317c68941fb3e0a4890d95de41eac63f5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 122/134] sched_ext: Add select_cpu kfuncs to
 scx_kfunc_ids_unlocked

select_cpu_from_kfunc() has an extra scx_kf_allowed_if_unlocked() branch
that accepts calls from unlocked contexts and takes task_rq_lock() itself
- a "callable from unlocked" property encoded in the kfunc body rather
than in set membership. That's fine while the runtime check is the
authoritative gate, but the upcoming verifier-time filter uses set
membership as the source of truth and needs it to reflect every context
the kfunc may be called from.

Add the three select_cpu kfuncs to scx_kfunc_ids_unlocked so their full
set of callable contexts is captured by set membership. This follows the
existing dual-set convention used by scx_bpf_dsq_move{,_vtime} and
scx_bpf_dsq_move_set_{slice,vtime}, which are members of both
scx_kfunc_ids_dispatch and scx_kfunc_ids_unlocked.

While at it, add brief comments on each duplicate BTF_ID_FLAGS block
(including the pre-existing dsq_move ones) explaining the dual
membership.

No runtime behavior change: the runtime check in select_cpu_from_kfunc()
remains the authoritative gate until it is removed along with the rest
of the scx_kf_mask enforcement in a follow-up.

v2: Clarify dispatch-set comment to name scx_bpf_dsq_move*() explicitly so it
    doesn't appear to cover scx_bpf_sub_dispatch() (Andrea Righi).

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c      | 6 ++++++
 kernel/sched/ext_idle.c | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b757b853b42b..11a8f936ecc1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8497,6 +8497,7 @@ BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local, KF_IMPLICIT_ARGS)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local___v2, KF_IMPLICIT_ARGS)
+/* scx_bpf_dsq_move*() also in scx_kfunc_ids_unlocked: callable from unlocked contexts */
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
@@ -8612,10 +8613,15 @@ __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(scx_kfunc_ids_unlocked)
 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_IMPLICIT_ARGS | KF_SLEEPABLE)
+/* also in scx_kfunc_ids_dispatch: also callable from ops.dispatch() */
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU)
 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU)
+/* also in scx_kfunc_ids_select_cpu: also callable from ops.select_cpu()/ops.enqueue() */
+BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
 BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
 
 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index cd88aee47bd8..8c31fb65477c 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -1482,6 +1482,10 @@ static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
  * contexts where @p's pi_lock state is unknown. Keep them out of
  * BPF_PROG_TYPE_TRACING by registering them in their own set which is exposed
  * only to STRUCT_OPS and SYSCALL programs.
+ *
+ * These kfuncs are also members of scx_kfunc_ids_unlocked (see ext.c) because
+ * they're callable from unlocked contexts in addition to ops.select_cpu() and
+ * ops.enqueue().
  */
 BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
 BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)

From 9fb457074f6d118b30458624223abef985725a88 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 123/134] sched_ext: Track @p's rq lock across
 set_cpus_allowed_scx -> ops.set_cpumask

The SCX_CALL_OP_TASK call site passes rq=NULL incorrectly, leaving
scx_locked_rq() unset. Pass task_rq(p) instead so update_locked_rq()
reflects reality.

v2: Add Fixes: tag (Andrea Righi).

Fixes: 18853ba782be ("sched_ext: Track currently locked rq")
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 11a8f936ecc1..aee48b34aefa 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3360,7 +3360,7 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 	 * designation pointless. Cast it away when calling the operation.
 	 */
 	if (SCX_HAS_OP(sch, set_cpumask))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, NULL,
+		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, task_rq(p),
 				 p, (struct cpumask *)p->cpus_ptr);
 }
 

From b470e37c1fad72731be6f437e233cb6b16618f41 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 124/134] sched_ext: Fix ops.cgroup_move() invocation kf_mask
 and rq tracking

sched_move_task() invokes ops.cgroup_move() inside task_rq_lock(tsk), so
@p's rq lock is held. The SCX_CALL_OP_TASK invocation mislabels this:

  - kf_mask = SCX_KF_UNLOCKED (== 0), claiming no lock is held.
  - rq = NULL, so update_locked_rq() doesn't run and scx_locked_rq()
    returns NULL.

Switch to SCX_KF_REST and pass task_rq(p), matching ops.set_cpumask()
from set_cpus_allowed_scx().

Three effects:

  - scx_bpf_task_cgroup() becomes callable (was rejected by
    scx_kf_allowed(__SCX_KF_RQ_LOCKED)). Safe; rq lock is held.

  - scx_bpf_dsq_move() is now rejected (was allowed via the unlocked
    branch). Calling it while holding an unrelated task's rq lock is
    risky; rejection is correct.

  - scx_bpf_select_cpu_*() previously took the unlocked branch in
    select_cpu_from_kfunc() and called task_rq_lock(p, &rf), which
    would deadlock against the already-held pi_lock. Now it takes the
    locked-rq branch and is rejected with -EPERM via the existing
    kf_allowed(SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE) check. Latent
    deadlock fix.

No in-tree scheduler is known to call any of these from ops.cgroup_move().

v2: Add Fixes: tag (Andrea Righi).

Fixes: 18853ba782be ("sched_ext: Track currently locked rq")
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index aee48b34aefa..4d793a56d965 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4397,7 +4397,7 @@ void scx_cgroup_move_task(struct task_struct *p)
 	 */
 	if (SCX_HAS_OP(sch, cgroup_move) &&
 	    !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
-		SCX_CALL_OP_TASK(sch, SCX_KF_UNLOCKED, cgroup_move, NULL,
+		SCX_CALL_OP_TASK(sch, SCX_KF_REST, cgroup_move, task_rq(p),
 				 p, p->scx.cgrp_moving_from,
 				 tg_cgrp(task_group(p)));
 	p->scx.cgrp_moving_from = NULL;

From 0022b328504d1055be57eecf9e02c00e2ddcb0a2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 125/134] sched_ext: Decouple kfunc unlocked-context check from
 kf_mask

scx_kf_allowed_if_unlocked() uses !current->scx.kf_mask as a proxy for "no
SCX-tracked lock held". kf_mask is removed in a follow-up patch, so its two
callers - select_cpu_from_kfunc() and scx_dsq_move() - need another basis.

Add a new bool scx_rq.in_select_cpu, set across the SCX_CALL_OP_TASK_RET
that invokes ops.select_cpu(), to capture the one case where SCX itself
holds no lock but try_to_wake_up() holds @p's pi_lock. Together with
scx_locked_rq(), it expresses the same accepted-context set.

select_cpu_from_kfunc() needs a runtime test because it has to take
different locking paths depending on context. Open-code as a three-way
branch. The unlocked branch takes raw_spin_lock_irqsave(&p->pi_lock)
directly - pi_lock alone is enough for the fields the kfunc reads, and is
lighter than task_rq_lock().

scx_dsq_move() doesn't really need a runtime test - its accepted contexts
could be enforced at verifier load time. But since the runtime state is
already there and using it keeps the upcoming load-time filter simpler, just
write it the same way: (scx_locked_rq() || in_select_cpu) &&
!kf_allowed(DISPATCH).

scx_kf_allowed_if_unlocked() is deleted with the conversions.

No semantic change.

v2: s/No functional change/No semantic change/ - the unlocked path now acquires
    pi_lock instead of the heavier task_rq_lock() (Andrea Righi).

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          |  4 +++-
 kernel/sched/ext_idle.c     | 39 ++++++++++++++++---------------------
 kernel/sched/ext_internal.h |  5 -----
 kernel/sched/sched.h        |  1 +
 4 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 4d793a56d965..fa266fd7afd5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3308,10 +3308,12 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 		WARN_ON_ONCE(*ddsp_taskp);
 		*ddsp_taskp = p;
 
+		this_rq()->scx.in_select_cpu = true;
 		cpu = SCX_CALL_OP_TASK_RET(sch,
 					   SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
 					   select_cpu, NULL, p, prev_cpu,
 					   wake_flags);
+		this_rq()->scx.in_select_cpu = false;
 		p->scx.selected_cpu = cpu;
 		*ddsp_taskp = NULL;
 		if (ops_cpu_valid(sch, cpu, "from ops.select_cpu()"))
@@ -8144,7 +8146,7 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	bool in_balance;
 	unsigned long flags;
 
-	if (!scx_kf_allowed_if_unlocked() &&
+	if ((scx_locked_rq() || this_rq()->scx.in_select_cpu) &&
 	    !scx_kf_allowed(sch, SCX_KF_DISPATCH))
 		return false;
 
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 8c31fb65477c..f99ceeba2e56 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -913,8 +913,8 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
 				 s32 prev_cpu, u64 wake_flags,
 				 const struct cpumask *allowed, u64 flags)
 {
-	struct rq *rq;
-	struct rq_flags rf;
+	unsigned long irq_flags;
+	bool we_locked = false;
 	s32 cpu;
 
 	if (!ops_cpu_valid(sch, prev_cpu, NULL))
@@ -924,27 +924,22 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
 		return -EBUSY;
 
 	/*
-	 * If called from an unlocked context, acquire the task's rq lock,
-	 * so that we can safely access p->cpus_ptr and p->nr_cpus_allowed.
+	 * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq
+	 * lock or @p's pi_lock. Three cases:
 	 *
-	 * Otherwise, allow to use this kfunc only from ops.select_cpu()
-	 * and ops.select_enqueue().
+	 *  - inside ops.select_cpu(): try_to_wake_up() holds @p's pi_lock.
+	 *  - other rq-locked SCX op: scx_locked_rq() points at the held rq.
+	 *  - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops):
+	 *    nothing held, take pi_lock ourselves.
 	 */
-	if (scx_kf_allowed_if_unlocked()) {
-		rq = task_rq_lock(p, &rf);
-	} else {
-		if (!scx_kf_allowed(sch, SCX_KF_SELECT_CPU | SCX_KF_ENQUEUE))
-			return -EPERM;
-		rq = scx_locked_rq();
-	}
-
-	/*
-	 * Validate locking correctness to access p->cpus_ptr and
-	 * p->nr_cpus_allowed: if we're holding an rq lock, we're safe;
-	 * otherwise, assert that p->pi_lock is held.
-	 */
-	if (!rq)
+	if (this_rq()->scx.in_select_cpu) {
 		lockdep_assert_held(&p->pi_lock);
+	} else if (!scx_locked_rq()) {
+		raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
+		we_locked = true;
+	} else if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE)) {
+		return -EPERM;
+	}
 
 	/*
 	 * This may also be called from ops.enqueue(), so we need to handle
@@ -963,8 +958,8 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
 					 allowed ?: p->cpus_ptr, flags);
 	}
 
-	if (scx_kf_allowed_if_unlocked())
-		task_rq_unlock(rq, p, &rf);
+	if (we_locked)
+		raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
 
 	return cpu;
 }
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index b4f36d8b9c1d..54da08a223b7 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1372,11 +1372,6 @@ static inline struct rq *scx_locked_rq(void)
 	return __this_cpu_read(scx_locked_rq_state);
 }
 
-static inline bool scx_kf_allowed_if_unlocked(void)
-{
-	return !current->scx.kf_mask;
-}
-
 static inline bool scx_bypassing(struct scx_sched *sch, s32 cpu)
 {
 	return unlikely(per_cpu_ptr(sch->pcpu, cpu)->flags &
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ae0783e27c1e..0b6a177fd597 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -798,6 +798,7 @@ struct scx_rq {
 	u64			extra_enq_flags;	/* see move_task_to_local_dsq() */
 	u32			nr_running;
 	u32			cpuperf_target;		/* [0, SCHED_CAPACITY_SCALE] */
+	bool			in_select_cpu;
 	bool			cpu_released;
 	u32			flags;
 	u32			nr_immed;		/* ENQ_IMMED tasks on local_dsq */

From 2193af26a149acfb7a66f49397665640c2a60d8c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 126/134] sched_ext: Drop redundant rq-locked check from
 scx_bpf_task_cgroup()

scx_kf_allowed_on_arg_tasks() runs both an scx_kf_allowed(__SCX_KF_RQ_LOCKED)
mask check and a kf_tasks[] check. After the preceding call-site fixes,
every SCX_CALL_OP_TASK*() invocation has kf_mask & __SCX_KF_RQ_LOCKED
non-zero, so the mask check is redundant whenever the kf_tasks[] check
passes. Drop it and simplify the helper to take only @sch and @p.

Fold the locking guarantee into the SCX_CALL_OP_TASK() comment block, which
scx_bpf_task_cgroup() now points to.

No functional change.

Extracted from a larger verifier-time kfunc context filter patch
originally written by Juntong Deng.

Original-patch-by: Juntong Deng <juntong.deng@outlook.com>
Cc: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fa266fd7afd5..be5b6755f1d1 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -540,15 +540,18 @@ do {										\
 })
 
 /*
- * Some kfuncs are allowed only on the tasks that are subjects of the
- * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such
- * restrictions, the following SCX_CALL_OP_*() variants should be used when
- * invoking scx_ops operations that take task arguments. These can only be used
- * for non-nesting operations due to the way the tasks are tracked.
+ * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments
+ * and records them in current->scx.kf_tasks[] for the duration of the call. A
+ * kfunc invoked from inside such an op can then use
+ * scx_kf_allowed_on_arg_tasks() to verify that its task argument is one of
+ * those subject tasks.
  *
- * kfuncs which can only operate on such tasks can in turn use
- * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on
- * the specific task.
+ * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
+ * either via the @rq argument here, or (for ops.select_cpu()) via @p's pi_lock
+ * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
+ * kf_tasks[] is set, @p's scheduler-protected fields are stable.
+ *
+ * These macros only work for non-nesting ops since kf_tasks[] is not stacked.
  */
 #define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...)			\
 do {										\
@@ -613,12 +616,8 @@ static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
 
 /* see SCX_CALL_OP_TASK() */
 static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
-							u32 mask,
 							struct task_struct *p)
 {
-	if (!scx_kf_allowed(sch, mask))
-		return false;
-
 	if (unlikely((p != current->scx.kf_tasks[0] &&
 		      p != current->scx.kf_tasks[1]))) {
 		scx_error(sch, "called on a task not being operated on");
@@ -9535,9 +9534,8 @@ __bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
  * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with
  * from the scheduler's POV. SCX operations should use this function to
  * determine @p's current cgroup as, unlike following @p->cgroups,
- * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all
- * rq-locked operations. Can be called on the parameter tasks of rq-locked
- * operations. The restriction guarantees that @p's rq is locked by the caller.
+ * @p->sched_task_group is stable for the duration of the SCX op. See
+ * SCX_CALL_OP_TASK() for details.
  */
 __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p,
 					       const struct bpf_prog_aux *aux)
@@ -9552,7 +9550,7 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p,
 	if (unlikely(!sch))
 		goto out;
 
-	if (!scx_kf_allowed_on_arg_tasks(sch, __SCX_KF_RQ_LOCKED, p))
+	if (!scx_kf_allowed_on_arg_tasks(sch, p))
 		goto out;
 
 	cgrp = tg_cgrp(tg);

From d1d3c1c6ae3691869be9d94730d6e5325aaae8c6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 127/134] sched_ext: Add verifier-time kfunc context filter

Move enforcement of SCX context-sensitive kfunc restrictions from per-task
runtime kf_mask checks to BPF verifier-time filtering, using the BPF core's
struct_ops context information.

A shared .filter callback is attached to each context-sensitive BTF set
and consults a per-op allow table (scx_kf_allow_flags[]) indexed by SCX
ops member offset. Disallowed calls are now rejected at program load time
instead of at runtime.

The old model split reachability across two places: each SCX_CALL_OP*()
set bits naming its op context, and each kfunc's scx_kf_allowed() check
OR'd together the bits it accepted. A kfunc was callable when those two
masks overlapped. The new model transposes the result to the caller side -
each op's allow flags directly list the kfunc groups it may call. The old
bit assignments were:

  Call-site bits:
    ops.select_cpu   = ENQUEUE | SELECT_CPU
    ops.enqueue      = ENQUEUE
    ops.dispatch     = DISPATCH
    ops.cpu_release  = CPU_RELEASE

  Kfunc-group accepted bits:
    enqueue group     = ENQUEUE | DISPATCH
    select_cpu group  = SELECT_CPU | ENQUEUE
    dispatch group    = DISPATCH
    cpu_release group = CPU_RELEASE

Intersecting them yields the reachability now expressed directly by
scx_kf_allow_flags[]:

  ops.select_cpu  -> SELECT_CPU | ENQUEUE
  ops.enqueue     -> SELECT_CPU | ENQUEUE
  ops.dispatch    -> ENQUEUE | DISPATCH
  ops.cpu_release -> CPU_RELEASE

Unlocked ops carried no kf_mask bits and reached only unlocked kfuncs;
that maps directly to UNLOCKED in the new table.

Equivalence was checked by walking every (op, kfunc-group) combination
across SCX ops, SYSCALL, and non-SCX struct_ops callers against the old
scx_kf_allowed() runtime checks. With two intended exceptions (see below),
all combinations reach the same verdict; disallowed calls are now caught at
load time instead of firing scx_error() at runtime.

scx_bpf_dsq_move_set_slice() and scx_bpf_dsq_move_set_vtime() are
exceptions: they have no runtime check at all, but the new filter rejects
them from ops outside dispatch/unlocked. The affected cases are nonsensical
- the values these setters store are only read by
scx_bpf_dsq_move{,_vtime}(), which is itself restricted to
dispatch/unlocked, so a setter call from anywhere else was already dead
code.

Runtime scx_kf_mask enforcement is left in place by this patch and removed
in a follow-up.

Original-patch-by: Juntong Deng <juntong.deng@outlook.com>
Original-patch-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c          | 124 ++++++++++++++++++++++++++++++++++--
 kernel/sched/ext_idle.c     |   1 +
 kernel/sched/ext_idle.h     |   2 +
 kernel/sched/ext_internal.h |   3 +
 4 files changed, 125 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index be5b6755f1d1..edf51d91bab2 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -8133,6 +8133,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
 	.owner			= THIS_MODULE,
 	.set			= &scx_kfunc_ids_enqueue_dispatch,
+	.filter			= scx_kfunc_context_filter,
 };
 
 static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
@@ -8511,6 +8512,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
 	.owner			= THIS_MODULE,
 	.set			= &scx_kfunc_ids_dispatch,
+	.filter			= scx_kfunc_context_filter,
 };
 
 __bpf_kfunc_start_defs();
@@ -8551,6 +8553,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_cpu_release)
 static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = {
 	.owner			= THIS_MODULE,
 	.set			= &scx_kfunc_ids_cpu_release,
+	.filter			= scx_kfunc_context_filter,
 };
 
 __bpf_kfunc_start_defs();
@@ -8628,6 +8631,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
 static const struct btf_kfunc_id_set scx_kfunc_set_unlocked = {
 	.owner			= THIS_MODULE,
 	.set			= &scx_kfunc_ids_unlocked,
+	.filter			= scx_kfunc_context_filter,
 };
 
 __bpf_kfunc_start_defs();
@@ -9603,6 +9607,115 @@ static const struct btf_kfunc_id_set scx_kfunc_set_any = {
 	.set			= &scx_kfunc_ids_any,
 };
 
+/*
+ * Per-op kfunc allow flags. Each bit corresponds to a context-sensitive kfunc
+ * group; an op may permit zero or more groups, with the union expressed in
+ * scx_kf_allow_flags[]. The verifier-time filter (scx_kfunc_context_filter())
+ * consults this table to decide whether a context-sensitive kfunc is callable
+ * from a given SCX op.
+ */
+enum scx_kf_allow_flags {
+	SCX_KF_ALLOW_UNLOCKED		= 1 << 0,
+	SCX_KF_ALLOW_CPU_RELEASE	= 1 << 1,
+	SCX_KF_ALLOW_DISPATCH		= 1 << 2,
+	SCX_KF_ALLOW_ENQUEUE		= 1 << 3,
+	SCX_KF_ALLOW_SELECT_CPU		= 1 << 4,
+};
+
+/*
+ * Map each SCX op to the union of kfunc groups it permits, indexed by
+ * SCX_OP_IDX(op). Ops not listed only permit kfuncs that are not
+ * context-sensitive.
+ */
+static const u32 scx_kf_allow_flags[] = {
+	[SCX_OP_IDX(select_cpu)]	= SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE,
+	[SCX_OP_IDX(enqueue)]		= SCX_KF_ALLOW_SELECT_CPU | SCX_KF_ALLOW_ENQUEUE,
+	[SCX_OP_IDX(dispatch)]		= SCX_KF_ALLOW_ENQUEUE | SCX_KF_ALLOW_DISPATCH,
+	[SCX_OP_IDX(cpu_release)]	= SCX_KF_ALLOW_CPU_RELEASE,
+	[SCX_OP_IDX(init_task)]		= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(dump)]		= SCX_KF_ALLOW_UNLOCKED,
+#ifdef CONFIG_EXT_GROUP_SCHED
+	[SCX_OP_IDX(cgroup_init)]	= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(cgroup_exit)]	= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(cgroup_prep_move)]	= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(cgroup_cancel_move)] = SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(cgroup_set_weight)]	= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(cgroup_set_bandwidth)] = SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(cgroup_set_idle)]	= SCX_KF_ALLOW_UNLOCKED,
+#endif	/* CONFIG_EXT_GROUP_SCHED */
+	[SCX_OP_IDX(sub_attach)]	= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(sub_detach)]	= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(cpu_online)]	= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(cpu_offline)]	= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(init)]		= SCX_KF_ALLOW_UNLOCKED,
+	[SCX_OP_IDX(exit)]		= SCX_KF_ALLOW_UNLOCKED,
+};
+
+/*
+ * Verifier-time filter for context-sensitive SCX kfuncs. Registered via the
+ * .filter field on each per-group btf_kfunc_id_set. The BPF core invokes this
+ * for every kfunc call in the registered hook (BPF_PROG_TYPE_STRUCT_OPS or
+ * BPF_PROG_TYPE_SYSCALL), regardless of which set originally introduced the
+ * kfunc - so the filter must short-circuit on kfuncs it doesn't govern (e.g.
+ * scx_kfunc_ids_any) by falling through to "allow" when none of the
+ * context-sensitive sets contain the kfunc.
+ */
+int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+	bool in_unlocked = btf_id_set8_contains(&scx_kfunc_ids_unlocked, kfunc_id);
+	bool in_select_cpu = btf_id_set8_contains(&scx_kfunc_ids_select_cpu, kfunc_id);
+	bool in_enqueue = btf_id_set8_contains(&scx_kfunc_ids_enqueue_dispatch, kfunc_id);
+	bool in_dispatch = btf_id_set8_contains(&scx_kfunc_ids_dispatch, kfunc_id);
+	bool in_cpu_release = btf_id_set8_contains(&scx_kfunc_ids_cpu_release, kfunc_id);
+	u32 moff, flags;
+
+	/* Not a context-sensitive kfunc (e.g. from scx_kfunc_ids_any) - allow. */
+	if (!(in_unlocked || in_select_cpu || in_enqueue || in_dispatch || in_cpu_release))
+		return 0;
+
+	/* SYSCALL progs (e.g. BPF test_run()) may call unlocked and select_cpu kfuncs. */
+	if (prog->type == BPF_PROG_TYPE_SYSCALL)
+		return (in_unlocked || in_select_cpu) ? 0 : -EACCES;
+
+	if (prog->type != BPF_PROG_TYPE_STRUCT_OPS)
+		return -EACCES;
+
+	/*
+	 * add_subprog_and_kfunc() collects all kfunc calls, including dead code
+	 * guarded by bpf_ksym_exists(), before check_attach_btf_id() sets
+	 * prog->aux->st_ops. Allow all kfuncs when st_ops is not yet set;
+	 * do_check_main() re-runs the filter with st_ops set and enforces the
+	 * actual restrictions.
+	 */
+	if (!prog->aux->st_ops)
+		return 0;
+
+	/*
+	 * Non-SCX struct_ops: only unlocked kfuncs are safe. The other
+	 * context-sensitive kfuncs assume the rq lock is held by the SCX
+	 * dispatch path, which doesn't apply to other struct_ops users.
+	 */
+	if (prog->aux->st_ops != &bpf_sched_ext_ops)
+		return in_unlocked ? 0 : -EACCES;
+
+	/* SCX struct_ops: check the per-op allow list. */
+	moff = prog->aux->attach_st_ops_member_off;
+	flags = scx_kf_allow_flags[SCX_MOFF_IDX(moff)];
+
+	if ((flags & SCX_KF_ALLOW_UNLOCKED) && in_unlocked)
+		return 0;
+	if ((flags & SCX_KF_ALLOW_CPU_RELEASE) && in_cpu_release)
+		return 0;
+	if ((flags & SCX_KF_ALLOW_DISPATCH) && in_dispatch)
+		return 0;
+	if ((flags & SCX_KF_ALLOW_ENQUEUE) && in_enqueue)
+		return 0;
+	if ((flags & SCX_KF_ALLOW_SELECT_CPU) && in_select_cpu)
+		return 0;
+
+	return -EACCES;
+}
+
 static int __init scx_init(void)
 {
 	int ret;
@@ -9612,11 +9725,12 @@ static int __init scx_init(void)
 	 * register_btf_kfunc_id_set() needs most of the system to be up.
 	 *
 	 * Some kfuncs are context-sensitive and can only be called from
-	 * specific SCX ops. They are grouped into BTF sets accordingly.
-	 * Unfortunately, BPF currently doesn't have a way of enforcing such
-	 * restrictions. Eventually, the verifier should be able to enforce
-	 * them. For now, register them the same and make each kfunc explicitly
-	 * check using scx_kf_allowed().
+	 * specific SCX ops. They are grouped into per-context BTF sets, each
+	 * registered with scx_kfunc_context_filter as its .filter callback. The
+	 * BPF core dedups identical filter pointers per hook
+	 * (btf_populate_kfunc_set()), so the filter is invoked exactly once per
+	 * kfunc lookup; it consults scx_kf_allow_flags[] to enforce per-op
+	 * restrictions at verify time.
 	 */
 	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
 					     &scx_kfunc_set_enqueue_dispatch)) ||
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index f99ceeba2e56..ec49e0c9892e 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -1491,6 +1491,7 @@ BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
 static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
 	.owner			= THIS_MODULE,
 	.set			= &scx_kfunc_ids_select_cpu,
+	.filter			= scx_kfunc_context_filter,
 };
 
 int scx_idle_init(void)
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
index fa583f141f35..dc35f850481e 100644
--- a/kernel/sched/ext_idle.h
+++ b/kernel/sched/ext_idle.h
@@ -12,6 +12,8 @@
 
 struct sched_ext_ops;
 
+extern struct btf_id_set8 scx_kfunc_ids_select_cpu;
+
 void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
 void scx_idle_init_masks(void);
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 54da08a223b7..62ce4eaf6a3f 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -6,6 +6,7 @@
  * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
  */
 #define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+#define SCX_MOFF_IDX(moff)	((moff) / sizeof(void (*)(void)))
 
 enum scx_consts {
 	SCX_DSP_DFL_MAX_BATCH		= 32,
@@ -1363,6 +1364,8 @@ enum scx_ops_state {
 extern struct scx_sched __rcu *scx_root;
 DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
 
+int scx_kfunc_context_filter(const struct bpf_prog *prog, u32 kfunc_id);
+
 /*
  * Return the rq currently locked from an scx callback, or NULL if no rq is
  * locked.

From 7cd9a5d7d4b75802b97aa89f6f53375a6d84d1d5 Mon Sep 17 00:00:00 2001
From: Cheng-Yang Chou <yphbchou0911@gmail.com>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 128/134] sched_ext: Remove runtime kfunc mask enforcement

Now that scx_kfunc_context_filter enforces context-sensitive kfunc
restrictions at BPF load time, the per-task runtime enforcement via
scx_kf_mask is redundant. Remove it entirely:

 - Delete enum scx_kf_mask, the kf_mask field on sched_ext_entity, and
   the scx_kf_allow()/scx_kf_disallow()/scx_kf_allowed() helpers along
   with the higher_bits()/highest_bit() helpers they used.
 - Strip the @mask parameter (and the BUILD_BUG_ON checks) from the
   SCX_CALL_OP[_RET]/SCX_CALL_OP_TASK[_RET]/SCX_CALL_OP_2TASKS_RET
   macros and update every call site. Reflow call sites that were
   wrapped only to fit the old 5-arg form and now collapse onto a single
   line under ~100 cols.
 - Remove the in-kfunc scx_kf_allowed() runtime checks from
   scx_dsq_insert_preamble(), scx_dsq_move(), scx_bpf_dispatch_nr_slots(),
   scx_bpf_dispatch_cancel(), scx_bpf_dsq_move_to_local___v2(),
   scx_bpf_sub_dispatch(), scx_bpf_reenqueue_local(), and the per-call
   guard inside select_cpu_from_kfunc().

scx_bpf_task_cgroup() and scx_kf_allowed_on_arg_tasks() were already
cleaned up in the "drop redundant rq-locked check" patch.
scx_kf_allowed_if_unlocked() was rewritten in the preceding "decouple"
patch. No further changes to those helpers here.

Co-developed-by: Juntong Deng <juntong.deng@outlook.com>
Signed-off-by: Juntong Deng <juntong.deng@outlook.com>
Signed-off-by: Cheng-Yang Chou <yphbchou0911@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
---
 include/linux/sched/ext.h |  28 -----
 kernel/sched/ext.c        | 244 +++++++++-----------------------------
 kernel/sched/ext_idle.c   |   4 +-
 3 files changed, 58 insertions(+), 218 deletions(-)

diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index 602dc83cab36..1a3af2ea2a79 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -147,33 +147,6 @@ enum scx_ent_dsq_flags {
 	SCX_TASK_DSQ_ON_PRIQ	= 1 << 0, /* task is queued on the priority queue of a dsq */
 };
 
-/*
- * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
- * everywhere and the following bits track which kfunc sets are currently
- * allowed for %current. This simple per-task tracking works because SCX ops
- * nest in a limited way. BPF will likely implement a way to allow and disallow
- * kfuncs depending on the calling context which will replace this manual
- * mechanism. See scx_kf_allow().
- */
-enum scx_kf_mask {
-	SCX_KF_UNLOCKED		= 0,	  /* sleepable and not rq locked */
-	/* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
-	SCX_KF_CPU_RELEASE	= 1 << 0, /* ops.cpu_release() */
-	/*
-	 * ops.dispatch() may release rq lock temporarily and thus ENQUEUE and
-	 * SELECT_CPU may be nested inside. ops.dequeue (in REST) may also be
-	 * nested inside DISPATCH.
-	 */
-	SCX_KF_DISPATCH		= 1 << 1, /* ops.dispatch() */
-	SCX_KF_ENQUEUE		= 1 << 2, /* ops.enqueue() and ops.select_cpu() */
-	SCX_KF_SELECT_CPU	= 1 << 3, /* ops.select_cpu() */
-	SCX_KF_REST		= 1 << 4, /* other rq-locked operations */
-
-	__SCX_KF_RQ_LOCKED	= SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
-				  SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
-	__SCX_KF_TERMINAL	= SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
-};
-
 enum scx_dsq_lnode_flags {
 	SCX_DSQ_LNODE_ITER_CURSOR = 1 << 0,
 
@@ -221,7 +194,6 @@ struct sched_ext_entity {
 	s32			sticky_cpu;
 	s32			holding_cpu;
 	s32			selected_cpu;
-	u32			kf_mask;	/* see scx_kf_mask above */
 	struct task_struct	*kf_tasks[2];	/* see SCX_CALL_OP_TASK() */
 
 	struct list_head	runnable_node;	/* rq->scx.runnable_list */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index edf51d91bab2..d37418a684e9 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -229,19 +229,6 @@ static long jiffies_delta_msecs(unsigned long at, unsigned long now)
 		return -(long)jiffies_to_msecs(now - at);
 }
 
-/* if the highest set bit is N, return a mask with bits [N+1, 31] set */
-static u32 higher_bits(u32 flags)
-{
-	return ~((1 << fls(flags)) - 1);
-}
-
-/* return the mask with only the highest bit set */
-static u32 highest_bit(u32 flags)
-{
-	int bit = fls(flags);
-	return ((u64)1 << bit) >> 1;
-}
-
 static bool u32_before(u32 a, u32 b)
 {
 	return (s32)(a - b) < 0;
@@ -462,30 +449,6 @@ static bool rq_is_open(struct rq *rq, u64 enq_flags)
 	return false;
 }
 
-/*
- * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
- * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
- * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
- * whether it's running from an allowed context.
- *
- * @mask is constant, always inline to cull the mask calculations.
- */
-static __always_inline void scx_kf_allow(u32 mask)
-{
-	/* nesting is allowed only in increasing scx_kf_mask order */
-	WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
-		  "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
-		  current->scx.kf_mask, mask);
-	current->scx.kf_mask |= mask;
-	barrier();
-}
-
-static void scx_kf_disallow(u32 mask)
-{
-	barrier();
-	current->scx.kf_mask &= ~mask;
-}
-
 /*
  * Track the rq currently locked.
  *
@@ -506,34 +469,22 @@ static inline void update_locked_rq(struct rq *rq)
 	__this_cpu_write(scx_locked_rq_state, rq);
 }
 
-#define SCX_CALL_OP(sch, mask, op, rq, args...)					\
+#define SCX_CALL_OP(sch, op, rq, args...)					\
 do {										\
 	if (rq)									\
 		update_locked_rq(rq);						\
-	if (mask) {								\
-		scx_kf_allow(mask);						\
-		(sch)->ops.op(args);						\
-		scx_kf_disallow(mask);						\
-	} else {								\
-		(sch)->ops.op(args);						\
-	}									\
+	(sch)->ops.op(args);							\
 	if (rq)									\
 		update_locked_rq(NULL);						\
 } while (0)
 
-#define SCX_CALL_OP_RET(sch, mask, op, rq, args...)				\
+#define SCX_CALL_OP_RET(sch, op, rq, args...)					\
 ({										\
 	__typeof__((sch)->ops.op(args)) __ret;					\
 										\
 	if (rq)									\
 		update_locked_rq(rq);						\
-	if (mask) {								\
-		scx_kf_allow(mask);						\
-		__ret = (sch)->ops.op(args);					\
-		scx_kf_disallow(mask);						\
-	} else {								\
-		__ret = (sch)->ops.op(args);					\
-	}									\
+	__ret = (sch)->ops.op(args);						\
 	if (rq)									\
 		update_locked_rq(NULL);						\
 	__ret;									\
@@ -553,67 +504,33 @@ do {										\
  *
  * These macros only work for non-nesting ops since kf_tasks[] is not stacked.
  */
-#define SCX_CALL_OP_TASK(sch, mask, op, rq, task, args...)			\
+#define SCX_CALL_OP_TASK(sch, op, rq, task, args...)				\
 do {										\
-	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
 	current->scx.kf_tasks[0] = task;					\
-	SCX_CALL_OP((sch), mask, op, rq, task, ##args);				\
+	SCX_CALL_OP((sch), op, rq, task, ##args);				\
 	current->scx.kf_tasks[0] = NULL;					\
 } while (0)
 
-#define SCX_CALL_OP_TASK_RET(sch, mask, op, rq, task, args...)			\
+#define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...)			\
 ({										\
 	__typeof__((sch)->ops.op(task, ##args)) __ret;				\
-	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
 	current->scx.kf_tasks[0] = task;					\
-	__ret = SCX_CALL_OP_RET((sch), mask, op, rq, task, ##args);		\
+	__ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args);			\
 	current->scx.kf_tasks[0] = NULL;					\
 	__ret;									\
 })
 
-#define SCX_CALL_OP_2TASKS_RET(sch, mask, op, rq, task0, task1, args...)	\
+#define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...)		\
 ({										\
 	__typeof__((sch)->ops.op(task0, task1, ##args)) __ret;			\
-	BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL);				\
 	current->scx.kf_tasks[0] = task0;					\
 	current->scx.kf_tasks[1] = task1;					\
-	__ret = SCX_CALL_OP_RET((sch), mask, op, rq, task0, task1, ##args);	\
+	__ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args);		\
 	current->scx.kf_tasks[0] = NULL;					\
 	current->scx.kf_tasks[1] = NULL;					\
 	__ret;									\
 })
 
-/* @mask is constant, always inline to cull unnecessary branches */
-static __always_inline bool scx_kf_allowed(struct scx_sched *sch, u32 mask)
-{
-	if (unlikely(!(current->scx.kf_mask & mask))) {
-		scx_error(sch, "kfunc with mask 0x%x called from an operation only allowing 0x%x",
-			  mask, current->scx.kf_mask);
-		return false;
-	}
-
-	/*
-	 * Enforce nesting boundaries. e.g. A kfunc which can be called from
-	 * DISPATCH must not be called if we're running DEQUEUE which is nested
-	 * inside ops.dispatch(). We don't need to check boundaries for any
-	 * blocking kfuncs as the verifier ensures they're only called from
-	 * sleepable progs.
-	 */
-	if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE &&
-		     (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) {
-		scx_error(sch, "cpu_release kfunc called from a nested operation");
-		return false;
-	}
-
-	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
-		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
-		scx_error(sch, "dispatch kfunc called from a nested operation");
-		return false;
-	}
-
-	return true;
-}
-
 /* see SCX_CALL_OP_TASK() */
 static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
 							struct task_struct *p)
@@ -1461,7 +1378,7 @@ static void call_task_dequeue(struct scx_sched *sch, struct rq *rq,
 		return;
 
 	if (SCX_HAS_OP(sch, dequeue))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, dequeue, rq, p, deq_flags);
+		SCX_CALL_OP_TASK(sch, dequeue, rq, p, deq_flags);
 
 	p->scx.flags &= ~SCX_TASK_IN_CUSTODY;
 }
@@ -1920,7 +1837,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	WARN_ON_ONCE(*ddsp_taskp);
 	*ddsp_taskp = p;
 
-	SCX_CALL_OP_TASK(sch, SCX_KF_ENQUEUE, enqueue, rq, p, enq_flags);
+	SCX_CALL_OP_TASK(sch, enqueue, rq, p, enq_flags);
 
 	*ddsp_taskp = NULL;
 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -2024,7 +1941,7 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int core_enq_
 	add_nr_running(rq, 1);
 
 	if (SCX_HAS_OP(sch, runnable) && !task_on_rq_migrating(p))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, runnable, rq, p, enq_flags);
+		SCX_CALL_OP_TASK(sch, runnable, rq, p, enq_flags);
 
 	if (enq_flags & SCX_ENQ_WAKEUP)
 		touch_core_sched(rq, p);
@@ -2141,11 +2058,11 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int core_deq_
 	 */
 	if (SCX_HAS_OP(sch, stopping) && task_current(rq, p)) {
 		update_curr_scx(rq);
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, false);
+		SCX_CALL_OP_TASK(sch, stopping, rq, p, false);
 	}
 
 	if (SCX_HAS_OP(sch, quiescent) && !task_on_rq_migrating(p))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, quiescent, rq, p, deq_flags);
+		SCX_CALL_OP_TASK(sch, quiescent, rq, p, deq_flags);
 
 	if (deq_flags & SCX_DEQ_SLEEP)
 		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
@@ -2167,7 +2084,7 @@ static void yield_task_scx(struct rq *rq)
 	struct scx_sched *sch = scx_task_sched(p);
 
 	if (SCX_HAS_OP(sch, yield))
-		SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
+		SCX_CALL_OP_2TASKS_RET(sch, yield, rq, p, NULL);
 	else
 		p->scx.slice = 0;
 }
@@ -2178,8 +2095,7 @@ static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
 	struct scx_sched *sch = scx_task_sched(from);
 
 	if (SCX_HAS_OP(sch, yield) && sch == scx_task_sched(to))
-		return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
-					      from, to);
+		return SCX_CALL_OP_2TASKS_RET(sch, yield, rq, from, to);
 	else
 		return false;
 }
@@ -2799,20 +2715,11 @@ scx_dispatch_sched(struct scx_sched *sch, struct rq *rq,
 		dspc->nr_tasks = 0;
 
 		if (nested) {
-			/*
-			 * If nested, don't update kf_mask as the originating
-			 * invocation would already have set it up.
-			 */
-			SCX_CALL_OP(sch, 0, dispatch, rq, cpu,
-				    prev_on_sch ? prev : NULL);
+			SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
 		} else {
-			/*
-			 * If not nested, stash @prev so that nested invocations
-			 * can access it.
-			 */
+			/* stash @prev so that nested invocations can access it */
 			rq->scx.sub_dispatch_prev = prev;
-			SCX_CALL_OP(sch, SCX_KF_DISPATCH, dispatch, rq, cpu,
-				    prev_on_sch ? prev : NULL);
+			SCX_CALL_OP(sch, dispatch, rq, cpu, prev_on_sch ? prev : NULL);
 			rq->scx.sub_dispatch_prev = NULL;
 		}
 
@@ -2871,7 +2778,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * emitted in switch_class().
 		 */
 		if (SCX_HAS_OP(sch, cpu_acquire))
-			SCX_CALL_OP(sch, SCX_KF_REST, cpu_acquire, rq, cpu, NULL);
+			SCX_CALL_OP(sch, cpu_acquire, rq, cpu, NULL);
 		rq->scx.cpu_released = false;
 	}
 
@@ -2950,7 +2857,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 
 	/* see dequeue_task_scx() on why we skip when !QUEUED */
 	if (SCX_HAS_OP(sch, running) && (p->scx.flags & SCX_TASK_QUEUED))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, running, rq, p);
+		SCX_CALL_OP_TASK(sch, running, rq, p);
 
 	clr_task_runnable(p, true);
 
@@ -3022,8 +2929,7 @@ static void switch_class(struct rq *rq, struct task_struct *next)
 				.task = next,
 			};
 
-			SCX_CALL_OP(sch, SCX_KF_CPU_RELEASE, cpu_release, rq,
-				    cpu_of(rq), &args);
+			SCX_CALL_OP(sch, cpu_release, rq, cpu_of(rq), &args);
 		}
 		rq->scx.cpu_released = true;
 	}
@@ -3041,7 +2947,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
 
 	/* see dequeue_task_scx() on why we skip when !QUEUED */
 	if (SCX_HAS_OP(sch, stopping) && (p->scx.flags & SCX_TASK_QUEUED))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, stopping, rq, p, true);
+		SCX_CALL_OP_TASK(sch, stopping, rq, p, true);
 
 	if (p->scx.flags & SCX_TASK_QUEUED) {
 		set_task_runnable(rq, p);
@@ -3271,7 +3177,7 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
 	 */
 	if (sch_a == sch_b && SCX_HAS_OP(sch_a, core_sched_before) &&
 	    !scx_bypassing(sch_a, task_cpu(a)))
-		return SCX_CALL_OP_2TASKS_RET(sch_a, SCX_KF_REST, core_sched_before,
+		return SCX_CALL_OP_2TASKS_RET(sch_a, core_sched_before,
 					      NULL,
 					      (struct task_struct *)a,
 					      (struct task_struct *)b);
@@ -3308,10 +3214,7 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
 		*ddsp_taskp = p;
 
 		this_rq()->scx.in_select_cpu = true;
-		cpu = SCX_CALL_OP_TASK_RET(sch,
-					   SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
-					   select_cpu, NULL, p, prev_cpu,
-					   wake_flags);
+		cpu = SCX_CALL_OP_TASK_RET(sch, select_cpu, NULL, p, prev_cpu, wake_flags);
 		this_rq()->scx.in_select_cpu = false;
 		p->scx.selected_cpu = cpu;
 		*ddsp_taskp = NULL;
@@ -3361,8 +3264,7 @@ static void set_cpus_allowed_scx(struct task_struct *p,
 	 * designation pointless. Cast it away when calling the operation.
 	 */
 	if (SCX_HAS_OP(sch, set_cpumask))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, task_rq(p),
-				 p, (struct cpumask *)p->cpus_ptr);
+		SCX_CALL_OP_TASK(sch, set_cpumask, task_rq(p), p, (struct cpumask *)p->cpus_ptr);
 }
 
 static void handle_hotplug(struct rq *rq, bool online)
@@ -3384,9 +3286,9 @@ static void handle_hotplug(struct rq *rq, bool online)
 		scx_idle_update_selcpu_topology(&sch->ops);
 
 	if (online && SCX_HAS_OP(sch, cpu_online))
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_online, NULL, cpu);
+		SCX_CALL_OP(sch, cpu_online, NULL, cpu);
 	else if (!online && SCX_HAS_OP(sch, cpu_offline))
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cpu_offline, NULL, cpu);
+		SCX_CALL_OP(sch, cpu_offline, NULL, cpu);
 	else
 		scx_exit(sch, SCX_EXIT_UNREG_KERN,
 			 SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG,
@@ -3504,7 +3406,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
 		curr->scx.slice = 0;
 		touch_core_sched(rq, curr);
 	} else if (SCX_HAS_OP(sch, tick)) {
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, tick, rq, curr);
+		SCX_CALL_OP_TASK(sch, tick, rq, curr);
 	}
 
 	if (!curr->scx.slice)
@@ -3580,8 +3482,7 @@ static int __scx_init_task(struct scx_sched *sch, struct task_struct *p, bool fo
 			.fork = fork,
 		};
 
-		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init_task, NULL,
-				      p, &args);
+		ret = SCX_CALL_OP_RET(sch, init_task, NULL, p, &args);
 		if (unlikely(ret)) {
 			ret = ops_sanitize_err(sch, "init_task", ret);
 			return ret;
@@ -3662,11 +3563,10 @@ static void __scx_enable_task(struct scx_sched *sch, struct task_struct *p)
 	p->scx.weight = sched_weight_to_cgroup(weight);
 
 	if (SCX_HAS_OP(sch, enable))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, enable, rq, p);
+		SCX_CALL_OP_TASK(sch, enable, rq, p);
 
 	if (SCX_HAS_OP(sch, set_weight))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
-				 p, p->scx.weight);
+		SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight);
 }
 
 static void scx_enable_task(struct scx_sched *sch, struct task_struct *p)
@@ -3685,7 +3585,7 @@ static void scx_disable_task(struct scx_sched *sch, struct task_struct *p)
 	clear_direct_dispatch(p);
 
 	if (SCX_HAS_OP(sch, disable))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, disable, rq, p);
+		SCX_CALL_OP_TASK(sch, disable, rq, p);
 	scx_set_task_state(p, SCX_TASK_READY);
 
 	/*
@@ -3723,8 +3623,7 @@ static void __scx_disable_and_exit_task(struct scx_sched *sch,
 	}
 
 	if (SCX_HAS_OP(sch, exit_task))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, exit_task, task_rq(p),
-				 p, &args);
+		SCX_CALL_OP_TASK(sch, exit_task, task_rq(p), p, &args);
 }
 
 static void scx_disable_and_exit_task(struct scx_sched *sch,
@@ -3903,8 +3802,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 
 	p->scx.weight = sched_weight_to_cgroup(scale_load_down(lw->weight));
 	if (SCX_HAS_OP(sch, set_weight))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_weight, rq,
-				 p, p->scx.weight);
+		SCX_CALL_OP_TASK(sch, set_weight, rq, p, p->scx.weight);
 }
 
 static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
@@ -3925,8 +3823,7 @@ static void switching_to_scx(struct rq *rq, struct task_struct *p)
 	 * different scheduler class. Keep the BPF scheduler up-to-date.
 	 */
 	if (SCX_HAS_OP(sch, set_cpumask))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, set_cpumask, rq,
-				 p, (struct cpumask *)p->cpus_ptr);
+		SCX_CALL_OP_TASK(sch, set_cpumask, rq, p, (struct cpumask *)p->cpus_ptr);
 }
 
 static void switched_from_scx(struct rq *rq, struct task_struct *p)
@@ -4309,7 +4206,7 @@ int scx_tg_online(struct task_group *tg)
 				  .bw_quota_us = tg->scx.bw_quota_us,
 				  .bw_burst_us = tg->scx.bw_burst_us };
 
-			ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init,
+			ret = SCX_CALL_OP_RET(sch, cgroup_init,
 					      NULL, tg->css.cgroup, &args);
 			if (ret)
 				ret = ops_sanitize_err(sch, "cgroup_init", ret);
@@ -4331,8 +4228,7 @@ void scx_tg_offline(struct task_group *tg)
 
 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_exit) &&
 	    (tg->scx.flags & SCX_TG_INITED))
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
-			    tg->css.cgroup);
+		SCX_CALL_OP(sch, cgroup_exit, NULL, tg->css.cgroup);
 	tg->scx.flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED);
 }
 
@@ -4361,8 +4257,7 @@ int scx_cgroup_can_attach(struct cgroup_taskset *tset)
 			continue;
 
 		if (SCX_HAS_OP(sch, cgroup_prep_move)) {
-			ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED,
-					      cgroup_prep_move, NULL,
+			ret = SCX_CALL_OP_RET(sch, cgroup_prep_move, NULL,
 					      p, from, css->cgroup);
 			if (ret)
 				goto err;
@@ -4377,7 +4272,7 @@ err:
 	cgroup_taskset_for_each(p, css, tset) {
 		if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
 		    p->scx.cgrp_moving_from)
-			SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+			SCX_CALL_OP(sch, cgroup_cancel_move, NULL,
 				    p, p->scx.cgrp_moving_from, css->cgroup);
 		p->scx.cgrp_moving_from = NULL;
 	}
@@ -4398,7 +4293,7 @@ void scx_cgroup_move_task(struct task_struct *p)
 	 */
 	if (SCX_HAS_OP(sch, cgroup_move) &&
 	    !WARN_ON_ONCE(!p->scx.cgrp_moving_from))
-		SCX_CALL_OP_TASK(sch, SCX_KF_REST, cgroup_move, task_rq(p),
+		SCX_CALL_OP_TASK(sch, cgroup_move, task_rq(p),
 				 p, p->scx.cgrp_moving_from,
 				 tg_cgrp(task_group(p)));
 	p->scx.cgrp_moving_from = NULL;
@@ -4416,7 +4311,7 @@ void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_for_each(p, css, tset) {
 		if (SCX_HAS_OP(sch, cgroup_cancel_move) &&
 		    p->scx.cgrp_moving_from)
-			SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_cancel_move, NULL,
+			SCX_CALL_OP(sch, cgroup_cancel_move, NULL,
 				    p, p->scx.cgrp_moving_from, css->cgroup);
 		p->scx.cgrp_moving_from = NULL;
 	}
@@ -4430,8 +4325,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
 
 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_weight) &&
 	    tg->scx.weight != weight)
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_weight, NULL,
-			    tg_cgrp(tg), weight);
+		SCX_CALL_OP(sch, cgroup_set_weight, NULL, tg_cgrp(tg), weight);
 
 	tg->scx.weight = weight;
 
@@ -4445,8 +4339,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle)
 	percpu_down_read(&scx_cgroup_ops_rwsem);
 
 	if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL,
-			    tg_cgrp(tg), idle);
+		SCX_CALL_OP(sch, cgroup_set_idle, NULL, tg_cgrp(tg), idle);
 
 	/* Update the task group's idle state */
 	tg->scx.idle = idle;
@@ -4465,7 +4358,7 @@ void scx_group_set_bandwidth(struct task_group *tg,
 	    (tg->scx.bw_period_us != period_us ||
 	     tg->scx.bw_quota_us != quota_us ||
 	     tg->scx.bw_burst_us != burst_us))
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_bandwidth, NULL,
+		SCX_CALL_OP(sch, cgroup_set_bandwidth, NULL,
 			    tg_cgrp(tg), period_us, quota_us, burst_us);
 
 	tg->scx.bw_period_us = period_us;
@@ -4690,8 +4583,7 @@ static void scx_cgroup_exit(struct scx_sched *sch)
 		if (!sch->ops.cgroup_exit)
 			continue;
 
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_exit, NULL,
-			    css->cgroup);
+		SCX_CALL_OP(sch, cgroup_exit, NULL, css->cgroup);
 	}
 }
 
@@ -4722,7 +4614,7 @@ static int scx_cgroup_init(struct scx_sched *sch)
 			continue;
 		}
 
-		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL,
+		ret = SCX_CALL_OP_RET(sch, cgroup_init, NULL,
 				      css->cgroup, &args);
 		if (ret) {
 			scx_error(sch, "ops.cgroup_init() failed (%d)", ret);
@@ -5795,12 +5687,12 @@ static void scx_sub_disable(struct scx_sched *sch)
 			.ops = &sch->ops,
 			.cgroup_path = sch->cgrp_path,
 		};
-		SCX_CALL_OP(parent, SCX_KF_UNLOCKED, sub_detach, NULL,
+		SCX_CALL_OP(parent, sub_detach, NULL,
 			    &sub_detach_args);
 	}
 
 	if (sch->ops.exit)
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, sch->exit_info);
+		SCX_CALL_OP(sch, exit, NULL, sch->exit_info);
 	kobject_del(&sch->kobj);
 }
 #else	/* CONFIG_EXT_SUB_SCHED */
@@ -5915,7 +5807,7 @@ static void scx_root_disable(struct scx_sched *sch)
 	}
 
 	if (sch->ops.exit)
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, exit, NULL, ei);
+		SCX_CALL_OP(sch, exit, NULL, ei);
 
 	scx_unlink_sched(sch);
 
@@ -6178,7 +6070,7 @@ static void scx_dump_task(struct scx_sched *sch,
 
 	if (SCX_HAS_OP(sch, dump_task)) {
 		ops_dump_init(s, "    ");
-		SCX_CALL_OP(sch, SCX_KF_REST, dump_task, NULL, dctx, p);
+		SCX_CALL_OP(sch, dump_task, NULL, dctx, p);
 		ops_dump_exit();
 	}
 
@@ -6242,7 +6134,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
 
 	if (SCX_HAS_OP(sch, dump)) {
 		ops_dump_init(&s, "");
-		SCX_CALL_OP(sch, SCX_KF_UNLOCKED, dump, NULL, &dctx);
+		SCX_CALL_OP(sch, dump, NULL, &dctx);
 		ops_dump_exit();
 	}
 
@@ -6302,7 +6194,7 @@ static void scx_dump_state(struct scx_sched *sch, struct scx_exit_info *ei,
 		used = seq_buf_used(&ns);
 		if (SCX_HAS_OP(sch, dump_cpu)) {
 			ops_dump_init(&ns, "  ");
-			SCX_CALL_OP(sch, SCX_KF_REST, dump_cpu, NULL,
+			SCX_CALL_OP(sch, dump_cpu, NULL,
 				    &dctx, cpu, idle);
 			ops_dump_exit();
 		}
@@ -6748,7 +6640,7 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 	scx_idle_enable(ops);
 
 	if (sch->ops.init) {
-		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
+		ret = SCX_CALL_OP_RET(sch, init, NULL);
 		if (ret) {
 			ret = ops_sanitize_err(sch, "init", ret);
 			cpus_read_unlock();
@@ -7020,7 +6912,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 	}
 
 	if (sch->ops.init) {
-		ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, init, NULL);
+		ret = SCX_CALL_OP_RET(sch, init, NULL);
 		if (ret) {
 			ret = ops_sanitize_err(sch, "init", ret);
 			scx_error(sch, "ops.init() failed (%d)", ret);
@@ -7037,7 +6929,7 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 		.cgroup_path = sch->cgrp_path,
 	};
 
-	ret = SCX_CALL_OP_RET(parent, SCX_KF_UNLOCKED, sub_attach, NULL,
+	ret = SCX_CALL_OP_RET(parent, sub_attach, NULL,
 			      &sub_attach_args);
 	if (ret) {
 		ret = ops_sanitize_err(sch, "sub_attach", ret);
@@ -7891,9 +7783,6 @@ static bool scx_vet_enq_flags(struct scx_sched *sch, u64 dsq_id, u64 *enq_flags)
 static bool scx_dsq_insert_preamble(struct scx_sched *sch, struct task_struct *p,
 				    u64 dsq_id, u64 *enq_flags)
 {
-	if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
-		return false;
-
 	lockdep_assert_irqs_disabled();
 
 	if (unlikely(!p)) {
@@ -8146,10 +8035,6 @@ static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit,
 	bool in_balance;
 	unsigned long flags;
 
-	if ((scx_locked_rq() || this_rq()->scx.in_select_cpu) &&
-	    !scx_kf_allowed(sch, SCX_KF_DISPATCH))
-		return false;
-
 	if (!scx_vet_enq_flags(sch, dsq_id, &enq_flags))
 		return false;
 
@@ -8244,9 +8129,6 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(const struct bpf_prog_aux *aux)
 	if (unlikely(!sch))
 		return 0;
 
-	if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
-		return 0;
-
 	return sch->dsp_max_batch - __this_cpu_read(sch->pcpu->dsp_ctx.cursor);
 }
 
@@ -8268,9 +8150,6 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(const struct bpf_prog_aux *aux)
 	if (unlikely(!sch))
 		return;
 
-	if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
-		return;
-
 	dspc = &this_cpu_ptr(sch->pcpu)->dsp_ctx;
 
 	if (dspc->cursor > 0)
@@ -8317,9 +8196,6 @@ __bpf_kfunc bool scx_bpf_dsq_move_to_local___v2(u64 dsq_id, u64 enq_flags,
 	if (unlikely(!sch))
 		return false;
 
-	if (!scx_kf_allowed(sch, SCX_KF_DISPATCH))
-		return false;
-
 	if (!scx_vet_enq_flags(sch, SCX_DSQ_LOCAL, &enq_flags))
 		return false;
 
@@ -8473,9 +8349,6 @@ __bpf_kfunc bool scx_bpf_sub_dispatch(u64 cgroup_id, const struct bpf_prog_aux *
 	if (unlikely(!parent))
 		return false;
 
-	if (!scx_kf_allowed(parent, SCX_KF_DISPATCH))
-		return false;
-
 	child = scx_find_sub_sched(cgroup_id);
 
 	if (unlikely(!child))
@@ -8535,9 +8408,6 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(const struct bpf_prog_aux *aux)
 	if (unlikely(!sch))
 		return 0;
 
-	if (!scx_kf_allowed(sch, SCX_KF_CPU_RELEASE))
-		return 0;
-
 	rq = cpu_rq(smp_processor_id());
 	lockdep_assert_rq_held(rq);
 
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index ec49e0c9892e..443d12a3df67 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -789,7 +789,7 @@ void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
 	 */
 	if (SCX_HAS_OP(sch, update_idle) && do_notify &&
 	    !scx_bypassing(sch, cpu_of(rq)))
-		SCX_CALL_OP(sch, SCX_KF_REST, update_idle, rq, cpu_of(rq), idle);
+		SCX_CALL_OP(sch, update_idle, rq, cpu_of(rq), idle);
 }
 
 static void reset_idle_masks(struct sched_ext_ops *ops)
@@ -937,8 +937,6 @@ static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
 	} else if (!scx_locked_rq()) {
 		raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
 		we_locked = true;
-	} else if (!scx_kf_allowed(sch, SCX_KF_ENQUEUE)) {
-		return -EPERM;
 	}
 
 	/*

From 979a98b6e9bf8ebf11dc3ca260be087606ac4c2c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 129/134] sched_ext: Rename scx_kf_allowed_on_arg_tasks() to
 scx_kf_arg_task_ok()

The "kf_allowed" framing on this helper comes from the old runtime
scx_kf_allowed() gate, which has been removed. Rename it to describe what it
actually does in the new model.

Pure rename, no functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d37418a684e9..40421698c9e3 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -494,7 +494,7 @@ do {										\
  * SCX_CALL_OP_TASK*() invokes an SCX op that takes one or two task arguments
  * and records them in current->scx.kf_tasks[] for the duration of the call. A
  * kfunc invoked from inside such an op can then use
- * scx_kf_allowed_on_arg_tasks() to verify that its task argument is one of
+ * scx_kf_arg_task_ok() to verify that its task argument is one of
  * those subject tasks.
  *
  * Every SCX_CALL_OP_TASK*() call site invokes its op with @p's rq lock held -
@@ -532,7 +532,7 @@ do {										\
 })
 
 /* see SCX_CALL_OP_TASK() */
-static __always_inline bool scx_kf_allowed_on_arg_tasks(struct scx_sched *sch,
+static __always_inline bool scx_kf_arg_task_ok(struct scx_sched *sch,
 							struct task_struct *p)
 {
 	if (unlikely((p != current->scx.kf_tasks[0] &&
@@ -9424,7 +9424,7 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p,
 	if (unlikely(!sch))
 		goto out;
 
-	if (!scx_kf_allowed_on_arg_tasks(sch, p))
+	if (!scx_kf_arg_task_ok(sch, p))
 		goto out;
 
 	cgrp = tg_cgrp(tg);

From e719e17d99aaf3922dbc15ae3ac3bb62fac32bad Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 07:54:06 -1000
Subject: [PATCH 130/134] sched_ext: Warn on task-based SCX op recursion

The kf_tasks[] design assumes task-based SCX ops don't nest - if they
did, kf_tasks[0] would get clobbered. The old scx_kf_allow() WARN_ONCE
caught invalid nesting via kf_mask, but that machinery is gone now.

Add a WARN_ON_ONCE(current->scx.kf_tasks[0]) at the top of each
SCX_CALL_OP_TASK*() macro. Checking kf_tasks[0] alone is sufficient: all
three variants (SCX_CALL_OP_TASK, SCX_CALL_OP_TASK_RET,
SCX_CALL_OP_2TASKS_RET) write to kf_tasks[0], so a non-NULL value at
entry to any of the three means re-entry from somewhere in the family.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Andrea Righi <arighi@nvidia.com>
---
 kernel/sched/ext.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 40421698c9e3..b8dbae251fd5 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -502,10 +502,13 @@ do {										\
  * held by try_to_wake_up() with rq tracking via scx_rq.in_select_cpu. So if
  * kf_tasks[] is set, @p's scheduler-protected fields are stable.
  *
- * These macros only work for non-nesting ops since kf_tasks[] is not stacked.
+ * kf_tasks[] can not stack, so task-based SCX ops must not nest. The
+ * WARN_ON_ONCE() in each macro catches a re-entry of any of the three variants
+ * while a previous one is still in progress.
  */
 #define SCX_CALL_OP_TASK(sch, op, rq, task, args...)				\
 do {										\
+	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
 	current->scx.kf_tasks[0] = task;					\
 	SCX_CALL_OP((sch), op, rq, task, ##args);				\
 	current->scx.kf_tasks[0] = NULL;					\
@@ -514,6 +517,7 @@ do {										\
 #define SCX_CALL_OP_TASK_RET(sch, op, rq, task, args...)			\
 ({										\
 	__typeof__((sch)->ops.op(task, ##args)) __ret;				\
+	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
 	current->scx.kf_tasks[0] = task;					\
 	__ret = SCX_CALL_OP_RET((sch), op, rq, task, ##args);			\
 	current->scx.kf_tasks[0] = NULL;					\
@@ -523,6 +527,7 @@ do {										\
 #define SCX_CALL_OP_2TASKS_RET(sch, op, rq, task0, task1, args...)		\
 ({										\
 	__typeof__((sch)->ops.op(task0, task1, ##args)) __ret;			\
+	WARN_ON_ONCE(current->scx.kf_tasks[0]);					\
 	current->scx.kf_tasks[0] = task0;					\
 	current->scx.kf_tasks[1] = task1;					\
 	__ret = SCX_CALL_OP_RET((sch), op, rq, task0, task1, ##args);		\

From 49d78adf9555bbc02ccb65a28325e3e57e9c52ed Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 10 Apr 2026 10:19:22 -1000
Subject: [PATCH 131/134] sched_ext: Drop spurious warning on kick during
 scheduler disable

kick_cpus_irq_workfn() warns when scx_kick_syncs is NULL, but this can
legitimately happen when a BPF timer or other kick source races with
free_kick_syncs() during scheduler disable. Drop the pr_warn_once() and
add a comment explaining the race.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Zhao Mengmeng <zhaomengmeng@kylinos.cn>
---
 kernel/sched/ext.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index b8dbae251fd5..012ca8bd70fb 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -7600,10 +7600,9 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 	unsigned long *ksyncs;
 	s32 cpu;
 
-	if (unlikely(!ksyncs_pcpu)) {
-		pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_syncs");
+	/* can race with free_kick_syncs() during scheduler disable */
+	if (unlikely(!ksyncs_pcpu))
 		return;
-	}
 
 	ksyncs = rcu_dereference_bh(ksyncs_pcpu)->syncs;
 

From 3d3667f265148d856bc6eb54d1bd780a94e38da7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 12 Apr 2026 17:30:52 -1000
Subject: [PATCH 132/134] tools/sched_ext: Kick home CPU for stranded tasks in
 scx_qmap

scx_qmap uses global BPF queue maps (BPF_MAP_TYPE_QUEUE) that any CPU's
ops.dispatch() can pop from. When a CPU pops a task that can't run on it
(e.g. a pinned per-CPU kthread), it inserts the task into SHARED_DSQ.
consume_dispatch_q() then skips the task due to affinity mismatch, leaving it
stranded until some CPU in its allowed mask calls ops.dispatch(). This doesn't
cause indefinite stalls -- the periodic tick keeps firing (can_stop_idle_tick()
returns false when softirq is pending) -- but can cause noticeable scheduling
delays.

After inserting to SHARED_DSQ, kick the task's home CPU if this CPU can't run
it. There's a small race window where the home CPU can enter idle before the
kick lands -- if a per-CPU kthread like ksoftirqd is the stranded task, this
can trigger a "NOHZ tick-stop error" warning. The kick arrives shortly after
and the home CPU drains the task.

Rather than fully eliminating the warning by routing pinned tasks to local or
global DSQs, the current code keeps them going through the normal BPF queue
path and documents the race and the resulting warning in detail. scx_qmap is an
example scheduler and having tasks go through the usual dispatch path is useful
for testing. The detailed comment also serves as a reference for other
schedulers that may encounter similar warnings.

Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/scx_qmap.bpf.c | 40 ++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index f3587fb709c9..b68abb9e760b 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -471,6 +471,46 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
 			__sync_fetch_and_add(&nr_dispatched, 1);
 
 			scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
+
+			/*
+			 * scx_qmap uses a global BPF queue that any CPU's
+			 * dispatch can pop from. If this CPU popped a task that
+			 * can't run here, it gets stranded on SHARED_DSQ after
+			 * consume_dispatch_q() skips it. Kick the task's home
+			 * CPU so it drains SHARED_DSQ.
+			 *
+			 * There's a race between the pop and the flush of the
+			 * buffered dsq_insert:
+			 *
+			 *  CPU 0 (dispatching)      CPU 1 (home, idle)
+			 *  ~~~~~~~~~~~~~~~~~~~      ~~~~~~~~~~~~~~~~~~~
+			 *  pop from BPF queue
+			 *  dsq_insert(buffered)
+			 *                           balance:
+			 *                             SHARED_DSQ empty
+			 *                             BPF queue empty
+			 *                             -> goes idle
+			 *  flush -> on SHARED
+			 *  kick CPU 1
+			 *                           wakes, drains task
+			 *
+			 * The kick prevents indefinite stalls but a per-CPU
+			 * kthread like ksoftirqd can be briefly stranded when
+			 * its home CPU enters idle with softirq pending,
+			 * triggering:
+			 *
+			 *  "NOHZ tick-stop error: local softirq work is pending, handler #N!!!"
+			 *
+			 * from report_idle_softirq(). The kick lands shortly
+			 * after and the home CPU drains the task. This could be
+			 * avoided by e.g. dispatching pinned tasks to local or
+			 * global DSQs, but the current code is left as-is to
+			 * document this class of issue -- other schedulers
+			 * seeing similar warnings can use this as a reference.
+			 */
+			if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
+				scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);
+
 			bpf_task_release(p);
 
 			batch--;

From 4615361f0b148c172852590e6245a953cc075b73 Mon Sep 17 00:00:00 2001
From: Kuba Piecuch <jpiecuch@google.com>
Date: Mon, 13 Apr 2026 12:49:02 +0000
Subject: [PATCH 133/134] sched_ext: Make string params of __ENUM_set() const

A small change to improve type safety/const correctness.
__COMPAT_read_enum() already has const string parameters.

It fixes a warning when using the header in C++ code:

  error: ISO C++11 does not allow conversion from string literal
         to 'char *' [-Werror,-Wwritable-strings]

That's because string literals have type char[N] in C and
const char[N] in C++.

Signed-off-by: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/include/scx/enums.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/sched_ext/include/scx/enums.h b/tools/sched_ext/include/scx/enums.h
index 8e7c91575f0b..c3b09acce824 100644
--- a/tools/sched_ext/include/scx/enums.h
+++ b/tools/sched_ext/include/scx/enums.h
@@ -9,7 +9,7 @@
 #ifndef __SCX_ENUMS_H
 #define __SCX_ENUMS_H
 
-static inline void __ENUM_set(u64 *val, char *type, char *name)
+static inline void __ENUM_set(u64 *val, const char *type, const char *name)
 {
 	bool res;
 

From 7e311bafb9ad3a4711c08c00b09fb7839ada37f0 Mon Sep 17 00:00:00 2001
From: Kuba Piecuch <jpiecuch@google.com>
Date: Mon, 13 Apr 2026 15:50:05 +0000
Subject: [PATCH 134/134] tools/sched_ext: Add explicit cast from void* in
 RESIZE_ARRAY()

This fixes the following compilation error when using the header from
C++ code:

  error: assigning to 'struct scx_flux__data_uei_dump *' from
  incompatible type 'void *'

Signed-off-by: Kuba Piecuch <jpiecuch@google.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 tools/sched_ext/include/scx/common.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
index 823251fc4715..60f5513787d6 100644
--- a/tools/sched_ext/include/scx/common.h
+++ b/tools/sched_ext/include/scx/common.h
@@ -67,6 +67,7 @@ typedef int64_t s64;
 		bpf_map__set_value_size((__skel)->maps.elfsec##_##arr,			\
 				sizeof((__skel)->elfsec##_##arr->arr[0]) * (n));	\
 		(__skel)->elfsec##_##arr =						\
+			(typeof((__skel)->elfsec##_##arr))				\
 			bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz);	\
 	} while (0)