From 517a44d18537ef8ab888f71197c80116c14cee0a Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 8 Dec 2025 19:23:19 +0800 Subject: [PATCH 1/7] sched_ext: Fix the memleak for sch->helper objects This commit use kthread_destroy_worker() to release sch->helper objects to fix the following kmemleak: unreferenced object 0xffff888121ec7b00 (size 128): comm "scx_simple", pid 1197, jiffies 4295884415 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 ad 4e ad de .............N.. ff ff ff ff 00 00 00 00 ff ff ff ff ff ff ff ff ................ backtrace (crc 587b3352): kmemleak_alloc+0x62/0xa0 __kmalloc_cache_noprof+0x28d/0x3e0 kthread_create_worker_on_node+0xd5/0x1f0 scx_enable.isra.210+0x6c2/0x25b0 bpf_scx_reg+0x12/0x20 bpf_struct_ops_link_create+0x2c3/0x3b0 __sys_bpf+0x3102/0x4b00 __x64_sys_bpf+0x79/0xc0 x64_sys_call+0x15d9/0x1dd0 do_syscall_64+0xf0/0x470 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: bff3b5aec1b7 ("sched_ext: Move disable machinery into scx_sched") Cc: stable@vger.kernel.org # v6.16+ Signed-off-by: Zqiang Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 05f5a49e9649..073b669869cb 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3575,7 +3575,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work) int node; irq_work_sync(&sch->error_irq_work); - kthread_stop(sch->helper->task); + kthread_destroy_worker(sch->helper); free_percpu(sch->pcpu); @@ -4786,7 +4786,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) return sch; err_stop_helper: - kthread_stop(sch->helper->task); + kthread_destroy_worker(sch->helper); err_free_pcpu: free_percpu(sch->pcpu); err_free_gdsqs: From 12b5cd99a05f7cbc2ceb88b3b9601d404ef2236a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 6 Dec 2025 02:22:03 +0000 Subject: [PATCH 2/7] sched/ext: Avoid null ptr traversal when ->put_prev_task() is called with NULL next Early when trying to get sched_ext and proxy-exe working together, I kept tripping over NULL ptr in put_prev_task_scx() on the line: if (sched_class_above(&ext_sched_class, next->sched_class)) { Which was due to put_prev_task() passes a NULL next, calling: prev->sched_class->put_prev_task(rq, prev, NULL); put_prev_task_scx() already guards for a NULL next in the switch_class case, but doesn't seem to have a guard for sched_class_above() check. I can't say I understand why this doesn't trip usually without proxy-exec. And in newer kernels there are way fewer put_prev_task(), and I can't easily reproduce the issue now even with proxy-exec. But we still have one put_prev_task() call left in core.c that seems like it could trip this, so I wanted to send this out for consideration. tj: put_prev_task() can be called with NULL @next; however, when @p is queued, that doesn't happen, so this condition shouldn't currently be triggerable. The connection isn't straightforward or necessarily reliable, so add the NULL check even if it can't currently be triggered. Link: http://lkml.kernel.org/r/20251206022218.1541878-1-jstultz@google.com Signed-off-by: John Stultz Reviewed-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 073b669869cb..bd74b371f52d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2402,7 +2402,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, * ops.enqueue() that @p is the only one available for this cpu, * which should trigger an explicit follow-up scheduling event. */ - if (sched_class_above(&ext_sched_class, next->sched_class)) { + if (next && sched_class_above(&ext_sched_class, next->sched_class)) { WARN_ON_ONCE(!(sch->ops.flags & SCX_OPS_ENQ_LAST)); do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); } else { From 9f769637a93fac81689b80df6855f545839cf999 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Dec 2025 11:04:33 -1000 Subject: [PATCH 3/7] sched_ext: Fix bypass depth leak on scx_enable() failure scx_enable() calls scx_bypass(true) to initialize in bypass mode and then scx_bypass(false) on success to exit. If scx_enable() fails during task initialization - e.g. scx_cgroup_init() or scx_init_task() returns an error - it jumps to err_disable while bypass is still active. scx_disable_workfn() then calls scx_bypass(true/false) for its own bypass, leaving the bypass depth at 1 instead of 0. This causes the system to remain permanently in bypass mode after a failed scx_enable(). Failures after task initialization is complete - e.g. scx_tryset_enable_state() at the end - already call scx_bypass(false) before reaching the error path and are not affected. This only affects a subset of failure modes. Fix it by tracking whether scx_enable() called scx_bypass(true) in a bool and having scx_disable_workfn() call an extra scx_bypass(false) to clear it. This is a temporary measure as the bypass depth will be moved into the sched instance, which will make this tracking unnecessary. Fixes: 8c2090c504e9 ("sched_ext: Initialize in bypass mode") Cc: stable@vger.kernel.org # v6.12+ Reported-by: Chris Mason Reviewed-by: Emil Tsalapatis Link: https://lore.kernel.org/stable/286e6f7787a81239e1ce2989b52391ce%40kernel.org Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index bd74b371f52d..c4465ccefea4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -41,6 +41,13 @@ static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); +/* + * Tracks whether scx_enable() called scx_bypass(true). Used to balance bypass + * depth on enable failure. Will be removed when bypass depth is moved into the + * sched instance. + */ +static bool scx_bypassed_for_enable; + static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); @@ -4318,6 +4325,11 @@ static void scx_disable_workfn(struct kthread_work *work) scx_dsp_max_batch = 0; free_kick_syncs(); + if (scx_bypassed_for_enable) { + scx_bypassed_for_enable = false; + scx_bypass(false); + } + mutex_unlock(&scx_enable_mutex); WARN_ON_ONCE(scx_set_enable_state(SCX_DISABLED) != SCX_DISABLING); @@ -4970,6 +4982,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) * Init in bypass mode to guarantee forward progress. */ scx_bypass(true); + scx_bypassed_for_enable = true; for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) if (((void (**)(void))ops)[i]) @@ -5067,6 +5080,7 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); + scx_bypassed_for_enable = false; scx_bypass(false); if (!scx_tryset_enable_state(SCX_ENABLED, SCX_ENABLING)) { From 530b6637c79e728d58f1d9b66bd4acf4b735b86d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 11 Dec 2025 15:45:03 -1000 Subject: [PATCH 4/7] sched_ext: Factor out local_dsq_post_enq() from dispatch_enqueue() Factor out local_dsq_post_enq() which performs post-enqueue handling for local DSQs - triggering resched_curr() if SCX_ENQ_PREEMPT is specified or if the current CPU is idle. No functional change. This will be used by the next patch to fix move_local_task_to_local_dsq(). Cc: stable@vger.kernel.org # v6.12+ Reviewed-by: Andrea Righi Reviewed-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c4465ccefea4..c78efa99406f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -982,6 +982,22 @@ static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } +static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p, + u64 enq_flags) +{ + struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); + bool preempt = false; + + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && + rq->curr->sched_class == &ext_sched_class) { + rq->curr->scx.slice = 0; + preempt = true; + } + + if (preempt || sched_class_above(&ext_sched_class, rq->curr->sched_class)) + resched_curr(rq); +} + static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, struct task_struct *p, u64 enq_flags) { @@ -1093,22 +1109,10 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, if (enq_flags & SCX_ENQ_CLEAR_OPSS) atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); - if (is_local) { - struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); - bool preempt = false; - - if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && - rq->curr->sched_class == &ext_sched_class) { - rq->curr->scx.slice = 0; - preempt = true; - } - - if (preempt || sched_class_above(&ext_sched_class, - rq->curr->sched_class)) - resched_curr(rq); - } else { + if (is_local) + local_dsq_post_enq(dsq, p, enq_flags); + else raw_spin_unlock(&dsq->lock); - } } static void task_unlink_from_dsq(struct task_struct *p, From f5e1e5ec204da11fa87fdf006d451d80ce06e118 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 11 Dec 2025 15:45:04 -1000 Subject: [PATCH 5/7] sched_ext: Fix missing post-enqueue handling in move_local_task_to_local_dsq() move_local_task_to_local_dsq() is used when moving a task from a non-local DSQ to a local DSQ on the same CPU. It directly manipulates the local DSQ without going through dispatch_enqueue() and was missing the post-enqueue handling that triggers preemption when SCX_ENQ_PREEMPT is set or the idle task is running. The function is used by move_task_between_dsqs() which backs scx_bpf_dsq_move() and may be called while the CPU is busy. Add local_dsq_post_enq() call to move_local_task_to_local_dsq(). As the dispatch path doesn't need post-enqueue handling, add SCX_RQ_IN_BALANCE early exit to keep consume_dispatch_q() behavior unchanged and avoid triggering unnecessary resched when scx_bpf_dsq_move() is used from the dispatch path. Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") Cc: stable@vger.kernel.org # v6.12+ Reviewed-by: Andrea Righi Reviewed-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c78efa99406f..695503a2f7d1 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -988,6 +988,14 @@ static void local_dsq_post_enq(struct scx_dispatch_q *dsq, struct task_struct *p struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); bool preempt = false; + /* + * If @rq is in balance, the CPU is already vacant and looking for the + * next task to run. No need to preempt or trigger resched after moving + * @p into its local DSQ. + */ + if (rq->scx.flags & SCX_RQ_IN_BALANCE) + return; + if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && rq->curr->sched_class == &ext_sched_class) { rq->curr->scx.slice = 0; @@ -1636,6 +1644,8 @@ static void move_local_task_to_local_dsq(struct task_struct *p, u64 enq_flags, dsq_mod_nr(dst_dsq, 1); p->scx.dsq = dst_dsq; + + local_dsq_post_enq(dst_dsq, p, enq_flags); } /** From 579a3297b268f0281644ead7ff574a2b4bc64d3c Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Fri, 5 Dec 2025 13:23:14 -0800 Subject: [PATCH 6/7] selftests/sched_ext: flush stdout before test to avoid log spam The sched_ext selftests runner runs each test in the same process, with each test possibly forking multiple times. When the main runner has not flushed its stdout, the children inherit the buffered output for previous tests and emit it during exit. This causes log spam. Make sure stdout/stderr is fully flushed before each test. Cc: Ihor Solodrai Signed-off-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- tools/testing/selftests/sched_ext/runner.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c index aa2d7d32dda9..5748d2c69903 100644 --- a/tools/testing/selftests/sched_ext/runner.c +++ b/tools/testing/selftests/sched_ext/runner.c @@ -46,6 +46,14 @@ static void print_test_preamble(const struct scx_test *test, bool quiet) if (!quiet) printf("DESCRIPTION: %s\n", test->description); printf("OUTPUT:\n"); + + /* + * The tests may fork with the preamble buffered + * in the children's stdout. Flush before the test + * to avoid printing the message multiple times. + */ + fflush(stdout); + fflush(stderr); } static const char *status_to_result(enum scx_test_status status) From bb27226f0d00588ac53be8825e021ae80aa43371 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 15 Dec 2025 19:29:40 +0800 Subject: [PATCH 7/7] sched_ext: Remove unused code in the do_pick_task_scx() The kick_idle variable is no longer used, this commit therefore remove it and also remove associated code in the do_pick_task_scx(). Signed-off-by: Zqiang Reviewed-by: Andrea Righi Reviewed-by: Emil Tsalapatis Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 695503a2f7d1..94164f2dec6d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2446,7 +2446,7 @@ static struct task_struct * do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) { struct task_struct *prev = rq->curr; - bool keep_prev, kick_idle = false; + bool keep_prev; struct task_struct *p; /* see kick_cpus_irq_workfn() */ @@ -2488,12 +2488,8 @@ do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) refill_task_slice_dfl(rcu_dereference_sched(scx_root), p); } else { p = first_local_task(rq); - if (!p) { - if (kick_idle) - scx_kick_cpu(rcu_dereference_sched(scx_root), - cpu_of(rq), SCX_KICK_IDLE); + if (!p) return NULL; - } if (unlikely(!p->scx.slice)) { struct scx_sched *sch = rcu_dereference_sched(scx_root);