From dde3763365d80398d1465214458d0c38cc32de9c Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 3 Dec 2025 18:19:14 +0000
Subject: [PATCH 1/5] sched/headers: Remove whitespace noise from
 kernel/sched/sched.h

A single case of space-Tab noise snuck in recently.

Fixes: 36569780b0d6 ("sched: Change nr_uninterruptible type to unsigned long")
Signed-off-by: Ingo Molnar
Link: https://patch.msgid.link/176478595428.498.13816176784792752599.tip-bot2@tip-bot2
---
 kernel/sched/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8590113e4a60..cb80666addec 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1165,7 +1165,7 @@ struct rq {
 	 * one CPU and if it got migrated afterwards it may decrease
 	 * it on another CPU. Always updated under the runqueue lock:
 	 */
-	unsigned long 		nr_uninterruptible;
+	unsigned long		nr_uninterruptible;
 
 #ifdef CONFIG_SCHED_PROXY_EXEC
 	struct task_struct __rcu	*donor; /* Scheduling context */

From e38e5299747b23015b00b0109891815db44a2f30 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 1 Sep 2025 22:46:29 +0200
Subject: [PATCH 2/5] sched/hrtick: Fix hrtick() vs. scheduling context

The sched_class::task_tick() method is called on the donor sched_class,
and sched_tick() hands it rq->donor as argument, which is consistent.

However, while hrtick() uses the donor sched_class, it then passes
rq->curr, which is inconsistent. Fix it.

Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Acked-by: John Stultz
Link: https://patch.msgid.link/20250918080205.442967033@infradead.org
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fc358c1b6ca9..1711e9e50100 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -878,7 +878,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 
 	rq_lock(rq, &rf);
 	update_rq_clock(rq);
-	rq->donor->sched_class->task_tick(rq, rq->curr, 1);
+	rq->donor->sched_class->task_tick(rq, rq->donor, 1);
 	rq_unlock(rq, &rf);
 
 	return HRTIMER_NORESTART;

From 22abd832776b1317ae4c3f8a097c8b71bf83fb38 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior
Date: Thu, 27 Nov 2025 16:55:29 +0100
Subject: [PATCH 3/5] sched/rt: Remove a preempt-disable section in
 rt_mutex_setprio()

rt_mutex_setprio() has only one caller: rt_mutex_adjust_prio(). It
expects that task_struct::pi_lock and rt_mutex_base::wait_lock are
held. Both locks are raw_spinlock_t and are acquired with interrupts
disabled.

Nevertheless rt_mutex_setprio() disables preemption while invoking
__balance_callbacks() and raw_spin_rq_unlock(). Even if one of the
balance callbacks unlocks the rq, it must not enable interrupts because
rt_mutex_base::wait_lock is still locked. Therefore interrupts remain
disabled either way and disabling preemption is not needed.

Commit 4c9a4bc89a9cc ("sched: Allow balance callbacks for
check_class_changed()") added a preempt-disable section to
rt_mutex_setprio() and __sched_setscheduler(). In __sched_setscheduler()
preemption is disabled before the rq is unlocked and interrupts are
enabled, but I don't see why that would make a difference for
rt_mutex_setprio().

Remove the preempt_disable() section from rt_mutex_setprio().
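
For reference, the single call site establishes exactly the locking
context described above; roughly (a simplified sketch of
rt_mutex_adjust_prio(), from memory rather than a verbatim copy of
kernel/locking/rtmutex.c):

	static void rt_mutex_adjust_prio(struct rt_mutex_base *lock,
					 struct task_struct *p)
	{
		struct task_struct *pi_task = NULL;

		/* Both raw_spinlock_t, acquired with interrupts disabled: */
		lockdep_assert_held(&lock->wait_lock);
		lockdep_assert_held(&p->pi_lock);

		if (task_has_pi_waiters(p))
			pi_task = task_top_pi_waiter(p)->task;

		rt_mutex_setprio(p, pi_task);
	}

So by the time rt_mutex_setprio() runs, interrupts are off and stay off
across __balance_callbacks(); preemption cannot happen regardless of the
explicit preempt_disable()/preempt_enable() pair.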
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Link: https://patch.msgid.link/20251127155529.t_sTatE4@linutronix.de
---
 kernel/sched/core.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1711e9e50100..ee7dfbf01792 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7352,15 +7352,12 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		p->prio = prio;
 	}
 out_unlock:
-	/* Avoid rq from going away on us: */
-	preempt_disable();
+	/* Caller holds task_struct::pi_lock, IRQs are still disabled */
 	rq_unpin_lock(rq, &rf);
 	__balance_callbacks(rq);
 	rq_repin_lock(rq, &rf);
 
 	__task_rq_unlock(rq, p, &rf);
-
-	preempt_enable();
 }
 #endif /* CONFIG_RT_MUTEXES */
 

From ca125231dd29fc0678dd3622e9cdea80a51dffe4 Mon Sep 17 00:00:00 2001
From: xupengbo
Date: Wed, 27 Aug 2025 10:22:07 +0800
Subject: [PATCH 4/5] sched/fair: Fix unfairness caused by stalled
 tg_load_avg_contrib when the last task migrates out

When a task is migrated out, there is a chance that tg->load_avg is left
with a stale value. The reason is as follows:

1. Due to the 1ms update period limitation in update_tg_load_avg(),
   there is a possibility that the reduced load_avg is not propagated
   to tg->load_avg when a task migrates out.

2. Even though __update_blocked_fair() traverses the leaf_cfs_rq_list
   and calls update_tg_load_avg() for cfs_rqs that are not fully
   decayed, the key function cfs_rq_is_decayed() does not check whether
   cfs_rq->tg_load_avg_contrib is zero. Consequently, in some cases,
   __update_blocked_fair() removes cfs_rqs whose avg.load_avg has not
   been propagated to tg->load_avg.

Add a check of cfs_rq->tg_load_avg_contrib to cfs_rq_is_decayed(), which
fixes case (2) above.

Fixes: 1528c661c24b ("sched/fair: Ratelimit update to tg->load_avg")
Signed-off-by: xupengbo
Signed-off-by: Peter Zijlstra (Intel)
Signed-off-by: Ingo Molnar
Reviewed-by: Aaron Lu
Reviewed-by: Vincent Guittot
Tested-by: Aaron Lu
Link: https://patch.msgid.link/20250827022208.14487-1-xupengbo@oppo.com
---
 kernel/sched/fair.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 769d7b7990df..da46c3164537 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4034,6 +4034,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
 	if (child_cfs_rq_on_list(cfs_rq))
 		return false;
 
+	if (cfs_rq->tg_load_avg_contrib)
+		return false;
+
 	return true;
 }
 

From c2ae8b0df2d1bb7a063f9e356e4e9a06cd4afe11 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Fri, 5 Dec 2025 01:27:09 +0000
Subject: [PATCH 5/5] sched/core: Fix psi_dequeue() for Proxy Execution

Currently, if the sleep flag is set, psi_dequeue() doesn't change any of
the psi_flags. This is because psi_task_switch() will clear TSK_ONCPU as
well as other potential flags (TSK_RUNNING), and the assumption is that
a voluntary sleep always consists of a task being dequeued, followed
shortly thereafter by a psi_sched_switch() call.

Proxy Execution changes this expectation, as mutex-blocked tasks that
would normally sleep stay on the runqueue. But in the case where the
mutex-owning task goes to sleep, or the owner is on a remote CPU, we
will then deactivate the blocked task shortly after. In that situation,
the mutex-blocked task will have had its TSK_ONCPU cleared when it was
switched off the CPU, but it will stay TSK_RUNNING.
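
To make that concrete, the PSI state of the mutex-blocked task B evolves
roughly as follows (a simplified sketch; flag names as used by the PSI
code, with TSK_RUNNING being the value 4 seen in the warning quoted
further below):

	/* B blocks on the mutex but stays queued as the donor:         */
	/*   B->psi_flags == TSK_ONCPU | TSK_RUNNING                     */

	/* __schedule() switches to the proxy; psi_task_switch() moves   */
	/* TSK_ONCPU away from B but leaves TSK_RUNNING set:             */
	/*   B->psi_flags == TSK_RUNNING                                  */

	/* B is later deactivated with DEQUEUE_SLEEP and no task switch  */
	/* follows, so nothing ever clears the remaining TSK_RUNNING.    */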
Then if we later dequeue it (as currently done when we hit a case
find_proxy_task() can't yet handle, such as the owner being on another
rq or a sleeping owner), psi_dequeue() won't change any state (leaving
it TSK_RUNNING), as it incorrectly expects a psi_task_switch() call to
immediately follow. Later on, when the task gets woken/re-enqueued and
psi_flags is set for TSK_RUNNING, we hit an error because the task is
already TSK_RUNNING:

  psi: inconsistent task state! task=188:kworker/28:0 cpu=28 psi_flags=4 clear=0 set=4

To resolve this, extend the logic in psi_dequeue() so that if the sleep
flag is set, we also check whether psi_flags has TSK_ONCPU set (meaning
the psi_task_switch() is imminent) before taking the shortcut return.
If TSK_ONCPU is not set, that means we've already switched away, and
this psi_dequeue() call needs to clear the flags.

Fixes: be41bde4c3a8 ("sched: Add an initial sketch of the find_proxy_task() function")
Reported-by: K Prateek Nayak
Closes: https://lore.kernel.org/lkml/20251117185550.365156-1-kprateek.nayak@amd.com/
Signed-off-by: John Stultz
Signed-off-by: Ingo Molnar
Tested-by: K Prateek Nayak
Tested-by: Haiyue Wang
Acked-by: Johannes Weiner
Link: https://patch.msgid.link/20251205012721.756394-1-jstultz@google.com
---
 kernel/sched/stats.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index cbf7206b3f9d..c903f1a42891 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -180,8 +180,13 @@ static inline void psi_dequeue(struct task_struct *p, int flags)
 	 * avoid walking all ancestors twice, psi_task_switch() handles
 	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
 	 * Do nothing here.
+	 *
+	 * In the SCHED_PROXY_EXECUTION case we may do sleeping
+	 * dequeues that are not followed by a task switch, so check
+	 * TSK_ONCPU is set to ensure the task switch is imminent.
+	 * Otherwise clear the flags as usual.
 	 */
-	if (flags & DEQUEUE_SLEEP)
+	if ((flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU))
 		return;
 
 	/*
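
For clarity, the check added by this hunk can be read as the following
helper (an illustrative, hypothetical function; it is not part of the
patch or of kernel/sched/stats.h, whose remaining body is not shown in
the hunk above):

	/* Hypothetical helper: should psi_dequeue() defer flag clearing? */
	static inline bool psi_defer_to_task_switch(struct task_struct *p, int flags)
	{
		/*
		 * Defer only when the task is still on the CPU, i.e. a
		 * psi_task_switch() really is imminent. A proxy-execution
		 * style sleeping dequeue of an already switched-out task
		 * (TSK_ONCPU clear) must clear its PSI flags right away.
		 */
		return (flags & DEQUEUE_SLEEP) && (p->psi_flags & TSK_ONCPU);
	}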