From 3d7e10188ae0b68dadd60f611ca81ecf9d991f77 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 May 2025 18:26:21 +0200 Subject: [PATCH 01/84] sched: Make clangd usable Due to the weird Makefile setup of sched the various files do not compile as stand alone units. The new generation of editors are trying to do just this -- mostly to offer fancy things like completions but also better syntax highlighting and code navigation. Specifically, I've been playing around with neovim and clangd. Setting up clangd on the kernel source is a giant pain in the arse (this really should be improved), but once you do manage, you run into dumb stuff like the above. Fix up the scheduler files to at least pretend to work. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Ingo Molnar Tested-by: Juri Lelli Link: https://lkml.kernel.org/r/20250523164348.GN39944@noisy.programming.kicks-ass.net --- kernel/sched/autogroup.c | 3 +++ kernel/sched/autogroup.h | 2 ++ kernel/sched/clock.c | 3 +++ kernel/sched/completion.c | 5 +++++ kernel/sched/core_sched.c | 2 ++ kernel/sched/cpuacct.c | 2 ++ kernel/sched/cpudeadline.c | 1 + kernel/sched/cpudeadline.h | 2 ++ kernel/sched/cpufreq.c | 1 + kernel/sched/cpufreq_schedutil.c | 2 ++ kernel/sched/cpupri.c | 1 + kernel/sched/cpupri.h | 3 +++ kernel/sched/cputime.c | 3 +++ kernel/sched/deadline.c | 4 ++++ kernel/sched/debug.c | 3 +++ kernel/sched/idle.c | 5 +++++ kernel/sched/isolation.c | 2 ++ kernel/sched/loadavg.c | 2 ++ kernel/sched/membarrier.c | 2 ++ kernel/sched/pelt.c | 1 + kernel/sched/pelt.h | 7 ++++++- kernel/sched/psi.c | 4 ++++ kernel/sched/rt.c | 3 +++ kernel/sched/sched-pelt.h | 1 + kernel/sched/sched.h | 1 + kernel/sched/smp.h | 7 +++++++ kernel/sched/stats.c | 1 + kernel/sched/stop_task.c | 1 + kernel/sched/swait.c | 1 + kernel/sched/topology.c | 2 ++ kernel/sched/wait.c | 1 + kernel/sched/wait_bit.c | 3 +++ 32 files changed, 80 insertions(+), 1 deletion(-) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2b331822c7e7..e96a167c3f28 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -4,6 +4,9 @@ * Auto-group scheduling implementation: */ +#include "autogroup.h" +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 90d69f2c5eaf..5c1796a463af 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -2,6 +2,8 @@ #ifndef _KERNEL_SCHED_AUTOGROUP_H #define _KERNEL_SCHED_AUTOGROUP_H +#include "sched.h" + #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup { diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index a09655b48140..e62b5510cd2e 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -54,6 +54,9 @@ * */ +#include +#include "sched.h" + /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 3561ab533dd4..19ee702273c0 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -13,6 +13,11 @@ * Waiting for completion is a typically sync point, but not an exclusion point. 
*/ +#include +#include +#include +#include "sched.h" + static void complete_with_flags(struct completion *x, int wake_flags) { unsigned long flags; diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index c4606ca89210..9ede71ecba7f 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -4,6 +4,8 @@ * A simple wrapper around refcount. An allocated sched_core_cookie's * address is used to compute the cookie of the task. */ +#include "sched.h" + struct sched_core_cookie { refcount_t refcnt; }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 0de9dda09949..23a56ba12d81 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -6,6 +6,8 @@ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh * (balbir@in.ibm.com). */ +#include +#include "sched.h" /* Time spent by the tasks of the CPU accounting group executing in ... */ enum cpuacct_stat_index { diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 95baa12a1029..cdd740b3f774 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -6,6 +6,7 @@ * * Author: Juri Lelli */ +#include "sched.h" static inline int parent(int i) { diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 0adeda93b5fb..3f7c73d1d189 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include +#include #define IDX_INVALID -1 diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 5252fb191fae..742fb9e62e1a 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -5,6 +5,7 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki */ +#include "sched.h" DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 461242ec958a..3d7e9ccb99a0 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -5,6 +5,8 @@ * Copyright (C) 2016, Intel Corporation * Author: Rafael J. Wysocki */ +#include +#include "sched.h" #define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8) diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 42c40cfdf836..76a9ac5eb794 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -22,6 +22,7 @@ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. 
*/ +#include "sched.h" /* * p->rt_priority p->prio newpri cpupri diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index d6cba0020064..f0f5a734c0c1 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,4 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 6dab4854c6c0..f01f17acb4a8 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -2,6 +2,9 @@ /* * Simple CPU accounting cgroup controller */ +#include +#include +#include "sched.h" #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ad45a8fea245..ff5be80c5c9d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,10 @@ */ #include +#include +#include +#include "sched.h" +#include "pelt.h" /* * Default limits for DL period; on the top end we guard against small util diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9d71baf08075..b384124d89d3 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -6,6 +6,9 @@ * * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar */ +#include +#include +#include "sched.h" /* * This allows printing both to /sys/kernel/debug/sched/debug and diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 2c85c86b455f..cd3298653ef4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -6,6 +6,11 @@ * (NOTE: these are not related to SCHED_IDLE batch scheduled * tasks which are handled in sched/fair.c ) */ +#include +#include +#include +#include "sched.h" +#include "smp.h" /* Linker adds these: start and end of __cpuidle functions */ extern char __cpuidle_text_start[], __cpuidle_text_end[]; diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 93b038d48900..a4cf17b1fab0 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c @@ -7,6 +7,8 @@ * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker * */ +#include +#include "sched.h" enum hk_flags { HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index c48900b856a2..f6df84ca6925 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -6,6 +6,8 @@ * figure. Its a silly number but people think its important. We go through * great pains to make it work on big machines and tickless kernels. 
*/ +#include +#include "sched.h" /* * Global load-average calculations diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 809194cd779f..62fba83b7bb1 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -4,6 +4,8 @@ * * membarrier system call */ +#include +#include "sched.h" /* * For documentation purposes, here are some membarrier ordering diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 7a8534a2deff..09be6a83e45a 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -23,6 +23,7 @@ * Move PELT related code from fair.c into this pelt.c file * Author: Vincent Guittot */ +#include "pelt.h" /* * Approximate: diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index f4f6a0875c66..19592077452e 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -1,3 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _KERNEL_SCHED_PELT_H +#define _KERNEL_SCHED_PELT_H +#include "sched.h" + #ifdef CONFIG_SMP #include "sched-pelt.h" @@ -233,4 +238,4 @@ update_idle_rq_clock_pelt(struct rq *rq) { } static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } #endif - +#endif /* _KERNEL_SCHED_PELT_H */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ad04a5c3162a..333a7ba39668 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -136,6 +136,10 @@ * cost-wise, yet way more sensitive and accurate than periodic * sampling of the aggregate task states would be. */ +#include +#include +#include +#include "sched.h" static int psi_bug __read_mostly; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e40422c37033..16008ac33e14 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -4,6 +4,9 @@ * policies) */ +#include "sched.h" +#include "pelt.h" + int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index c529706bed11..6803cfec7a1e 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* Generated by Documentation/scheduler/sched-pelt; do not modify. 
*/ +#include static const u32 runnable_avg_yN_inv[] __maybe_unused = { 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 475bb5998295..f3a41487c96d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -69,6 +69,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 21ac44428bb0..7f151d96dba9 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -1,8 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _KERNEL_SCHED_SMP_H +#define _KERNEL_SCHED_SMP_H + /* * Scheduler internal SMP callback types and methods between the scheduler * and other internal parts of the core kernel: */ +#include extern void sched_ttwu_pending(void *arg); @@ -13,3 +18,5 @@ extern void flush_smp_call_function_queue(void); #else static inline void flush_smp_call_function_queue(void) { } #endif + +#endif /* _KERNEL_SCHED_SMP_H */ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 4346fd81c31f..1faea875d0b3 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -2,6 +2,7 @@ /* * /proc/schedstat implementation */ +#include "sched.h" void __update_stats_wait_start(struct rq *rq, struct task_struct *p, struct sched_statistics *stats) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 058dd42e3d9b..1c1bbc6e33b6 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -7,6 +7,7 @@ * * See kernel/stop_machine.c */ +#include "sched.h" #ifdef CONFIG_SMP static int diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 72505cd3b60a..0fef6496c4c8 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -2,6 +2,7 @@ /* * (simple wait queues ) implementation: */ +#include "sched.h" void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b958fe48e020..9026d325d0fd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -3,7 +3,9 @@ * Scheduler topology setup/handling methods */ +#include #include +#include "sched.h" DEFINE_MUTEX(sched_domains_mutex); void sched_domains_mutex_lock(void) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 51e38f5f4701..a6f00012c72e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -4,6 +4,7 @@ * * (C) 2004 Nadia Yvette Chambers, Oracle */ +#include "sched.h" void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) { diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index b410b61cec95..1088d3b7012c 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only +#include +#include "sched.h" + /* * The implementation of the wait_bit*() and related waiting APIs: */ From b01f2d9597250e9c4011cb78d8d46287deaa6a69 Mon Sep 17 00:00:00 2001 From: wang wei Date: Thu, 5 Jun 2025 23:29:31 +0800 Subject: [PATCH 02/84] sched/eevdf: Correct the comment in place_entity Correct "l" to "vl_i". 
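For reference, the full derivation that this one-character fix belongs to, restated from the hunk below, writing W = \Sum w_j for the pre-insertion queue weight and placing the new entity at v_i = V - vl_i:

	V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
	   = (W*V + w_i*(V - vl_i)) / (W + w_i)
	   = V - w_i*vl_i / (W + w_i)

so the intermediate step has to carry vl_i rather than a bare l.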
Signed-off-by: wang wei Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250605152931.22804-1-a929244872@163.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a14da5396fb..83157de5b808 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5253,7 +5253,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) * = (W*V + w_i*(V - vl_i)) / (W + w_i) * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) - * = (V*(W + w_i) - w_i*l) / (W + w_i) + * = (V*(W + w_i) - w_i*vl_i) / (W + w_i) * = V - w_i*vl_i / (W + w_i) * * And the actual lag after adding an entity with vl_i is: From 69ab14ee525649a875ca187cad2f05a2bec1ac3c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:42 +0200 Subject: [PATCH 03/84] sched: Clean up and standardize #if/#else/#endif markers in sched/autogroup.[ch] - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-2-mingo@kernel.org --- kernel/sched/autogroup.c | 6 +++--- kernel/sched/autogroup.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index e96a167c3f28..cdea931aae30 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -28,9 +28,9 @@ static void __init sched_autogroup_sysctl_init(void) { register_sysctl_init("kernel", sched_autogroup_sysctls); } -#else +#else /* !CONFIG_SYSCTL: */ #define sched_autogroup_sysctl_init() do { } while (0) -#endif +#endif /* !CONFIG_SYSCTL */ void __init autogroup_init(struct task_struct *init_task) { @@ -111,7 +111,7 @@ static inline struct autogroup *autogroup_create(void) free_rt_sched_group(tg); tg->rt_se = root_task_group.rt_se; tg->rt_rq = root_task_group.rt_rq; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ tg->autogroup = ag; sched_online_group(tg, &root_task_group); diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 5c1796a463af..06c82b2bdfb5 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h @@ -43,7 +43,7 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) extern int autogroup_path(struct task_group *tg, char *buf, int buflen); -#else /* !CONFIG_SCHED_AUTOGROUP */ +#else /* !CONFIG_SCHED_AUTOGROUP: */ static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } @@ -63,6 +63,6 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) return 0; } -#endif /* CONFIG_SCHED_AUTOGROUP */ +#endif /* !CONFIG_SCHED_AUTOGROUP */ #endif /* _KERNEL_SCHED_AUTOGROUP_H */ From bbb1b274e85b739ade5f9bc060a8c4fa00388e29 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:43 +0200 Subject: [PATCH 04/84] sched: Clean up and standardize #if/#else/#endif markers in sched/clock.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-3-mingo@kernel.org --- kernel/sched/clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e62b5510cd2e..f5e6dd6a6b3a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -474,7 +474,7 @@ notrace void sched_clock_idle_wakeup_event(void) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#else /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK: */ void __init sched_clock_init(void) { @@ -492,7 +492,7 @@ notrace u64 sched_clock_cpu(int cpu) return sched_clock(); } -#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ +#endif /* !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * Running clock - returns the time that has elapsed while a guest has been From b7ebb758568b60ae816b0c21df70a67eecb84a71 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:44 +0200 Subject: [PATCH 05/84] sched: Clean up and standardize #if/#else/#endif markers in sched/core.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Apply this simplification: -#if defined(CONFIG_FOO) +#ifdef CONFIG_FOO - Fix whitespace noise. - Use vertical alignment to better visualize nested #ifdef blocks, where appropriate: #ifdef CONFIG_FOO # ifdef CONFIG_BAR ... # endif #endif Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-4-mingo@kernel.org --- kernel/sched/core.c | 186 ++++++++++++++++++++++---------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dce50fa57471..f1ef6d29792c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -481,13 +481,13 @@ void sched_core_put(void) schedule_work(&_work); } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { } static inline void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* need a wrapper since we may need to trace from modules */ EXPORT_TRACEPOINT_SYMBOL(sched_set_state_tp); @@ -667,7 +667,7 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) double_rq_clock_clear_update(rq1, rq2); } -#endif +#endif /* CONFIG_SMP */ /* * __task_rq_lock - lock the rq @p resides on. @@ -899,7 +899,7 @@ void hrtick_start(struct rq *rq, u64 delay) smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -#else +#else /* !CONFIG_SMP: */ /* * Called to set the hrtick timer state. 
* @@ -916,7 +916,7 @@ void hrtick_start(struct rq *rq, u64 delay) HRTIMER_MODE_REL_PINNED_HARD); } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void hrtick_rq_init(struct rq *rq) { @@ -925,7 +925,7 @@ static void hrtick_rq_init(struct rq *rq) #endif hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } -#else /* CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_clear(struct rq *rq) { } @@ -933,7 +933,7 @@ static inline void hrtick_clear(struct rq *rq) static inline void hrtick_rq_init(struct rq *rq) { } -#endif /* CONFIG_SCHED_HRTICK */ +#endif /* !CONFIG_SCHED_HRTICK */ /* * try_cmpxchg based fetch_or() macro so it works for different integer types: @@ -1971,7 +1971,7 @@ undo: sysctl_sched_uclamp_util_min_rt_default = old_min_rt; return result; } -#endif +#endif /* CONFIG_SYSCTL */ static void uclamp_fork(struct task_struct *p) { @@ -2037,13 +2037,13 @@ static void __init init_uclamp(void) } } -#else /* !CONFIG_UCLAMP_TASK */ +#else /* !CONFIG_UCLAMP_TASK: */ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p, int flags) { } static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } -#endif /* CONFIG_UCLAMP_TASK */ +#endif /* !CONFIG_UCLAMP_TASK */ bool sched_task_on_rq(struct task_struct *p) { @@ -3661,7 +3661,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } @@ -3770,7 +3770,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } -#endif +#endif /* CONFIG_SMP */ } /* @@ -3992,14 +3992,14 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { return false; } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { @@ -4335,9 +4335,9 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } -#else +#else /* !CONFIG_SMP: */ cpu = task_cpu(p); -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); } @@ -4599,8 +4599,8 @@ static int sysctl_numa_balancing(const struct ctl_table *table, int write, } return err; } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SCHEDSTATS @@ -4787,7 +4787,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) +#ifdef CONFIG_SMP p->on_cpu = 0; #endif init_task_preempt_count(p); @@ -4978,7 +4978,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, __fire_sched_out_preempt_notifiers(curr, next); } -#else /* !CONFIG_PREEMPT_NOTIFIERS */ +#else /* !CONFIG_PREEMPT_NOTIFIERS: */ static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) { @@ -4990,7 +4990,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, { } -#endif /* CONFIG_PREEMPT_NOTIFIERS */ +#endif /* !CONFIG_PREEMPT_NOTIFIERS */ static inline void prepare_task(struct task_struct *next) { @@ 
-5107,13 +5107,13 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head) } } -#else +#else /* !CONFIG_SMP: */ static inline void __balance_callbacks(struct rq *rq) { } -#endif +#endif /* !CONFIG_SMP */ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) @@ -5527,7 +5527,7 @@ void sched_exec(void) stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } -#endif +#endif /* CONFIG_SMP */ DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); @@ -5835,10 +5835,10 @@ int __init sched_tick_offload_init(void) return 0; } -#else /* !CONFIG_NO_HZ_FULL */ +#else /* !CONFIG_NO_HZ_FULL: */ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } -#endif +#endif /* !CONFIG_NO_HZ_FULL */ #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) @@ -6553,7 +6553,7 @@ static inline void sched_core_cpu_dying(unsigned int cpu) rq->core = rq; } -#else /* !CONFIG_SCHED_CORE */ +#else /* !CONFIG_SCHED_CORE: */ static inline void sched_core_cpu_starting(unsigned int cpu) {} static inline void sched_core_cpu_deactivate(unsigned int cpu) {} @@ -6565,7 +6565,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return __pick_next_task(rq, prev, rf); } -#endif /* CONFIG_SCHED_CORE */ +#endif /* !CONFIG_SCHED_CORE */ /* * Constants for the sched_mode argument of __schedule(). @@ -6992,14 +6992,14 @@ NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_dynamic_enabled -#define preempt_schedule_dynamic_enabled preempt_schedule -#define preempt_schedule_dynamic_disabled NULL -#endif +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# ifndef preempt_schedule_dynamic_enabled +# define preempt_schedule_dynamic_enabled preempt_schedule +# define preempt_schedule_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); void __sched notrace dynamic_preempt_schedule(void) { @@ -7009,8 +7009,8 @@ void __sched notrace dynamic_preempt_schedule(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /** * preempt_schedule_notrace - preempt_schedule called by tracing @@ -7065,14 +7065,14 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#ifndef preempt_schedule_notrace_dynamic_enabled -#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace -#define preempt_schedule_notrace_dynamic_disabled NULL -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# ifndef preempt_schedule_notrace_dynamic_enabled +# define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace +# define preempt_schedule_notrace_dynamic_disabled NULL +# endif DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static 
DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace); void __sched notrace dynamic_preempt_schedule_notrace(void) { @@ -7082,7 +7082,7 @@ void __sched notrace dynamic_preempt_schedule_notrace(void) } NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); -#endif +# endif #endif #endif /* CONFIG_PREEMPTION */ @@ -7301,7 +7301,7 @@ out_unlock: preempt_enable(); } -#endif +#endif /* CONFIG_RT_MUTEXES */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) int __sched __cond_resched(void) @@ -7332,17 +7332,17 @@ EXPORT_SYMBOL(__cond_resched); #endif #ifdef CONFIG_PREEMPT_DYNAMIC -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define cond_resched_dynamic_enabled __cond_resched -#define cond_resched_dynamic_disabled ((void *)&__static_call_return0) +# ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL +# define cond_resched_dynamic_enabled __cond_resched +# define cond_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(cond_resched); -#define might_resched_dynamic_enabled __cond_resched -#define might_resched_dynamic_disabled ((void *)&__static_call_return0) +# define might_resched_dynamic_enabled __cond_resched +# define might_resched_dynamic_disabled ((void *)&__static_call_return0) DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); EXPORT_STATIC_CALL_TRAMP(might_resched); -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched); int __sched dynamic_cond_resched(void) { @@ -7360,8 +7360,8 @@ int __sched dynamic_might_resched(void) return __cond_resched(); } EXPORT_SYMBOL(dynamic_might_resched); -#endif -#endif +# endif +#endif /* CONFIG_PREEMPT_DYNAMIC */ /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -7427,9 +7427,9 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); #ifdef CONFIG_PREEMPT_DYNAMIC -#ifdef CONFIG_GENERIC_ENTRY -#include -#endif +# ifdef CONFIG_GENERIC_ENTRY +# include +# endif /* * SC:cond_resched @@ -7484,37 +7484,37 @@ int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) { -#ifndef CONFIG_PREEMPT_RT +# ifndef CONFIG_PREEMPT_RT if (!strcmp(str, "none")) return preempt_dynamic_none; if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; -#endif +# endif if (!strcmp(str, "full")) return preempt_dynamic_full; -#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY +# ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY if (!strcmp(str, "lazy")) return preempt_dynamic_lazy; -#endif +# endif return -EINVAL; } -#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) -#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) +# define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) -#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) -#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) -#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) -#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) -#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) -#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) -#else -#error "Unsupported PREEMPT_DYNAMIC mechanism" -#endif +# if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) +# define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) +# define 
preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) +# elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +# define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) +# define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) +# else +# error "Unsupported PREEMPT_DYNAMIC mechanism" +# endif static DEFINE_MUTEX(sched_dynamic_mutex); @@ -7618,7 +7618,7 @@ static void __init preempt_dynamic_init(void) } } -#define PREEMPT_MODEL_ACCESSOR(mode) \ +# define PREEMPT_MODEL_ACCESSOR(mode) \ bool preempt_model_##mode(void) \ { \ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \ @@ -8123,7 +8123,7 @@ static void balance_hotplug_wait(void) TASK_UNINTERRUPTIBLE); } -#else +#else /* !CONFIG_HOTPLUG_CPU: */ static inline void balance_push(struct rq *rq) { @@ -8137,7 +8137,7 @@ static inline void balance_hotplug_wait(void) { } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* !CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) { @@ -8446,7 +8446,7 @@ int sched_cpu_dying(unsigned int cpu) sched_core_cpu_dying(cpu); return 0; } -#endif +#endif /* CONFIG_HOTPLUG_CPU */ void __init sched_init_smp(void) { @@ -8480,12 +8480,12 @@ static int __init migration_init(void) } early_initcall(migration_init); -#else +#else /* !CONFIG_SMP: */ void __init sched_init_smp(void) { sched_init_granularity(); } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ int in_sched_functions(unsigned long addr) { @@ -8637,15 +8637,15 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ_COMMON +# ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); -#endif -#ifdef CONFIG_HOTPLUG_CPU +# endif +# ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); -#endif +# endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); @@ -8830,7 +8830,7 @@ void __cant_sleep(const char *file, int line, int preempt_offset) } EXPORT_SYMBOL_GPL(__cant_sleep); -#ifdef CONFIG_SMP +# ifdef CONFIG_SMP void __cant_migrate(const char *file, int line) { static unsigned long prev_jiffy; @@ -8861,8 +8861,8 @@ void __cant_migrate(const char *file, int line) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_migrate); -#endif -#endif +# endif /* CONFIG_SMP */ +#endif /* CONFIG_DEBUG_ATOMIC_SLEEP */ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) @@ -8902,7 +8902,7 @@ void normalize_rt_tasks(void) #endif /* CONFIG_MAGIC_SYSRQ */ -#if defined(CONFIG_KGDB_KDB) +#ifdef CONFIG_KGDB_KDB /* * These functions are only useful for KDB. 
* @@ -8926,7 +8926,7 @@ struct task_struct *curr_task(int cpu) return cpu_curr(cpu); } -#endif /* defined(CONFIG_KGDB_KDB) */ +#endif /* CONFIG_KGDB_KDB */ #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task groups */ @@ -9807,7 +9807,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, scx_group_set_idle(css_tg(css), idle); return ret; } -#endif +#endif /* CONFIG_GROUP_SCHED_WEIGHT */ static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -9935,7 +9935,7 @@ static int cpu_extra_stat_show(struct seq_file *sf, cfs_b->nr_periods, cfs_b->nr_throttled, throttled_usec, cfs_b->nr_burst, burst_usec); } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ return 0; } @@ -10076,7 +10076,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, ret = tg_set_cfs_bandwidth(tg, period, quota, burst); return ret ?: nbytes; } -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ static struct cftype cpu_files[] = { #ifdef CONFIG_GROUP_SCHED_WEIGHT @@ -10112,7 +10112,7 @@ static struct cftype cpu_files[] = { .read_u64 = cpu_cfs_burst_read_u64, .write_u64 = cpu_cfs_burst_write_u64, }, -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_UCLAMP_TASK_GROUP { .name = "uclamp.min", @@ -10126,7 +10126,7 @@ static struct cftype cpu_files[] = { .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, }, -#endif +#endif /* CONFIG_UCLAMP_TASK_GROUP */ { } /* terminate */ }; @@ -10147,7 +10147,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .threaded = true, }; -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ void dump_cpu_task(int cpu) { @@ -10733,7 +10733,7 @@ void sched_mm_cid_fork(struct task_struct *t) WARN_ON_ONCE(!t->mm || t->mm_cid != -1); t->mm_cid_active = 1; } -#endif +#endif /* CONFIG_SCHED_MM_CID */ #ifdef CONFIG_SCHED_CLASS_EXT void sched_deq_and_put_task(struct task_struct *p, int queue_flags, @@ -10768,4 +10768,4 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) if (ctx->running) set_next_task(rq, ctx->p); } -#endif /* CONFIG_SCHED_CLASS_EXT */ +#endif /* CONFIG_SCHED_CLASS_EXT */ From 8bb9b0c5aeb9594bc1039e38d15b14d2a055cf3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:45 +0200 Subject: [PATCH 06/84] sched: Clean up and standardize #if/#else/#endif markers in sched/cpufreq_schedutil.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-5-mingo@kernel.org --- kernel/sched/cpufreq_schedutil.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3d7e9ccb99a0..0ab5f9d4bc59 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -382,9 +382,9 @@ static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) sg_cpu->saved_idle_calls = idle_calls; return ret; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * Make sugov_should_update_freq() ignore the rate limit when DL From 79af17344c2728c58e254219ff47840c67211cd9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:46 +0200 Subject: [PATCH 07/84] sched: Clean up and standardize #if/#else/#endif markers in sched/cpupri.h - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-6-mingo@kernel.org --- kernel/sched/cpupri.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f0f5a734c0c1..24add19625ff 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -29,4 +29,4 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#endif +#endif /* CONFIG_SMP */ From 4aec8669ff3c7ea7ab62c10bb6442c70c0d1b1eb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:47 +0200 Subject: [PATCH 08/84] sched: Clean up and standardize #if/#else/#endif markers in sched/cputime.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-7-mingo@kernel.org --- kernel/sched/cputime.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f01f17acb4a8..7097de2c8cda 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -91,7 +91,7 @@ static u64 irqtime_tick_accounted(u64 maxtime) return delta; } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static u64 irqtime_tick_accounted(u64 dummy) { @@ -244,7 +244,7 @@ void __account_forceidle_time(struct task_struct *p, u64 delta) task_group_account_field(p, CPUTIME_FORCEIDLE, delta); } -#endif +#endif /* CONFIG_SCHED_CORE */ /* * When a guest is interrupted for a longer amount of time, missed clock @@ -265,7 +265,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime) return steal; } -#endif +#endif /* CONFIG_PARAVIRT */ return 0; } @@ -291,7 +291,7 @@ static inline u64 read_sum_exec_runtime(struct task_struct *t) { return t->se.sum_exec_runtime; } -#else +#else /* !CONFIG_64BIT: */ static u64 read_sum_exec_runtime(struct task_struct *t) { u64 ns; @@ -304,7 +304,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) return ns; } -#endif +#endif /* !CONFIG_64BIT */ /* * Accumulate raw cputime values of dead tasks (sig->[us]time) and live @@ -414,11 +414,11 @@ static void irqtime_account_idle_ticks(int ticks) { irqtime_account_process_tick(current, 0, ticks); } -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void irqtime_account_idle_ticks(int ticks) { } static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, int nr_ticks) { } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * Use precise platform statistics if available: From c503c3dc2d49dbca2cec531ca8f1b8e9143c5087 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:48 +0200 Subject: [PATCH 09/84] sched: Clean up and standardize #if/#else/#endif markers in sched/deadline.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-8-mingo@kernel.org --- kernel/sched/deadline.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index ff5be80c5c9d..a2cea68b2198 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -55,7 +55,7 @@ static int __init sched_dl_sysctl_init(void) return 0; } late_initcall(sched_dl_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static bool dl_server(struct sched_dl_entity *dl_se) { @@ -103,7 +103,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return pi_of(dl_se) != dl_se; } -#else +#else /* !CONFIG_RT_MUTEXES: */ static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) { return dl_se; @@ -113,7 +113,7 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) { return false; } -#endif +#endif /* !CONFIG_RT_MUTEXES */ #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) @@ -195,7 +195,7 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) rq->dl.extra_bw += bw; } } -#else +#else /* !CONFIG_SMP: */ static inline struct dl_bw *dl_bw_of(int i) { return &cpu_rq(i)->dl.dl_bw; @@ -223,7 +223,7 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) dl->extra_bw += bw; } -#endif +#endif /* !CONFIG_SMP */ static inline void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) @@ -757,7 +757,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p return later_rq; } -#else +#else /* !CONFIG_SMP: */ static inline void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -786,7 +786,7 @@ static inline void deadline_queue_push_tasks(struct rq *rq) static inline void deadline_queue_pull_task(struct rq *rq) { } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); @@ -1213,7 +1213,7 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) push_dl_task(rq); rq_repin_lock(rq, rf); } -#endif +#endif /* CONFIG_SMP */ } /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ @@ -1360,7 +1360,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * there. 
*/ } -#endif +#endif /* CONFIG_SMP */ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->donor)) @@ -1602,7 +1602,7 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } /* @@ -1885,12 +1885,12 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) } } -#else +#else /* !CONFIG_SMP: */ static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) @@ -2379,11 +2379,11 @@ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { hrtick_start(rq, dl_se->runtime); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { @@ -3125,13 +3125,13 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) resched_curr(rq); } -#else +#else /* !CONFIG_SMP: */ /* * We don't know if p has a earlier or later deadline, so let's blindly * set a (maybe not needed) rescheduling point. */ resched_curr(rq); -#endif +#endif /* !CONFIG_SMP */ } #ifdef CONFIG_SCHED_CORE @@ -3162,7 +3162,7 @@ DEFINE_SCHED_CLASS(dl) = { .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, .find_lock_rq = find_lock_later_rq, -#endif +#endif /* CONFIG_SMP */ .task_tick = task_tick_dl, .task_fork = task_fork_dl, @@ -3574,7 +3574,7 @@ void dl_bw_free(int cpu, u64 dl_bw) { dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } -#endif +#endif /* CONFIG_SMP */ void print_dl_stats(struct seq_file *m, int cpu) { From 29dd6f8cd285360cc5a3e1dc696a960541020705 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:49 +0200 Subject: [PATCH 10/84] sched: Clean up and standardize #if/#else/#endif markers in sched/debug.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-9-mingo@kernel.org --- kernel/sched/debug.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b384124d89d3..748709c03214 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -93,10 +93,10 @@ static void sched_feat_enable(int i) { static_key_enable_cpuslocked(&sched_feat_keys[i]); } -#else +#else /* !CONFIG_JUMP_LABEL: */ static void sched_feat_disable(int i) { }; static void sched_feat_enable(int i) { }; -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ static int sched_feat_set(char *cmp) { @@ -217,7 +217,7 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; -#endif /* SMP */ +#endif /* CONFIG_SMP */ #ifdef CONFIG_PREEMPT_DYNAMIC @@ -314,9 +314,9 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, return result; } -#else -#define sched_verbose_write debugfs_write_file_bool -#endif +#else /* !CONFIG_SMP: */ +# define sched_verbose_write debugfs_write_file_bool +#endif /* !CONFIG_SMP */ static const struct file_operations sched_verbose_fops = { .read = debugfs_read_file_bool, @@ -523,7 +523,7 @@ static __init int sched_init_debug(void) sched_domains_mutex_lock(); update_sched_domain_debugfs(); sched_domains_mutex_unlock(); -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING numa = debugfs_create_dir("numa_balancing", debugfs_sched); @@ -533,7 +533,7 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); -#endif +#endif /* CONFIG_NUMA_BALANCING */ debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); @@ -697,14 +697,14 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->avg.load_avg); P(se->avg.util_avg); P(se->avg.runnable_avg); -#endif +#endif /* CONFIG_SMP */ #undef PN_SCHEDSTAT #undef PN #undef P_SCHEDSTAT #undef P } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED static DEFINE_SPINLOCK(sched_debug_lock); @@ -877,8 +877,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->tg_load_avg_contrib); SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); -#endif -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); @@ -954,9 +954,9 @@ static void print_cpu(struct seq_file *m, int cpu) SEQ_printf(m, "cpu#%d, %u.%03u MHz\n", cpu, freq / 1000, (freq % 1000)); } -#else +#else /* !CONFIG_X86: */ SEQ_printf(m, "cpu#%d\n", cpu); -#endif +#endif /* !CONFIG_X86 */ #define P(x) \ do { \ @@ -984,7 +984,7 @@ do { \ P64(avg_idle); P64(max_idle_balance_cost); #undef P64 -#endif +#endif /* CONFIG_SMP */ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { @@ -1166,7 +1166,7 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m) SEQ_printf(m, 
"current_node=%d, numa_group_id=%d\n", task_node(p), task_numa_group_id(p)); show_numa_stats(p, m); -#endif +#endif /* CONFIG_NUMA_BALANCING */ } void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, @@ -1263,13 +1263,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); PM(se.avg.util_est, ~UTIL_AVG_UNCHANGED); -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); __PS("effective uclamp.min", uclamp_eff_value(p, UCLAMP_MIN)); __PS("effective uclamp.max", uclamp_eff_value(p, UCLAMP_MAX)); -#endif +#endif /* CONFIG_UCLAMP_TASK */ P(policy); P(prio); if (task_has_dl_policy(p)) { From 416d5f78e4d3b010734248cb0aad9dc54b7589fa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:50 +0200 Subject: [PATCH 11/84] sched: Clean up and standardize #if/#else/#endif markers in sched/fair.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-10-mingo@kernel.org --- kernel/sched/fair.c | 111 ++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 55 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 83157de5b808..1fabbe01bf93 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -111,7 +111,7 @@ int __weak arch_asym_cpu_priority(int cpu) * (default: ~5%) */ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH /* @@ -162,7 +162,7 @@ static int __init sched_fair_sysctl_init(void) return 0; } late_initcall(sched_fair_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ static inline void update_load_add(struct load_weight *lw, unsigned long inc) { @@ -471,7 +471,7 @@ static int se_is_idle(struct sched_entity *se) return cfs_rq_is_idle(group_cfs_rq(se)); } -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ #define for_each_sched_entity(se) \ for (; se; se = NULL) @@ -517,7 +517,7 @@ static int se_is_idle(struct sched_entity *se) return task_has_idle_policy(task_of(se)); } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); @@ -1008,7 +1008,7 @@ int sched_update_scaling(void) return 0; } -#endif +#endif /* CONFIG_SMP */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1041,6 +1041,7 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) } #include "pelt.h" + #ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); @@ -1131,7 +1132,7 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ void init_entity_runnable_average(struct sched_entity *se) { } @@ -1141,7 +1142,7 @@ void post_init_entity_util_avg(struct task_struct *p) static void update_tg_load_avg(struct cfs_rq *cfs_rq) { } -#endif /* CONFIG_SMP */ +#endif /* 
!CONFIG_SMP */ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { @@ -2114,12 +2115,12 @@ static inline int numa_idle_core(int idle_core, int cpu) return idle_core; } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline int numa_idle_core(int idle_core, int cpu) { return idle_core; } -#endif +#endif /* !CONFIG_SCHED_SMT */ /* * Gather all necessary information to make NUMA balancing placement @@ -3673,7 +3674,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) p->numa_scan_period = task_scan_start(p); } -#else +#else /* !CONFIG_NUMA_BALANCING: */ + static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } @@ -3690,7 +3692,7 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu) { } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -3785,12 +3787,12 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } -#else +#else /* !CONFIG_SMP: */ static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -#endif +#endif /* !CONFIG_SMP */ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -4000,11 +4002,11 @@ static void update_cfs_group(struct sched_entity *se) reweight_entity(cfs_rq_of(se), se, shares); } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_cfs_group(struct sched_entity *se) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { @@ -4481,7 +4483,7 @@ static inline bool skip_blocked_update(struct sched_entity *se) return true; } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} @@ -4494,7 +4496,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_NO_HZ_COMMON static inline void migrate_se_pelt_lag(struct sched_entity *se) @@ -4575,9 +4577,9 @@ static inline void migrate_se_pelt_lag(struct sched_entity *se) __update_load_avg_blocked_se(now, se); } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static void migrate_se_pelt_lag(struct sched_entity *se) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages @@ -5144,7 +5146,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { @@ -5184,7 +5186,7 @@ util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { @@ -5685,7 +5687,7 @@ void cfs_bandwidth_usage_dec(void) { static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } -#else /* CONFIG_JUMP_LABEL */ +#else /* !CONFIG_JUMP_LABEL: */ 
static bool cfs_bandwidth_used(void) { return true; @@ -5693,7 +5695,7 @@ static bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} -#endif /* CONFIG_JUMP_LABEL */ +#endif /* !CONFIG_JUMP_LABEL */ /* * default period for cfs group bandwidth. @@ -6147,12 +6149,12 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) if (first) smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); } -#else +#else /* !CONFIG_SMP: */ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { unthrottle_cfs_rq(cfs_rq); } -#endif +#endif /* !CONFIG_SMP */ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { @@ -6733,9 +6735,9 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) if (cfs_task_bw_constrained(p)) tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } -#endif +#endif /* CONFIG_NO_HZ_FULL */ -#else /* CONFIG_CFS_BANDWIDTH */ +#else /* !CONFIG_CFS_BANDWIDTH: */ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } @@ -6777,7 +6779,7 @@ bool cfs_task_bw_constrained(struct task_struct *p) return false; } #endif -#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* !CONFIG_CFS_BANDWIDTH */ #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) {} @@ -6822,7 +6824,7 @@ static void hrtick_update(struct rq *rq) hrtick_start_fair(rq, donor); } -#else /* !CONFIG_SCHED_HRTICK */ +#else /* !CONFIG_SCHED_HRTICK: */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -6831,7 +6833,7 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) static inline void hrtick_update(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_HRTICK */ #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) @@ -6875,9 +6877,9 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized(rq->rd, 1); } -#else +#else /* !CONFIG_SMP: */ static inline void check_update_overutilized_status(struct rq *rq) { } -#endif +#endif /* !CONFIG_SMP */ /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) @@ -7677,7 +7679,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t return -1; } -#else /* CONFIG_SCHED_SMT */ +#else /* !CONFIG_SCHED_SMT: */ static inline void set_idle_cores(int cpu, int val) { @@ -7698,7 +7700,7 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd return -1; } -#endif /* CONFIG_SCHED_SMT */ +#endif /* !CONFIG_SCHED_SMT */ /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by @@ -8743,9 +8745,9 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } -#else +#else /* !CONFIG_SMP: */ static inline void set_task_max_allowed_capacity(struct task_struct *p) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { @@ -8939,7 +8941,7 @@ again: return p; simple: -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ put_prev_set_next_task(rq, prev, p); return p; @@ -9357,13 +9359,13 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) return src_weight - dst_weight; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline long 
migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { return 0; } -#endif +#endif /* !CONFIG_NUMA_BALANCING */ /* * Check whether the task is ineligible on the destination cpu @@ -9772,12 +9774,12 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) if (!has_blocked) rq->has_blocked_load = 0; } -#else +#else /* !CONFIG_NO_HZ_COMMON: */ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} -#endif +#endif /* !CONFIG_NO_HZ_COMMON */ static bool __update_blocked_others(struct rq *rq, bool *done) { @@ -9886,7 +9888,7 @@ static unsigned long task_h_load(struct task_struct *p) return div64_ul(p->se.avg.load_avg * cfs_rq->h_load, cfs_rq_load_avg(cfs_rq) + 1); } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -9903,7 +9905,7 @@ static unsigned long task_h_load(struct task_struct *p) { return p->se.avg.load_avg; } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void sched_balance_update_blocked_averages(int cpu) { @@ -10616,7 +10618,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) return remote; return all; } -#else +#else /* !CONFIG_NUMA_BALANCING: */ static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { return all; @@ -10626,7 +10628,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq) { return regular; } -#endif /* CONFIG_NUMA_BALANCING */ +#endif /* !CONFIG_NUMA_BALANCING */ struct sg_lb_stats; @@ -12772,7 +12774,7 @@ static void nohz_newidle_balance(struct rq *this_rq) atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline void nohz_balancer_kick(struct rq *rq) { } static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) @@ -12781,7 +12783,7 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle } static inline void nohz_newidle_balance(struct rq *this_rq) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * sched_balance_newidle is called by schedule() if this_cpu is about to become @@ -13076,10 +13078,10 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, cfs_rqa = sea->cfs_rq; cfs_rqb = seb->cfs_rq; -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ cfs_rqa = &task_rq(a)->cfs; cfs_rqb = &task_rq(b)->cfs; -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ /* * Find delta after normalizing se's vruntime with its cfs_rq's @@ -13103,9 +13105,9 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) #endif return throttled_hierarchy(cfs_rq); } -#else +#else /* !CONFIG_SCHED_CORE: */ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} -#endif +#endif /* !CONFIG_SCHED_CORE */ /* * scheduler tick hitting a task of our scheduling class. 
@@ -13199,9 +13201,9 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) list_add_leaf_cfs_rq(cfs_rq); } } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static void propagate_entity_cfs_rq(struct sched_entity *se) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static void detach_entity_cfs_rq(struct sched_entity *se) { @@ -13737,6 +13739,5 @@ __init void init_sched_fair_class(void) nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -#endif /* SMP */ - +#endif /* CONFIG_SMP */ } From 833840a94f4dfd86ed32f1b6222fe8d9ba1790a9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:51 +0200 Subject: [PATCH 12/84] sched: Clean up and standardize #if/#else/#endif markers in sched/idle.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-11-mingo@kernel.org --- kernel/sched/idle.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index cd3298653ef4..43d6e006cff1 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -52,7 +52,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) return 1; } __setup("hlt", cpu_idle_nopoll_setup); -#endif +#endif /* CONFIG_GENERIC_IDLE_POLL_SETUP */ static noinline int __cpuidle cpu_idle_poll(void) { @@ -100,10 +100,10 @@ static inline void cond_tick_broadcast_exit(void) if (static_branch_unlikely(&arch_needs_tick_broadcast)) tick_broadcast_exit(); } -#else +#else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE: */ static inline void cond_tick_broadcast_enter(void) { } static inline void cond_tick_broadcast_exit(void) { } -#endif +#endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE */ /** * default_idle_call - Default CPU idle routine. @@ -444,7 +444,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return WARN_ON_ONCE(1); } -#endif +#endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: From c215dff7f80caab4a25160f0ded369eba5cb9190 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:52 +0200 Subject: [PATCH 13/84] sched: Clean up and standardize #if/#else/#endif markers in sched/loadavg.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
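For illustration, the template above expanded into a minimal, standalone C program; CONFIG_FOO and foo_enabled() are placeholders, not anything touched by this patch:

	#include <stdio.h>

	#ifdef CONFIG_FOO
	static int foo_enabled(void) { return 1; }
	#else /* !CONFIG_FOO: */
	static int foo_enabled(void) { return 0; }
	#endif /* !CONFIG_FOO */

	int main(void)
	{
		printf("CONFIG_FOO: %d\n", foo_enabled());
		return 0;
	}

The negated comment with a trailing ':' marks where the !CONFIG_FOO branch begins, and the same negated comment without the colon marks where it ends, so a reader deep inside a long block never has to scroll back to the #if to see which configuration the code belongs to.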
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-12-mingo@kernel.org --- kernel/sched/loadavg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index f6df84ca6925..e3a1ce923f9f 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -335,12 +335,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ_COMMON */ +#else /* !CONFIG_NO_HZ_COMMON: */ static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the From 311bb3f7b78e944e831ffb07cb58455b47bf2269 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:53 +0200 Subject: [PATCH 14/84] sched: Clean up and standardize #if/#else/#endif markers in sched/pelt.[ch] - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-13-mingo@kernel.org --- kernel/sched/pelt.c | 4 ++-- kernel/sched/pelt.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 09be6a83e45a..fa83bbaf4f3e 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -414,7 +414,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) return 0; } -#endif +#endif /* CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* @@ -467,7 +467,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) return ret; } -#endif +#endif /* CONFIG_HAVE_SCHED_AVG_IRQ */ /* * Load avg and utiliztion metrics need to be updated periodically and before diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 19592077452e..a5d4933e6b70 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -20,7 +20,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return READ_ONCE(rq->avg_hw.load_avg); } -#else +#else /* !CONFIG_SCHED_HW_PRESSURE: */ static inline int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) { @@ -31,7 +31,7 @@ static inline u64 hw_load_avg(struct rq *rq) { return 0; } -#endif +#endif /* !CONFIG_SCHED_HW_PRESSURE */ #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); @@ -179,15 +179,15 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } -#else +#else /* !CONFIG_CFS_BANDWIDTH: */ static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { return rq_clock_pelt(rq_of(cfs_rq)); } -#endif +#endif /* !CONFIG_CFS_BANDWIDTH */ -#else +#else /* !CONFIG_SMP: */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) @@ -236,6 +236,6 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq) { } static 
inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } -#endif +#endif /* !CONFIG_SMP */ #endif /* _KERNEL_SCHED_PELT_H */ From fd3db705f7496c5d6b56ce8d78570dd1da11cd30 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:54 +0200 Subject: [PATCH 15/84] sched: Clean up and standardize #if/#else/#endif markers in sched/psi.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-14-mingo@kernel.org --- kernel/sched/psi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 333a7ba39668..5837cd8e7b97 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1039,7 +1039,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st psi_schedule_rtpoll_work(group, 1, false); } while ((group = group->parent)); } -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /** * psi_memstall_enter - mark the beginning of a memory stall section @@ -1655,7 +1655,7 @@ static const struct proc_ops psi_irq_proc_ops = { .proc_poll = psi_fop_poll, .proc_release = psi_fop_release, }; -#endif +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ static int __init psi_proc_init(void) { From 3eca109a7885903f1bf07099ae89025069017cc0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:55 +0200 Subject: [PATCH 16/84] sched: Clean up and standardize #if/#else/#endif markers in sched/rt.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
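Many of the rt.c hunks below annotate blocks that pair a real implementation with an empty stub in the other branch, which is where a bare #else hurts readability the most. A compilable sketch of that shape, with invented names rather than actual scheduler code:

	#include <stdio.h>

	struct fake_rq { int cpu; };

	#ifdef CONFIG_SMP
	static void push_overloaded(struct fake_rq *rq)
	{
		/* SMP variant: pretend to move work to another CPU */
		printf("pushing work away from CPU %d\n", rq->cpu);
	}
	#else /* !CONFIG_SMP: */
	static void push_overloaded(struct fake_rq *rq)
	{
		/* UP variant: nothing to push; the empty stub keeps call sites unconditional */
		(void)rq;
	}
	#endif /* !CONFIG_SMP */

	int main(void)
	{
		struct fake_rq rq = { .cpu = 0 };
		push_overloaded(&rq);
		return 0;
	}

With the negated comments in place, a reader landing on the stub sees immediately that it is the !CONFIG_SMP half of the block.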
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-15-mingo@kernel.org --- kernel/sched/rt.c | 54 +++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 16008ac33e14..7e8ed05d302a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -63,7 +63,7 @@ static int __init sched_rt_sysctl_init(void) return 0; } late_initcall(sched_rt_sysctl_init); -#endif +#endif /* CONFIG_SYSCTL */ void init_rt_rq(struct rt_rq *rt_rq) { @@ -294,7 +294,7 @@ err: return 0; } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #define rt_entity_is_task(rt_se) (1) @@ -330,7 +330,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { return 1; } -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP @@ -433,7 +433,7 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) } } -#else +#else /* !CONFIG_SMP: */ static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) { @@ -446,7 +446,7 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) static inline void rt_queue_push_tasks(struct rq *rq) { } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static void enqueue_top_rt_rq(struct rt_rq *rt_rq); static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); @@ -488,12 +488,12 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) return cpu_cap >= min(min_cap, max_cap); } -#else +#else /* !CONFIG_UCLAMP_TASK: */ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) { return true; } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ #ifdef CONFIG_RT_GROUP_SCHED @@ -801,9 +801,9 @@ static void balance_runtime(struct rt_rq *rt_rq) raw_spin_lock(&rt_rq->rt_runtime_lock); } } -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline void balance_runtime(struct rt_rq *rt_rq) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -933,7 +933,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ typedef struct rt_rq *rt_rq_iter_t; @@ -985,7 +985,7 @@ static void __enable_runtime(struct rq *rq) { } static void __disable_runtime(struct rq *rq) { } #endif -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline int rt_se_prio(struct sched_rt_entity *rt_se) { @@ -1036,7 +1036,7 @@ static void update_curr_rt(struct rq *rq) do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } static void @@ -1110,14 +1110,14 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -#else /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} static inline void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED static void @@ -1158,12 +1158,12 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) dec_rt_prio_smp(rt_rq, 
prio, prev_prio); } -#else +#else /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED): */ static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} -#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ +#endif /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED) */ #ifdef CONFIG_RT_GROUP_SCHED @@ -1185,7 +1185,7 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); } -#else /* CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) @@ -1195,7 +1195,7 @@ inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static inline void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) @@ -1685,7 +1685,7 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) */ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); -#endif +#endif /* CONFIG_SMP */ } static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) @@ -2512,11 +2512,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) */ if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); -#else +#else /* !CONFIG_SMP: */ /* For UP simply resched on drop of prio */ if (oldprio < p->prio) resched_curr(rq); -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ } else { /* * This task is not running, but if it is @@ -2552,9 +2552,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) } } } -#else +#else /* !CONFIG_POSIX_TIMERS: */ static inline void watchdog(struct rq *rq, struct task_struct *p) { } -#endif +#endif /* !CONFIG_POSIX_TIMERS */ /* * scheduler tick hitting a task of our scheduling class. @@ -2623,7 +2623,7 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) return rt_rq_throttled(rt_rq); } -#endif +#endif /* CONFIG_SCHED_CORE */ DEFINE_SCHED_CLASS(rt) = { @@ -2646,7 +2646,7 @@ DEFINE_SCHED_CLASS(rt) = { .task_woken = task_woken_rt, .switched_from = switched_from_rt, .find_lock_rq = find_lock_lowest_rq, -#endif +#endif /* !CONFIG_SMP */ .task_tick = task_tick_rt, @@ -2890,7 +2890,7 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) return 1; } -#else /* !CONFIG_RT_GROUP_SCHED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) @@ -2898,7 +2898,7 @@ static int sched_rt_global_constraints(void) return 0; } #endif /* CONFIG_SYSCTL */ -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) From fdccd0c79280da0a332f1370d2ad17192d563c95 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:56 +0200 Subject: [PATCH 17/84] sched: Clean up and standardize #if/#else/#endif markers in sched/sched.h - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. 
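sched.h also carries conditionals on compound expressions and nested options, and the same style extends to them by negating the whole condition as written. A small standalone illustration (CONFIG_FOO, CONFIG_BAR and CONFIG_BAZ are placeholders, not options touched here):

	#include <stdio.h>

	#if defined(CONFIG_FOO) && defined(CONFIG_BAR)
	static const char *mode(void) { return "foo+bar"; }
	#else /* !(CONFIG_FOO && CONFIG_BAR): */
	static const char *mode(void) { return "fallback"; }
	#endif /* !(CONFIG_FOO && CONFIG_BAR) */

	#ifdef CONFIG_FOO
	# ifdef CONFIG_BAZ
	static const int baz_default = 1;
	# else /* !CONFIG_BAZ: */
	static const int baz_default = 0;
	# endif /* !CONFIG_BAZ */
	#else /* !CONFIG_FOO: */
	static const int baz_default = -1;
	#endif /* !CONFIG_FOO */

	int main(void)
	{
		printf("%s %d\n", mode(), baz_default);
		return 0;
	}

Because the #else/#endif comments mirror the expression on the #if line verbatim, only negated, the three markers of one block can still be matched up even when the block is nested inside another conditional.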
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-16-mingo@kernel.org --- kernel/sched/sched.h | 82 +++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f3a41487c96d..2bf804b8c89b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -425,7 +425,7 @@ struct cfs_bandwidth { int nr_burst; u64 throttled_time; u64 burst_time; -#endif +#endif /* CONFIG_CFS_BANDWIDTH */ }; /* Task group related information */ @@ -443,15 +443,15 @@ struct task_group { /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; -#ifdef CONFIG_SMP +#ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; -#endif -#endif +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity **rt_se; @@ -532,7 +532,7 @@ extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline void free_fair_sched_group(struct task_group *tg) { } static inline int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { @@ -540,7 +540,7 @@ static inline int alloc_fair_sched_group(struct task_group *tg, struct task_grou } static inline void online_fair_sched_group(struct task_group *tg) { } static inline void unregister_fair_sched_group(struct task_group *tg) { } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -577,22 +577,22 @@ extern int sched_group_set_idle(struct task_group *tg, long idle); #ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { } -#endif /* CONFIG_SMP */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_SMP */ +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* !CONFIG_FAIR_GROUP_SCHED */ -#else /* CONFIG_CGROUP_SCHED */ +#else /* !CONFIG_CGROUP_SCHED: */ struct cfs_bandwidth { }; static inline bool cfs_task_bw_constrained(struct task_struct *p) { return false; } -#endif /* CONFIG_CGROUP_SCHED */ +#endif /* !CONFIG_CGROUP_SCHED */ extern void unregister_rt_sched_group(struct task_group *tg); extern void free_rt_sched_group(struct task_group *tg); @@ -860,9 +860,9 @@ struct dl_rq { * of the leftmost (earliest deadline) element. 
*/ struct rb_root_cached pushable_dl_tasks_root; -#else +#else /* !CONFIG_SMP: */ struct dl_bw dl_bw; -#endif +#endif /* !CONFIG_SMP */ /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a @@ -1009,7 +1009,7 @@ struct root_domain { /* These atomics are updated outside of a lock */ atomic_t rto_loop_next; atomic_t rto_loop_start; -#endif +#endif /* HAVE_RT_PUSH_IPI */ /* * The "RT overload" flag: it gets set if a CPU has more than * one runnable RT task. @@ -1295,7 +1295,7 @@ struct rq { unsigned int core_forceidle_seq; unsigned int core_forceidle_occupation; u64 core_forceidle_start; -#endif +#endif /* CONFIG_SCHED_CORE */ /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; @@ -1314,13 +1314,13 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) return cfs_rq->rq; } -#else +#else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { return container_of(cfs_rq, struct rq, cfs); } -#endif +#endif /* !CONFIG_FAIR_GROUP_SCHED */ static inline int cpu_of(struct rq *rq) { @@ -1501,6 +1501,7 @@ static inline bool sched_group_cookie_match(struct rq *rq, } #endif /* !CONFIG_SCHED_CORE */ + #ifdef CONFIG_RT_GROUP_SCHED # ifdef CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED DECLARE_STATIC_KEY_FALSE(rt_group_sched); @@ -1508,16 +1509,16 @@ static inline bool rt_group_sched_enabled(void) { return static_branch_unlikely(&rt_group_sched); } -# else +# else /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED: */ DECLARE_STATIC_KEY_TRUE(rt_group_sched); static inline bool rt_group_sched_enabled(void) { return static_branch_likely(&rt_group_sched); } -# endif /* CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ -#else +# endif /* !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED */ +#else /* !CONFIG_RT_GROUP_SCHED: */ # define rt_group_sched_enabled() false -#endif /* CONFIG_RT_GROUP_SCHED */ +#endif /* !CONFIG_RT_GROUP_SCHED */ static inline void lockdep_assert_rq_held(struct rq *rq) { @@ -1575,9 +1576,9 @@ static inline void update_idle_core(struct rq *rq) __update_idle_core(rq); } -#else +#else /* !CONFIG_SCHED_SMT: */ static inline void update_idle_core(struct rq *rq) { } -#endif +#endif /* !CONFIG_SCHED_SMT */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1758,7 +1759,7 @@ static inline void scx_rq_clock_invalidate(struct rq *rq) WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID); } -#else /* !CONFIG_SCHED_CLASS_EXT */ +#else /* !CONFIG_SCHED_CLASS_EXT: */ #define scx_enabled() false #define scx_switched_all() false @@ -2175,7 +2176,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) tg = &root_task_group; p->rt.rt_rq = tg->rt_rq[cpu]; p->rt.parent = tg->rt_se[cpu]; -#endif +#endif /* CONFIG_RT_GROUP_SCHED */ } #else /* !CONFIG_CGROUP_SCHED: */ @@ -2201,7 +2202,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) smp_wmb(); WRITE_ONCE(task_thread_info(p)->cpu, cpu); p->wake_cpu = cpu; -#endif +#endif /* CONFIG_SMP */ } /* @@ -2430,7 +2431,7 @@ struct sched_class { void (*rq_offline)(struct rq *rq); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); -#endif +#endif /* CONFIG_SMP */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); @@ -2955,7 +2956,7 @@ static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) /* * __sched_core_flip() relies on SMT having cpu-id lock order. 
*/ -#endif +#endif /* CONFIG_SCHED_CORE */ return rq1->cpu < rq2->cpu; } @@ -3146,6 +3147,7 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); extern void resched_latency_warn(int cpu, u64 latency); + #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); extern void @@ -3255,14 +3257,14 @@ static inline u64 irq_time_read(int cpu) return total; } -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline int irqtime_enabled(void) { return 0; } -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ #ifdef CONFIG_CPU_FREQ @@ -3356,9 +3358,9 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } -#else /* !CONFIG_SMP */ +#else /* !CONFIG_SMP: */ static inline bool update_other_load_avgs(struct rq *rq) { return false; } -#endif /* CONFIG_SMP */ +#endif /* !CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK @@ -3536,13 +3538,13 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } -#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ #define perf_domain_span(pd) NULL static inline bool sched_energy_enabled(void) { return false; } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #ifdef CONFIG_MEMBARRIER @@ -3568,7 +3570,7 @@ static inline void membarrier_switch_mm(struct rq *rq, WRITE_ONCE(rq->membarrier_state, membarrier_state); } -#else /* !CONFIG_MEMBARRIER :*/ +#else /* !CONFIG_MEMBARRIER: */ static inline void membarrier_switch_mm(struct rq *rq, struct mm_struct *prev_mm, @@ -3589,7 +3591,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } -#endif +#endif /* CONFIG_SMP */ extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); @@ -3909,7 +3911,7 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) return false; } -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_RT_MUTEXES @@ -3953,7 +3955,7 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SMP extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#else +#else /* !CONFIG_SMP: */ static inline struct balance_callback *splice_balance_callbacks(struct rq *rq) { @@ -3964,7 +3966,7 @@ static inline void balance_callbacks(struct rq *rq, struct balance_callback *hea { } -#endif +#endif /* !CONFIG_SMP */ #ifdef CONFIG_SCHED_CLASS_EXT /* From 91433cd6e46828044a64520dd1dce817be41940e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:57 +0200 Subject: [PATCH 18/84] sched: Clean up and standardize #if/#else/#endif markers in sched/stats.[ch] - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-17-mingo@kernel.org --- kernel/sched/stats.c | 2 +- kernel/sched/stats.h | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 1faea875d0b3..86acd374fc4d 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -164,7 +164,7 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_move_balance); } rcu_read_unlock(); -#endif +#endif /* CONFIG_SMP */ } return 0; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 452826df6ae1..26f3fd4d34ce 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -112,10 +112,10 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev); -#else +#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /*CONFIG_IRQ_TIME_ACCOUNTING */ +#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -220,7 +220,7 @@ static inline void psi_sched_switch(struct task_struct *prev, psi_task_switch(prev, next, sleep); } -#else /* CONFIG_PSI */ +#else /* !CONFIG_PSI: */ static inline void psi_enqueue(struct task_struct *p, bool migrate) {} static inline void psi_dequeue(struct task_struct *p, bool migrate) {} static inline void psi_ttwu_dequeue(struct task_struct *p) {} @@ -229,7 +229,7 @@ static inline void psi_sched_switch(struct task_struct *prev, bool sleep) {} static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) {} -#endif /* CONFIG_PSI */ +#endif /* !CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO /* @@ -334,6 +334,6 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n # define sched_info_enqueue(rq, t) do { } while (0) # define sched_info_dequeue(rq, t) do { } while (0) # define sched_info_switch(rq, t, next) do { } while (0) -#endif /* CONFIG_SCHED_INFO */ +#endif /* !CONFIG_SCHED_INFO */ #endif /* _KERNEL_STATS_H */ From 23d27e2cfbee69d85b867a427c3021234bcf09ab Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:58 +0200 Subject: [PATCH 19/84] sched: Clean up and standardize #if/#else/#endif markers in sched/syscalls.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... 
#endif /* !CONFIG_FOO */ Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-18-mingo@kernel.org --- kernel/sched/syscalls.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 547c1f05b667..5cb5e9487f0d 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -174,7 +174,7 @@ SYSCALL_DEFINE1(nice, int, increment) return 0; } -#endif +#endif /* __ARCH_WANT_SYS_NICE */ /** * task_prio - return the priority value of a given task. @@ -255,8 +255,7 @@ int sched_core_idle_cpu(int cpu) return idle_cpu(cpu); } - -#endif +#endif /* CONFIG_SCHED_CORE */ /** * find_process_by_pid - find a process with a matching PID value. @@ -448,7 +447,7 @@ static inline int uclamp_validate(struct task_struct *p, } static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } -#endif +#endif /* !CONFIG_UCLAMP_TASK */ /* * Allow unprivileged RT tasks to decrease priority. @@ -658,7 +657,7 @@ change: goto unlock; } } -#endif +#endif /* CONFIG_SMP */ } /* Re-check policy now with rq lock held: */ From f1c6b957f7f4091d822b93ca2833e83bf728e001 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:08:59 +0200 Subject: [PATCH 20/84] sched: Clean up and standardize #if/#else/#endif markers in sched/topology.c - Use the standard #ifdef marker format for larger blocks, where appropriate: #if CONFIG_FOO ... #else /* !CONFIG_FOO: */ ... #endif /* !CONFIG_FOO */ - Fix whitespace noise and other inconsistencies. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-19-mingo@kernel.org --- kernel/sched/topology.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9026d325d0fd..2352caf4f942 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -315,7 +315,7 @@ static int __init sched_energy_aware_sysctl_init(void) } late_initcall(sched_energy_aware_sysctl_init); -#endif +#endif /* CONFIG_PROC_SYSCTL */ static void free_pd(struct perf_domain *pd) { @@ -451,9 +451,9 @@ free: return false; } -#else +#else /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL): */ static void free_pd(struct perf_domain *pd) { } -#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ +#endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ static void free_rootdomain(struct rcu_head *rcu) { @@ -1600,7 +1600,7 @@ static int sched_domains_curr_level; int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; -#endif +#endif /* CONFIG_NUMA */ /* * SD_flags allowed in topology descriptions. 
@@ -1716,7 +1716,7 @@ sd_init(struct sched_domain_topology_level *tl, SD_WAKE_AFFINE); } -#endif +#endif /* CONFIG_NUMA */ } else { sd->cache_nice_tries = 1; } From 5202c25dd17c54cd4c21f266d9a51b644d7cd682 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:00 +0200 Subject: [PATCH 21/84] sched/smp: Always define sched_domains_mutex_lock()/unlock(), def_root_domain and sched_domains_mutex Simplify the scheduler by making CONFIG_SMP=y primitives and data structures unconditional. Unconditionally build kernel/sched/topology.c and the main sched-domains locking primitives. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-20-mingo@kernel.org --- include/linux/sched.h | 5 ----- kernel/sched/build_utility.c | 3 ++- kernel/sched/topology.c | 4 ++++ 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4f78a64beb52..aa54d75034ea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -395,15 +395,10 @@ enum uclamp_id { UCLAMP_CNT }; -#ifdef CONFIG_SMP extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); -#else -static inline void sched_domains_mutex_lock(void) { } -static inline void sched_domains_mutex_unlock(void) { } -#endif struct sched_param { int sched_priority; diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index bf9d8db94b70..5c485b2dfb95 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -83,9 +83,10 @@ #ifdef CONFIG_SMP # include "cpupri.c" # include "stop_task.c" -# include "topology.c" #endif +#include "topology.c" + #ifdef CONFIG_SCHED_CORE # include "core_sched.c" #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 2352caf4f942..ee347d9c5df4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -17,6 +17,8 @@ void sched_domains_mutex_unlock(void) mutex_unlock(&sched_domains_mutex); } +#ifdef CONFIG_SMP + /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; @@ -2842,3 +2844,5 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); sched_domains_mutex_unlock(); } + +#endif /* CONFIG_SMP */ From cac5cefbade90ff0bb0b393d301fa3b5234cf056 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:01 +0200 Subject: [PATCH 22/84] sched/smp: Make SMP unconditional Simplify the scheduler by making CONFIG_SMP=y primitives and data structures unconditional. Introduce transitory wrappers for functionality not yet converted to SMP. Note that this patch is pretty large, because there's no clear separation between various aspects of the SMP scheduler, it's basically a huge block of #ifdef CONFIG_SMP. A fair amount of it has to be switched on for it to boot and work on UP systems. 
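As a rough sketch of what such a transitory wrapper can look like (invented names; the real hunks below use functions such as __ttwu_queue_wakelist()): a function that used to be compiled only under CONFIG_SMP is now always built, and only the call that genuinely needs SMP infrastructure keeps a small inner #ifdef until it is converted as well.

	#include <stdio.h>

	#ifdef CONFIG_SMP
	/* Stand-in for a cross-CPU primitive that only exists on SMP builds. */
	static void queue_remote_call(int cpu)
	{
		printf("IPI queued for CPU %d\n", cpu);
	}
	#endif

	/* Built unconditionally; UP builds simply skip the remote part. */
	static void queue_wakeup(int cpu)
	{
		printf("wakeup recorded for CPU %d\n", cpu);
	#ifdef CONFIG_SMP
		queue_remote_call(cpu);
	#endif
	}

	int main(void)
	{
		queue_wakeup(0);
		return 0;
	}

On a UP build the wrapper degenerates to the local bookkeeping, which is why many of the !CONFIG_SMP stubs in the hunks below can simply be removed rather than replaced.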
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-21-mingo@kernel.org --- include/linux/preempt.h | 9 --- include/linux/sched.h | 42 ------------ include/linux/sched/deadline.h | 4 -- include/linux/sched/idle.h | 4 -- include/linux/sched/nohz.h | 4 +- include/linux/sched/topology.h | 32 --------- kernel/sched/build_policy.c | 6 +- kernel/sched/build_utility.c | 6 +- kernel/sched/core.c | 106 ++++------------------------- kernel/sched/cpudeadline.h | 2 - kernel/sched/cpupri.h | 2 - kernel/sched/deadline.c | 95 -------------------------- kernel/sched/debug.c | 12 ---- kernel/sched/fair.c | 115 ------------------------------- kernel/sched/pelt.h | 52 -------------- kernel/sched/rt.c | 6 +- kernel/sched/sched.h | 121 +-------------------------------- kernel/sched/syscalls.c | 2 - kernel/sched/topology.c | 10 +-- 19 files changed, 31 insertions(+), 599 deletions(-) diff --git a/include/linux/preempt.h b/include/linux/preempt.h index b0af8d4ef6e6..1fad1c8a4c76 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -369,8 +369,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif -#ifdef CONFIG_SMP - /* * Migrate-Disable and why it is undesired. * @@ -429,13 +427,6 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, extern void migrate_disable(void); extern void migrate_enable(void); -#else - -static inline void migrate_disable(void) { } -static inline void migrate_enable(void) { } - -#endif /* CONFIG_SMP */ - /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section * diff --git a/include/linux/sched.h b/include/linux/sched.h index aa54d75034ea..376befdec4b0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -599,7 +599,6 @@ struct sched_entity { unsigned long runnable_weight; #endif -#ifdef CONFIG_SMP /* * Per entity load average tracking. * @@ -607,7 +606,6 @@ struct sched_entity { * collide with read-mostly values above. 
*/ struct sched_avg avg; -#endif }; struct sched_rt_entity { @@ -837,7 +835,6 @@ struct task_struct { struct alloc_tag *alloc_tag; #endif -#ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; @@ -853,7 +850,6 @@ struct task_struct { */ int recent_used_cpu; int wake_cpu; -#endif int on_rq; int prio; @@ -912,9 +908,7 @@ struct task_struct { cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; -#ifdef CONFIG_SMP unsigned short migration_disabled; -#endif unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU @@ -946,10 +940,8 @@ struct task_struct { struct sched_info sched_info; struct list_head tasks; -#ifdef CONFIG_SMP struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; -#endif struct mm_struct *mm; struct mm_struct *active_mm; @@ -1843,7 +1835,6 @@ extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpu extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); -#ifdef CONFIG_SMP /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); @@ -1861,33 +1852,6 @@ extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); -#else -static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ -} -static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -{ - /* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */ - if ((*cpumask_bits(new_mask) & 1) == 0) - return -EINVAL; - return 0; -} -static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) -{ - if (src->user_cpus_ptr) - return -EINVAL; - return 0; -} -static inline void release_user_cpus_ptr(struct task_struct *p) -{ - WARN_ON(p->user_cpus_ptr); -} - -static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) -{ - return 0; -} -#endif extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); @@ -1976,11 +1940,7 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); -#ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); -#else -static inline void kick_process(struct task_struct *tsk) { } -#endif extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ @@ -2225,7 +2185,6 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_SMP static inline bool owner_on_cpu(struct task_struct *owner) { /* @@ -2237,7 +2196,6 @@ static inline bool owner_on_cpu(struct task_struct *owner) /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); -#endif /* CONFIG_SMP */ #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index f9aabbc9d22e..c40115d4e34d 100644 --- a/include/linux/sched/deadline.h +++ 
b/include/linux/sched/deadline.h @@ -29,15 +29,11 @@ static inline bool dl_time_before(u64 a, u64 b) return (s64)(a - b) < 0; } -#ifdef CONFIG_SMP - struct root_domain; extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); extern void dl_clear_root_domain_cpu(int cpu); -#endif /* CONFIG_SMP */ - extern u64 dl_cookie; extern bool dl_bw_visited(int cpu, u64 cookie); diff --git a/include/linux/sched/idle.h b/include/linux/sched/idle.h index 439f6029d3b9..8465ff1f20d1 100644 --- a/include/linux/sched/idle.h +++ b/include/linux/sched/idle.h @@ -11,11 +11,7 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; -#ifdef CONFIG_SMP extern void wake_up_if_idle(int cpu); -#else -static inline void wake_up_if_idle(int cpu) { } -#endif /* * Idle thread specific functions to determine the need_resched diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 6d67e9a5af6b..0db7f67935fe 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -6,7 +6,7 @@ * This is the interface between the scheduler and nohz/dynticks: */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_balance_enter_idle(int cpu); extern int get_nohz_timer_target(void); #else @@ -23,7 +23,7 @@ static inline void calc_load_nohz_remote(struct rq *rq) { } static inline void calc_load_nohz_stop(void) { } #endif /* CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) +#ifdef CONFIG_NO_HZ_COMMON extern void wake_up_nohz_cpu(int cpu); #else static inline void wake_up_nohz_cpu(int cpu) { } diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 198bb5cc1774..e54e7fa76ba6 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -9,7 +9,6 @@ /* * sched-domains (multiprocessor balancing) declarations: */ -#ifdef CONFIG_SMP /* Generate SD flag indexes */ #define SD_FLAG(name, mflags) __##name, @@ -200,37 +199,6 @@ extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); # define SD_INIT_NAME(type) .name = #type -#else /* CONFIG_SMP */ - -struct sched_domain_attr; - -static inline void -partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - struct sched_domain_attr *dattr_new) -{ -} - -static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) -{ - return true; -} - -static inline bool cpus_share_cache(int this_cpu, int that_cpu) -{ - return true; -} - -static inline bool cpus_share_resources(int this_cpu, int that_cpu) -{ - return true; -} - -static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) -{ -} - -#endif /* !CONFIG_SMP */ - #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern void rebuild_sched_domains_energy(void); #else diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index 72d97aa8b726..c4a488e67aa7 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -50,11 +50,9 @@ #include "idle.c" #include "rt.c" +#include "cpudeadline.c" -#ifdef CONFIG_SMP -# include "cpudeadline.c" -# include "pelt.c" -#endif +#include "pelt.c" #include "cputime.c" #include "deadline.c" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 5c485b2dfb95..e2cf3b08d4e9 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -80,10 +80,8 @@ #include "wait_bit.c" #include "wait.c" -#ifdef CONFIG_SMP -# include "cpupri.c" -# include "stop_task.c" -#endif 
+#include "cpupri.c" +#include "stop_task.c" #include "topology.c" diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1ef6d29792c..9fc44f4b779a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -650,7 +650,6 @@ void raw_spin_rq_unlock(struct rq *rq) raw_spin_unlock(rq_lockp(rq)); } -#ifdef CONFIG_SMP /* * double_rq_lock - safely lock two runqueues */ @@ -667,7 +666,6 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) double_rq_clock_clear_update(rq1, rq2); } -#endif /* CONFIG_SMP */ /* * __task_rq_lock - lock the rq @p resides on. @@ -949,7 +947,7 @@ static inline void hrtick_rq_init(struct rq *rq) _val; \ }) -#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +#ifdef TIF_POLLING_NRFLAG /* * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, * this avoids any races wrt polling state changes and thereby avoids @@ -988,13 +986,11 @@ static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) return true; } -#ifdef CONFIG_SMP static inline bool set_nr_if_polling(struct task_struct *p) { return false; } #endif -#endif static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) { @@ -1167,7 +1163,6 @@ void resched_cpu(int cpu) raw_spin_rq_unlock_irqrestore(rq, flags); } -#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy CPU for migrating timers @@ -1374,10 +1369,8 @@ bool sched_can_stop_tick(struct rq *rq) return true; } #endif /* CONFIG_NO_HZ_FULL */ -#endif /* CONFIG_SMP */ -#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ - (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) /* * Iterate task_group tree rooted at *from, calling @down when first entering a * node and @up when leaving it for the final time. 
@@ -2353,8 +2346,6 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state return ncsw; } -#ifdef CONFIG_SMP - static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); @@ -3305,6 +3296,8 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) WARN_ON_ONCE(ret); } +#ifdef CONFIG_SMP + void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { unsigned int state = READ_ONCE(p->__state); @@ -3358,6 +3351,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) @@ -3661,17 +3655,6 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else /* !CONFIG_SMP: */ - -static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } - -static inline bool rq_has_pinned_tasks(struct rq *rq) -{ - return false; -} - -#endif /* !CONFIG_SMP */ - static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -3682,7 +3665,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) rq = this_rq(); -#ifdef CONFIG_SMP if (cpu == rq->cpu) { __schedstat_inc(rq->ttwu_local); __schedstat_inc(p->stats.nr_wakeups_local); @@ -3702,7 +3684,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) if (wake_flags & WF_MIGRATED) __schedstat_inc(p->stats.nr_wakeups_migrate); -#endif /* CONFIG_SMP */ __schedstat_inc(rq->ttwu_count); __schedstat_inc(p->stats.nr_wakeups); @@ -3731,13 +3712,11 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (p->sched_contributes_to_load) rq->nr_uninterruptible--; -#ifdef CONFIG_SMP if (wake_flags & WF_RQ_SELECTED) en_flags |= ENQUEUE_RQ_SELECTED; if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; else -#endif if (p->in_iowait) { delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); @@ -3748,7 +3727,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, ttwu_do_wakeup(p); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Our task @p is fully woken up and running; so it's safe to @@ -3770,7 +3748,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, rq->idle_stamp = 0; } -#endif /* CONFIG_SMP */ } /* @@ -3824,7 +3801,6 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) return ret; } -#ifdef CONFIG_SMP void sched_ttwu_pending(void *arg) { struct llist_node *llist = arg; @@ -3891,7 +3867,9 @@ static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); WRITE_ONCE(rq->ttwu_pending, 1); +#ifdef CONFIG_SMP __smp_call_single_queue(cpu, &p->wake_entry.llist); +#endif } void wake_up_if_idle(int cpu) @@ -3992,15 +3970,6 @@ static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) return false; } -#else /* !CONFIG_SMP: */ - -static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -{ - return false; -} - -#endif /* !CONFIG_SMP */ - static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); @@ -4533,10 +4502,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->capture_control = NULL; #endif init_numa_balancing(clone_flags, p); -#ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; -#endif init_sched_mm_cid(p); } @@ -4787,14 +4754,11 @@ int sched_fork(unsigned long clone_flags, struct 
task_struct *p) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#ifdef CONFIG_SMP p->on_cpu = 0; -#endif init_task_preempt_count(p); -#ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); -#endif + return 0; } @@ -4871,7 +4835,6 @@ void wake_up_new_task(struct task_struct *p) raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); -#ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_ptr can change in the fork path @@ -4883,7 +4846,6 @@ void wake_up_new_task(struct task_struct *p) p->recent_used_cpu = task_cpu(p); rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); -#endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); post_init_entity_util_avg(p); @@ -4994,7 +4956,6 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, static inline void prepare_task(struct task_struct *next) { -#ifdef CONFIG_SMP /* * Claim the task as running, we do this before switching to it * such that any running task will have this set. @@ -5003,12 +4964,10 @@ static inline void prepare_task(struct task_struct *next) * its ordering comment. */ WRITE_ONCE(next->on_cpu, 1); -#endif } static inline void finish_task(struct task_struct *prev) { -#ifdef CONFIG_SMP /* * This must be the very last reference to @prev from this CPU. After * p->on_cpu is cleared, the task can be moved to a different CPU. We @@ -5021,11 +4980,8 @@ static inline void finish_task(struct task_struct *prev) * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); -#endif } -#ifdef CONFIG_SMP - static void do_balance_callbacks(struct rq *rq, struct balance_callback *head) { void (*func)(struct rq *rq); @@ -5107,14 +5063,6 @@ void balance_callbacks(struct rq *rq, struct balance_callback *head) } } -#else /* !CONFIG_SMP: */ - -static inline void __balance_callbacks(struct rq *rq) -{ -} - -#endif /* !CONFIG_SMP */ - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -5563,7 +5511,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) struct rq *rq; u64 ns; -#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) +#ifdef CONFIG_64BIT /* * 64-bit doesn't need locks to atomically read a 64-bit value. * So we have a optimization chance when the task's delta_exec is 0. @@ -5690,12 +5638,10 @@ void sched_tick(void) if (donor->flags & PF_WQ_WORKER) wq_worker_tick(donor); -#ifdef CONFIG_SMP if (!scx_switched_all()) { rq->idle_balance = idle_cpu(cpu); sched_balance_trigger(rq); } -#endif } #ifdef CONFIG_NO_HZ_FULL @@ -7819,12 +7765,10 @@ void show_state_filter(unsigned int state_filter) */ void __init init_idle(struct task_struct *idle, int cpu) { -#ifdef CONFIG_SMP struct affinity_context ac = (struct affinity_context) { .new_mask = cpumask_of(cpu), .flags = 0, }; -#endif struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -7840,13 +7784,11 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); -#ifdef CONFIG_SMP /* * No validation and serialization required at boot time and for * setting up the idle tasks of not yet online CPUs. 
*/ set_cpus_allowed_common(idle, &ac); -#endif /* * We're having a chicken and egg problem, even though we are * holding rq->lock, the CPU isn't yet set to this CPU so the @@ -7865,9 +7807,7 @@ void __init init_idle(struct task_struct *idle, int cpu) rq_set_donor(rq, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; -#ifdef CONFIG_SMP idle->on_cpu = 1; -#endif raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); @@ -7880,13 +7820,9 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); -#ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -#endif } -#ifdef CONFIG_SMP - int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -8480,13 +8416,6 @@ static int __init migration_init(void) } early_initcall(migration_init); -#else /* !CONFIG_SMP: */ -void __init sched_init_smp(void) -{ - sched_init_granularity(); -} -#endif /* !CONFIG_SMP */ - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -8512,9 +8441,7 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ -#ifdef CONFIG_SMP BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -#endif BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); @@ -8557,9 +8484,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_SMP init_defrootdomain(); -#endif #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, @@ -8620,7 +8545,6 @@ void __init sched_init(void) rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif -#ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = SCHED_CAPACITY_SCALE; @@ -8637,16 +8561,15 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -# ifdef CONFIG_NO_HZ_COMMON +#ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); -# endif -# ifdef CONFIG_HOTPLUG_CPU +#endif +#ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); -# endif -#endif /* CONFIG_SMP */ +#endif hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); fair_server_init(rq); @@ -8696,8 +8619,9 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); - balance_push_set(smp_processor_id(), false); #endif + + balance_push_set(smp_processor_id(), false); init_sched_fair_class(); init_sched_ext_class(); diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 3f7c73d1d189..11c0f1faa7e1 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -17,7 +17,6 @@ struct cpudl { struct cpudl_item *elements; }; -#ifdef CONFIG_SMP int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); void cpudl_clear(struct cpudl *cp, int cpu); @@ -25,4 +24,3 @@ int cpudl_init(struct cpudl *cp); void cpudl_set_freecpu(struct cpudl *cp, int cpu); void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); -#endif /* CONFIG_SMP */ diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 24add19625ff..6f562088c056 100644 --- a/kernel/sched/cpupri.h +++ 
b/kernel/sched/cpupri.h @@ -20,7 +20,6 @@ struct cpupri { int *cpu_to_pri; }; -#ifdef CONFIG_SMP int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, @@ -29,4 +28,3 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, void cpupri_set(struct cpupri *cp, int cpu, int pri); int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); -#endif /* CONFIG_SMP */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a2cea68b2198..bf9b70a3ff95 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -115,7 +115,6 @@ static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) } #endif /* !CONFIG_RT_MUTEXES */ -#ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -195,35 +194,6 @@ void __dl_update(struct dl_bw *dl_b, s64 bw) rq->dl.extra_bw += bw; } } -#else /* !CONFIG_SMP: */ -static inline struct dl_bw *dl_bw_of(int i) -{ - return &cpu_rq(i)->dl.dl_bw; -} - -static inline int dl_bw_cpus(int i) -{ - return 1; -} - -static inline unsigned long dl_bw_capacity(int i) -{ - return SCHED_CAPACITY_SCALE; -} - -bool dl_bw_visited(int cpu, u64 cookie) -{ - return false; -} - -static inline -void __dl_update(struct dl_bw *dl_b, s64 bw) -{ - struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw); - - dl->extra_bw += bw; -} -#endif /* !CONFIG_SMP */ static inline void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus) @@ -556,23 +526,17 @@ void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->root = RB_ROOT_CACHED; -#ifdef CONFIG_SMP /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; -#else - init_dl_bw(&dl_rq->dl_bw); -#endif dl_rq->running_bw = 0; dl_rq->this_bw = 0; init_dl_rq_bw_ratio(dl_rq); } -#ifdef CONFIG_SMP - static inline int dl_overloaded(struct rq *rq) { return atomic_read(&rq->rd->dlo_count); @@ -757,37 +721,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p return later_rq; } -#else /* !CONFIG_SMP: */ - -static inline -void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline -void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline -void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ -} - -static inline void deadline_queue_push_tasks(struct rq *rq) -{ -} - -static inline void deadline_queue_pull_task(struct rq *rq) -{ -} -#endif /* !CONFIG_SMP */ - static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags); static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); @@ -1199,7 +1132,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) static void __push_dl_task(struct rq *rq, struct rq_flags *rf) { -#ifdef CONFIG_SMP /* * Queueing this task back might have overloaded rq, check if we need * to kick someone away. 
@@ -1213,7 +1145,6 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) push_dl_task(rq); rq_repin_lock(rq, rf); } -#endif /* CONFIG_SMP */ } /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ @@ -1343,7 +1274,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) goto unlock; } -#ifdef CONFIG_SMP if (unlikely(!rq->online)) { /* * If the runqueue is no longer available, migrate the @@ -1360,7 +1290,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * there. */ } -#endif /* CONFIG_SMP */ enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->donor)) @@ -1848,8 +1777,6 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se) #define __node_2_dle(node) \ rb_entry((node), struct sched_dl_entity, rb_node) -#ifdef CONFIG_SMP - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); @@ -1885,13 +1812,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) } } -#else /* !CONFIG_SMP: */ - -static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} -static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {} - -#endif /* !CONFIG_SMP */ - static inline void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { @@ -2218,8 +2138,6 @@ static void yield_task_dl(struct rq *rq) rq_clock_skip_update(rq); } -#ifdef CONFIG_SMP - static inline bool dl_task_is_earliest_deadline(struct task_struct *p, struct rq *rq) { @@ -2349,7 +2267,6 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Only called when both the current and waking task are -deadline @@ -2363,7 +2280,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, return; } -#ifdef CONFIG_SMP /* * In the unlikely case current and p have the same deadline * let us try to decide what's the best thing to do... 
@@ -2371,7 +2287,6 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, if ((p->dl.deadline == rq->donor->dl.deadline) && !test_tsk_need_resched(rq->curr)) check_preempt_equal_dl(rq, p); -#endif /* CONFIG_SMP */ } #ifdef CONFIG_SCHED_HRTICK @@ -2504,8 +2419,6 @@ static void task_fork_dl(struct task_struct *p) */ } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define DL_MAX_TRIES 3 @@ -2999,8 +2912,6 @@ void dl_clear_root_domain_cpu(int cpu) dl_clear_root_domain(cpu_rq(cpu)->rd); } -#endif /* CONFIG_SMP */ - static void switched_from_dl(struct rq *rq, struct task_struct *p) { /* @@ -3073,10 +2984,8 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } if (rq->donor != p) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) deadline_queue_push_tasks(rq); -#endif if (dl_task(rq->donor)) wakeup_preempt_dl(rq, p, 0); else @@ -3153,7 +3062,6 @@ DEFINE_SCHED_CLASS(dl) = { .put_prev_task = put_prev_task_dl, .set_next_task = set_next_task_dl, -#ifdef CONFIG_SMP .balance = balance_dl, .select_task_rq = select_task_rq_dl, .migrate_task_rq = migrate_task_rq_dl, @@ -3162,7 +3070,6 @@ DEFINE_SCHED_CLASS(dl) = { .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, .find_lock_rq = find_lock_later_rq, -#endif /* CONFIG_SMP */ .task_tick = task_tick_dl, .task_fork = task_fork_dl, @@ -3462,7 +3369,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) return false; } -#ifdef CONFIG_SMP int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { @@ -3574,7 +3480,6 @@ void dl_bw_free(int cpu, u64 dl_bw) { dl_bw_manage(dl_bw_req_free, cpu, dl_bw); } -#endif /* CONFIG_SMP */ void print_dl_stats(struct seq_file *m, int cpu) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 748709c03214..04c0354f05e4 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -286,7 +286,6 @@ static const struct file_operations sched_dynamic_fops = { __read_mostly bool sched_debug_verbose; -#ifdef CONFIG_SMP static struct dentry *sd_dentry; @@ -314,9 +313,6 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf, return result; } -#else /* !CONFIG_SMP: */ -# define sched_verbose_write debugfs_write_file_bool -#endif /* !CONFIG_SMP */ static const struct file_operations sched_verbose_fops = { .read = debugfs_read_file_bool, @@ -543,8 +539,6 @@ static __init int sched_init_debug(void) } late_initcall(sched_init_debug); -#ifdef CONFIG_SMP - static cpumask_var_t sd_sysctl_cpus; static int sd_flags_show(struct seq_file *m, void *v) @@ -655,8 +649,6 @@ void dirty_sched_domain_sysctl(int cpu) __cpumask_set_cpu(cpu, sd_sysctl_cpus); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -932,11 +924,7 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) SEQ_printf(m, " .%-30s: %lu\n", #x, (unsigned long)(dl_rq->x)) PU(dl_nr_running); -#ifdef CONFIG_SMP dl_bw = &cpu_rq(cpu)->rd->dl_bw; -#else - dl_bw = &dl_rq->dl_bw; -#endif SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1fabbe01bf93..6b17d3da034a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -88,7 +88,6 @@ static int __init setup_sched_thermal_decay_shift(char *str) } __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift); -#ifdef 
CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. */ @@ -111,7 +110,6 @@ int __weak arch_asym_cpu_priority(int cpu) * (default: ~5%) */ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078) -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH /* @@ -996,7 +994,6 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) /************************************************************** * Scheduling class statistics methods: */ -#ifdef CONFIG_SMP int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); @@ -1008,7 +1005,6 @@ int sched_update_scaling(void) return 0; } -#endif /* CONFIG_SMP */ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); @@ -1042,8 +1038,6 @@ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) #include "pelt.h" -#ifdef CONFIG_SMP - static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); static unsigned long capacity_of(int cpu); @@ -1132,18 +1126,6 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -#else /* !CONFIG_SMP: */ -void init_entity_runnable_average(struct sched_entity *se) -{ -} -void post_init_entity_util_avg(struct task_struct *p) -{ -} -static void update_tg_load_avg(struct cfs_rq *cfs_rq) -{ -} -#endif /* !CONFIG_SMP */ - static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { u64 now = rq_clock_task(rq); @@ -3698,14 +3680,12 @@ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { struct rq *rq = rq_of(cfs_rq); account_numa_enqueue(rq, task_of(se)); list_add(&se->group_node, &rq->cfs_tasks); } -#endif cfs_rq->nr_queued++; } @@ -3713,12 +3693,10 @@ static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); -#ifdef CONFIG_SMP if (entity_is_task(se)) { account_numa_dequeue(rq_of(cfs_rq), task_of(se)); list_del_init(&se->group_node); } -#endif cfs_rq->nr_queued--; } @@ -3770,7 +3748,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) *ptr -= min_t(typeof(*ptr), *ptr, _val); \ } while (0) -#ifdef CONFIG_SMP static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -3787,12 +3764,6 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum, cfs_rq->avg.load_avg * PELT_MIN_DIVIDER); } -#else /* !CONFIG_SMP: */ -static inline void -enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -static inline void -dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } -#endif /* !CONFIG_SMP */ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); @@ -3824,13 +3795,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, update_load_set(&se->load, weight); -#ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); } while (0); -#endif enqueue_load_avg(cfs_rq, se); if (se->on_rq) { @@ -3865,7 +3834,6 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP /* * All this does is approximate the hierarchical proportion 
which includes that * global sum we all love to hate. @@ -3972,7 +3940,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) */ return clamp_t(long, shares, MIN_SHARES, tg_shares); } -#endif /* CONFIG_SMP */ /* * Recomputes the group entity based on the current state of its group @@ -3993,11 +3960,7 @@ static void update_cfs_group(struct sched_entity *se) if (throttled_hierarchy(gcfs_rq)) return; -#ifndef CONFIG_SMP - shares = READ_ONCE(gcfs_rq->tg->shares); -#else shares = calc_group_shares(gcfs_rq); -#endif if (unlikely(se->load.weight != shares)) reweight_entity(cfs_rq_of(se), se, shares); } @@ -4031,7 +3994,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) } } -#ifdef CONFIG_SMP static inline bool load_avg_is_decayed(struct sched_avg *sa) { if (sa->load_sum) @@ -5146,48 +5108,6 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } -#else /* !CONFIG_SMP: */ - -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - return !cfs_rq->nr_queued; -} - -#define UPDATE_TG 0x0 -#define SKIP_AGE_LOAD 0x0 -#define DO_ATTACH 0x0 -#define DO_DETACH 0x0 - -static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) -{ - cfs_rq_util_change(cfs_rq, 0); -} - -static inline void remove_entity_load_avg(struct sched_entity *se) {} - -static inline void -attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline void -detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} - -static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) -{ - return 0; -} - -static inline void -util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} - -static inline void -util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} -static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} - -#endif /* !CONFIG_SMP */ - void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) { struct sched_entity *se = &p->se; @@ -6090,7 +6010,6 @@ unthrottle_throttle: resched_curr(rq); } -#ifdef CONFIG_SMP static void __cfsb_csd_unthrottle(void *arg) { struct cfs_rq *cursor, *tmp; @@ -6149,12 +6068,6 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) if (first) smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); } -#else /* !CONFIG_SMP: */ -static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) -{ - unthrottle_cfs_rq(cfs_rq); -} -#endif /* !CONFIG_SMP */ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { @@ -6610,7 +6523,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) * guaranteed at this point that no additional cfs_rq of this group can * join a CSD list. 
*/ -#ifdef CONFIG_SMP for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); unsigned long flags; @@ -6622,7 +6534,6 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) __cfsb_csd_unthrottle(rq); local_irq_restore(flags); } -#endif } /* @@ -6835,7 +6746,6 @@ static inline void hrtick_update(struct rq *rq) } #endif /* !CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { unsigned long rq_util_min, rq_util_max; @@ -6877,9 +6787,6 @@ static inline void check_update_overutilized_status(struct rq *rq) if (!is_rd_overutilized(rq->rd) && cpu_overutilized(rq->cpu)) set_rd_overutilized(rq->rd, 1); } -#else /* !CONFIG_SMP: */ -static inline void check_update_overutilized_status(struct rq *rq) { } -#endif /* !CONFIG_SMP */ /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) @@ -6888,12 +6795,10 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } -#ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { return sched_idle_rq(cpu_rq(cpu)); } -#endif static void requeue_delayed_entity(struct sched_entity *se) @@ -7208,8 +7113,6 @@ static inline unsigned int cfs_h_nr_delayed(struct rq *rq) return (rq->cfs.h_nr_queued - rq->cfs.h_nr_runnable); } -#ifdef CONFIG_SMP - /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); @@ -8745,9 +8648,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return sched_balance_newidle(rq, rf) != 0; } -#else /* !CONFIG_SMP: */ -static inline void set_task_max_allowed_capacity(struct task_struct *p) {} -#endif /* !CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { @@ -9057,7 +8957,6 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) return true; } -#ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods. * @@ -12980,8 +12879,6 @@ static void rq_offline_fair(struct rq *rq) clear_tg_offline_cfs_rqs(rq); } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_SCHED_CORE static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) @@ -13209,7 +13106,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_SMP /* * In case the task sched_avg hasn't been attached: * - A forked task which hasn't been woken up by wake_up_new_task(). 
@@ -13218,7 +13114,6 @@ static void detach_entity_cfs_rq(struct sched_entity *se) */ if (!se->avg.last_update_time) return; -#endif /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); @@ -13282,7 +13177,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs { struct sched_entity *se = &p->se; -#ifdef CONFIG_SMP if (task_on_rq_queued(p)) { /* * Move the next running task to the front of the list, so our @@ -13290,7 +13184,6 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs */ list_move(&se->group_node, &rq->cfs_tasks); } -#endif if (!first) return; @@ -13328,9 +13221,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifdef CONFIG_SMP raw_spin_lock_init(&cfs_rq->removed.lock); -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13345,10 +13236,8 @@ static void task_change_group_fair(struct task_struct *p) detach_task_cfs_rq(p); -#ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; -#endif set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } @@ -13644,7 +13533,6 @@ DEFINE_SCHED_CLASS(fair) = { .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, -#ifdef CONFIG_SMP .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13654,7 +13542,6 @@ DEFINE_SCHED_CLASS(fair) = { .task_dead = task_dead_fair, .set_cpus_allowed = set_cpus_allowed_fair, -#endif .task_tick = task_tick_fair, .task_fork = task_fork_fair, @@ -13717,7 +13604,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { -#ifdef CONFIG_SMP int i; for_each_possible_cpu(i) { @@ -13739,5 +13625,4 @@ __init void init_sched_fair_class(void) nohz.next_blocked = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); #endif -#endif /* CONFIG_SMP */ } diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index a5d4933e6b70..62c3fa543c0f 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -3,7 +3,6 @@ #define _KERNEL_SCHED_PELT_H #include "sched.h" -#ifdef CONFIG_SMP #include "sched-pelt.h" int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); @@ -187,55 +186,4 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) } #endif /* !CONFIG_CFS_BANDWIDTH */ -#else /* !CONFIG_SMP: */ - -static inline int -update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) -{ - return 0; -} - -static inline int -update_rt_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_dl_rq_load_avg(u64 now, struct rq *rq, int running) -{ - return 0; -} - -static inline int -update_hw_load_avg(u64 now, struct rq *rq, u64 capacity) -{ - return 0; -} - -static inline u64 hw_load_avg(struct rq *rq) -{ - return 0; -} - -static inline int -update_irq_load_avg(struct rq *rq, u64 running) -{ - return 0; -} - -static inline u64 rq_clock_pelt(struct rq *rq) -{ - return rq_clock_task(rq); -} - -static inline void -update_rq_clock_pelt(struct rq *rq, s64 delta) { } - -static inline void -update_idle_rq_clock_pelt(struct rq *rq) { } - -static inline void update_idle_cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { } -#endif /* !CONFIG_SMP */ - #endif /* _KERNEL_SCHED_PELT_H */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7e8ed05d302a..ab211706b160 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2454,7 +2454,11 @@ void __init 
init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } -#endif /* CONFIG_SMP */ +#else /* !CONFIG_SMP: */ +void __init init_sched_rt_class(void) +{ +} +#endif /* !CONFIG_SMP */ /* * When switching a task to RT, we may overload the runqueue diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2bf804b8c89b..7a7ebc2a3675 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -443,14 +443,12 @@ struct task_group { /* runqueue "owned" by this group on each CPU */ struct cfs_rq **cfs_rq; unsigned long shares; -#ifdef CONFIG_SMP /* * load_avg can be heavily contended at clock tick time, so put * it in its own cache-line separated from the fields above which * will also be accessed at each tick. */ atomic_long_t load_avg ____cacheline_aligned; -#endif /* CONFIG_SMP */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -574,13 +572,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern int sched_group_set_idle(struct task_group *tg, long idle); -#ifdef CONFIG_SMP extern void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next); -#else /* !CONFIG_SMP: */ -static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } -#endif /* !CONFIG_SMP */ #else /* !CONFIG_FAIR_GROUP_SCHED: */ static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) { return 0; } static inline int sched_group_set_idle(struct task_group *tg, long idle) { return 0; } @@ -668,7 +661,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SMP /* * CFS load tracking */ @@ -700,7 +692,6 @@ struct cfs_rq { u64 last_h_load_update; struct sched_entity *h_load_next; #endif /* CONFIG_FAIR_GROUP_SCHED */ -#endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ @@ -797,14 +788,10 @@ struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; unsigned int rr_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP int next; /* next highest */ -#endif } highest_prio; -#endif #ifdef CONFIG_SMP bool overloaded; struct plist_head pushable_tasks; @@ -840,7 +827,6 @@ struct dl_rq { unsigned int dl_nr_running; -#ifdef CONFIG_SMP /* * Deadline values of the currently executing and the * earliest ready task on this rq. Caching these facilitates @@ -860,9 +846,7 @@ struct dl_rq { * of the leftmost (earliest deadline) element. */ struct rb_root_cached pushable_dl_tasks_root; -#else /* !CONFIG_SMP: */ - struct dl_bw dl_bw; -#endif /* !CONFIG_SMP */ + /* * "Active utilization" for this runqueue: increased when a * task wakes up (becomes TASK_RUNNING) and decreased when a @@ -933,7 +917,6 @@ static inline long se_runnable(struct sched_entity *se) #endif /* !CONFIG_FAIR_GROUP_SCHED */ -#ifdef CONFIG_SMP /* * XXX we want to get rid of these helpers and use the full load resolution. 
*/ @@ -1044,7 +1027,6 @@ static inline void set_rd_overloaded(struct root_domain *rd, int status) #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK /* @@ -1108,18 +1090,14 @@ struct rq { unsigned int numa_migrate_on; #endif #ifdef CONFIG_NO_HZ_COMMON -#ifdef CONFIG_SMP unsigned long last_blocked_load_update_tick; unsigned int has_blocked_load; call_single_data_t nohz_csd; -#endif /* CONFIG_SMP */ unsigned int nohz_tick_stopped; atomic_t nohz_flags; #endif /* CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_SMP unsigned int ttwu_pending; -#endif u64 nr_switches; #ifdef CONFIG_UCLAMP_TASK @@ -1184,7 +1162,6 @@ struct rq { int membarrier_state; #endif -#ifdef CONFIG_SMP struct root_domain *rd; struct sched_domain __rcu *sd; @@ -1225,7 +1202,6 @@ struct rq { #ifdef CONFIG_HOTPLUG_CPU struct rcuwait hotplug_wait; #endif -#endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -1272,9 +1248,7 @@ struct rq { struct cpuidle_state *idle_state; #endif -#ifdef CONFIG_SMP unsigned int nr_pinned; -#endif unsigned int push_busy; struct cpu_stop_work push_work; @@ -1300,7 +1274,7 @@ struct rq { /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; -#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) +#ifdef CONFIG_CFS_BANDWIDTH call_single_data_t cfsb_csd; struct list_head cfsb_csd_list; #endif @@ -1963,8 +1937,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #endif /* !CONFIG_NUMA_BALANCING */ -#ifdef CONFIG_SMP - static inline void queue_balance_callback(struct rq *rq, struct balance_callback *head, @@ -2130,8 +2102,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p) return p->user_cpus_ptr; } -#endif /* CONFIG_SMP */ - #ifdef CONFIG_CGROUP_SCHED /* @@ -2418,7 +2388,6 @@ struct sched_class { void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); -#ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); @@ -2431,7 +2400,6 @@ struct sched_class { void (*rq_offline)(struct rq *rq); struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); -#endif /* CONFIG_SMP */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); void (*task_fork)(struct task_struct *p); @@ -2583,8 +2551,6 @@ extern struct task_struct *pick_task_idle(struct rq *rq); #define SCA_MIGRATE_ENABLE 0x04 #define SCA_USER 0x08 -#ifdef CONFIG_SMP - extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void sched_balance_trigger(struct rq *rq); @@ -2636,26 +2602,6 @@ static inline struct task_struct *get_push_task(struct rq *rq) extern int push_cpu_stop(void *arg); -#else /* !CONFIG_SMP: */ - -static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) -{ - return true; -} - -static inline int __set_cpus_allowed_ptr(struct task_struct *p, - struct affinity_context *ctx) -{ - return set_cpus_allowed_ptr(p, ctx->new_mask); -} - -static inline cpumask_t *alloc_user_cpus_ptr(int node) -{ - return NULL; -} - -#endif /* !CONFIG_SMP */ - #ifdef CONFIG_CPU_IDLE static inline void idle_set_state(struct rq *rq, @@ -2932,8 +2878,6 @@ static inline class_##name##_t class_##name##_constructor(type *lock, type *lock { class_##name##_t _t = { .lock = lock, .lock2 = lock2 }, *_T = &_t; \ _lock; return _t; } -#ifdef 
CONFIG_SMP - static inline bool rq_order_less(struct rq *rq1, struct rq *rq2) { #ifdef CONFIG_SCHED_CORE @@ -3093,42 +3037,6 @@ extern void set_rq_offline(struct rq *rq); extern bool sched_smp_initialized; -#else /* !CONFIG_SMP: */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - WARN_ON_ONCE(!irqs_disabled()); - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_lock(rq1); - __acquire(rq2->lock); /* Fake it out ;) */ - double_rq_clock_clear_update(rq1, rq2); -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - WARN_ON_ONCE(rq1 != rq2); - raw_spin_rq_unlock(rq1); - __release(rq2->lock); -} - -#endif /* !CONFIG_SMP */ - DEFINE_LOCK_GUARD_2(double_rq_lock, struct rq, double_rq_lock(_T->lock, _T->lock2), double_rq_unlock(_T->lock, _T->lock2)) @@ -3187,7 +3095,7 @@ extern void nohz_balance_exit_idle(struct rq *rq); static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif /* !CONFIG_NO_HZ_COMMON */ -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON extern void nohz_run_idle_balance(int cpu); #else static inline void nohz_run_idle_balance(int cpu) { } @@ -3313,8 +3221,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { } # define arch_scale_freq_invariant() false #endif -#ifdef CONFIG_SMP - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max); @@ -3358,10 +3264,6 @@ static inline unsigned long cpu_util_rt(struct rq *rq) return READ_ONCE(rq->avg_rt.util_avg); } -#else /* !CONFIG_SMP: */ -static inline bool update_other_load_avgs(struct rq *rq) { return false; } -#endif /* !CONFIG_SMP */ - #ifdef CONFIG_UCLAMP_TASK unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id); @@ -3580,7 +3482,6 @@ static inline void membarrier_switch_mm(struct rq *rq, #endif /* !CONFIG_MEMBARRIER */ -#ifdef CONFIG_SMP static inline bool is_per_cpu_kthread(struct task_struct *p) { if (!(p->flags & PF_KTHREAD)) @@ -3591,7 +3492,6 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) return true; } -#endif /* CONFIG_SMP */ extern void swake_up_all_locked(struct swait_queue_head *q); extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); @@ -3890,7 +3790,6 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } extern u64 avg_vruntime(struct cfs_rq *cfs_rq); extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); -#ifdef CONFIG_SMP static inline void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) { @@ -3911,7 +3810,6 @@ bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) return false; } -#endif /* CONFIG_SMP */ #ifdef CONFIG_RT_MUTEXES @@ -3952,21 +3850,8 @@ extern void check_class_changed(struct rq *rq, struct task_struct *p, const struct sched_class *prev_class, int oldprio); -#ifdef CONFIG_SMP extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#else /* !CONFIG_SMP: */ - -static 
inline struct balance_callback *splice_balance_callbacks(struct rq *rq) -{ - return NULL; -} - -static inline void balance_callbacks(struct rq *rq, struct balance_callback *head) -{ -} - -#endif /* !CONFIG_SMP */ #ifdef CONFIG_SCHED_CLASS_EXT /* diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 5cb5e9487f0d..d7fccf871c7d 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -1119,7 +1119,6 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, return copy_struct_to_user(uattr, usize, &kattr, sizeof(kattr), NULL); } -#ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { /* @@ -1148,7 +1147,6 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) return 0; } -#endif /* CONFIG_SMP */ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index ee347d9c5df4..f2c10167f2bc 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -17,8 +17,6 @@ void sched_domains_mutex_unlock(void) mutex_unlock(&sched_domains_mutex); } -#ifdef CONFIG_SMP - /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; @@ -1322,11 +1320,10 @@ next: update_group_capacity(sd, cpu); } -#ifdef CONFIG_SMP - /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) { +#ifdef CONFIG_SMP int asym_prefer_cpu = cpu; struct sched_domain *sd; @@ -1376,9 +1373,8 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); } -} - #endif /* CONFIG_SMP */ +} /* * Set of available CPUs grouped by their corresponding capacities @@ -2844,5 +2840,3 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); sched_domains_mutex_unlock(); } - -#endif /* CONFIG_SMP */ From 06ddd17521bf11a3e7f59dafdf5c148f29467d2c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:02 +0200 Subject: [PATCH 23/84] sched/smp: Always define is_percpu_thread() and scheduler_ipi() Simplify the scheduler by making the CONFIG_SMP=y primitives of is_percpu_thread() and scheduler_ipi() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-22-mingo@kernel.org --- include/linux/sched.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 376befdec4b0..eec6b225e9d1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1765,12 +1765,8 @@ extern struct pid *cad_pid; static __always_inline bool is_percpu_thread(void) { -#ifdef CONFIG_SMP return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); -#else - return true; -#endif } /* Per-process atomic flags. 
*/ @@ -1967,7 +1963,6 @@ extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec buf; \ }) -#ifdef CONFIG_SMP static __always_inline void scheduler_ipi(void) { /* @@ -1977,9 +1972,6 @@ static __always_inline void scheduler_ipi(void) */ preempt_fold_need_resched(); } -#else -static inline void scheduler_ipi(void) { } -#endif extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); From a1416303d108befb4e2b62c863ae1defe4eb1ba3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:03 +0200 Subject: [PATCH 24/84] sched/smp: Always define rq->hrtick_csd Simplify the scheduler by making CONFIG_SMP=y data structure of rq->hrtick_csd unconditional. Adjust hrtick_start() accordingly, which was split due to the ::hrtick_csd asymmetry and use the SMP version there too. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-23-mingo@kernel.org --- kernel/sched/core.c | 23 ----------------------- kernel/sched/sched.h | 2 -- 2 files changed, 25 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9fc44f4b779a..c155ee0bd7a8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -851,8 +851,6 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) return HRTIMER_NORESTART; } -#ifdef CONFIG_SMP - static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; @@ -897,30 +895,9 @@ void hrtick_start(struct rq *rq, u64 delay) smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); } -#else /* !CONFIG_SMP: */ -/* - * Called to set the hrtick timer state. - * - * called with rq->lock held and IRQs disabled - */ -void hrtick_start(struct rq *rq, u64 delay) -{ - /* - * Don't schedule slices shorter than 10000ns, that just - * doesn't make sense. Rely on vruntime for fairness. - */ - delay = max_t(u64, delay, 10000LL); - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_HARD); -} - -#endif /* !CONFIG_SMP */ - static void hrtick_rq_init(struct rq *rq) { -#ifdef CONFIG_SMP INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); -#endif hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); } #else /* !CONFIG_SCHED_HRTICK: */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7a7ebc2a3675..3e7151ed5e5e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1219,9 +1219,7 @@ struct rq { long calc_load_active; #ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP call_single_data_t hrtick_csd; -#endif struct hrtimer hrtick_timer; ktime_t hrtick_time; #endif From d0a0a055a58617ac8dd9418f08346e94310f1096 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:04 +0200 Subject: [PATCH 25/84] sched/smp: Use the SMP version of try_to_wake_up() Simplify the scheduler by making CONFIG_SMP=y logic within try_to_wake_up() unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-24-mingo@kernel.org --- kernel/sched/core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c155ee0bd7a8..d20fea6a380e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4202,7 +4202,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) break; -#ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be * possible to, falsely, observe p->on_cpu == 0. @@ -4281,9 +4280,6 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) psi_ttwu_dequeue(p); set_task_cpu(p, cpu); } -#else /* !CONFIG_SMP: */ - cpu = task_cpu(p); -#endif /* !CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); } From 8039addbe5a56e4108b1bea57aa5b3f70750fed9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:05 +0200 Subject: [PATCH 26/84] sched/smp: Use the SMP version of __task_needs_rq_lock() Simplify the scheduler by making CONFIG_SMP=y code in __task_needs_rq_lock() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-25-mingo@kernel.org --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d20fea6a380e..82b7cdb353d8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4312,14 +4312,12 @@ static bool __task_needs_rq_lock(struct task_struct *p) if (p->on_rq) return true; -#ifdef CONFIG_SMP /* * Ensure the task has finished __schedule() and will not be referenced * anymore. Again, see try_to_wake_up() for a longer comment. */ smp_rmb(); smp_cond_load_acquire(&p->on_cpu, !VAL); -#endif return false; } From 588467616c88b12657f6ebe7306d830c272fe054 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:06 +0200 Subject: [PATCH 27/84] sched/smp: Use the SMP version of wake_up_new_task() Simplify the scheduler by making CONFIG_SMP=y code in wake_up_new_task() unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-26-mingo@kernel.org --- kernel/sched/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 82b7cdb353d8..c108b5c2e115 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4824,7 +4824,6 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, wake_flags); -#ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* * Nothing relies on rq->lock after this, so it's fine to @@ -4834,7 +4833,6 @@ void wake_up_new_task(struct task_struct *p) p->sched_class->task_woken(rq, p); rq_repin_lock(rq, &rf); } -#endif task_rq_unlock(rq, p, &rf); } From 1f25730e5a780b33f78e3ea23e64d3f75e0b2042 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:07 +0200 Subject: [PATCH 28/84] sched/smp: Use the SMP version of sched_exec() Simplify the scheduler by making CONFIG_SMP=y sched_exec() code unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-27-mingo@kernel.org --- include/linux/sched/task.h | 4 ---- kernel/sched/core.c | 4 ---- 2 files changed, 8 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ca1db4b92c32..c517dbc242f7 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -109,11 +109,7 @@ int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ -#ifdef CONFIG_SMP extern void sched_exec(void); -#else -#define sched_exec() {} -#endif static inline struct task_struct *get_task_struct(struct task_struct *t) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c108b5c2e115..fa89006e05e9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5419,8 +5419,6 @@ unsigned int nr_iowait(void) return sum; } -#ifdef CONFIG_SMP - /* * sched_exec - execve() is a valuable balancing opportunity, because at * this point the task has the smallest effective memory and cache footprint. @@ -5444,8 +5442,6 @@ void sched_exec(void) stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); } -#endif /* CONFIG_SMP */ - DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); From 74063c1755ca990440a9c61c91bb7a873a40deeb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:08 +0200 Subject: [PATCH 29/84] sched/smp: Use the SMP version of idle_thread_set_boot_cpu() Simplify the scheduler by making the CONFIG_SMP=y version of idle_thread_set_boot_cpu() unconditional. Note that idle_thread_set_boot_cpu() is already conditional on CONFIG_GENERIC_SMP_IDLE_THREAD, which most architectures select unconditionally on both UP and SMP kernels.
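For reference, the guard this relies on looks roughly like the sketch below (modelled on the kernel/smpboot.h pattern, not quoted verbatim): when an architecture does not select CONFIG_GENERIC_SMP_IDLE_THREAD, the helpers are empty inline stubs, so the now-unconditional call in sched_init() compiles away on such configurations.

#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
void idle_thread_set_boot_cpu(void);
void idle_threads_init(void);
#else
/* No generic idle-thread bookkeeping: the calls become no-ops. */
static inline void idle_thread_set_boot_cpu(void) { }
static inline void idle_threads_init(void) { }
#endif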
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-28-mingo@kernel.org --- kernel/sched/core.c | 2 -- kernel/smpboot.c | 4 ---- 2 files changed, 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa89006e05e9..a03c3c1d7f50 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8582,9 +8582,7 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; -#ifdef CONFIG_SMP idle_thread_set_boot_cpu(); -#endif balance_push_set(smp_processor_id(), false); init_sched_fair_class(); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1992b62e980b..4503b60ce9bd 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -18,8 +18,6 @@ #include "smpboot.h" -#ifdef CONFIG_SMP - #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD /* * For the hotplug case we keep the task structs around and reuse @@ -76,8 +74,6 @@ void __init idle_threads_init(void) } #endif -#endif /* #ifdef CONFIG_SMP */ - static LIST_HEAD(hotplug_threads); static DEFINE_MUTEX(smpboot_threads_lock); From 15125a229abc2404a264ce493e64a9ffa7850f6e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:09 +0200 Subject: [PATCH 30/84] sched/smp: Use the SMP version of the RT scheduling class Simplify the scheduler by making CONFIG_SMP=y primitives and data structures unconditional in the RT policies scheduler. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-29-mingo@kernel.org --- kernel/sched/rt.c | 72 -------------------------------------------- kernel/sched/sched.h | 2 -- 2 files changed, 74 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ab211706b160..15d5855c542c 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -78,12 +78,10 @@ void init_rt_rq(struct rt_rq *rt_rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -332,8 +330,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) } #endif /* !CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_SMP - static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ @@ -433,21 +429,6 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) } } -#else /* !CONFIG_SMP: */ - -static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) -{ -} - -static inline void rt_queue_push_tasks(struct rq *rq) -{ -} -#endif /* !CONFIG_SMP */ - static void enqueue_top_rt_rq(struct rt_rq *rt_rq); static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count); @@ -597,17 +578,10 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se) return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP static inline const struct cpumask 
*sched_rt_period_mask(void) { return this_rq()->rd->span; } -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) @@ -628,7 +602,6 @@ bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) rt_rq->rt_time < rt_b->rt_runtime); } -#ifdef CONFIG_SMP /* * We ran out of runtime, see if we can borrow some from our neighbours. */ @@ -801,9 +774,6 @@ static void balance_runtime(struct rt_rq *rt_rq) raw_spin_lock(&rt_rq->rt_runtime_lock); } } -#else /* !CONFIG_SMP: */ -static inline void balance_runtime(struct rt_rq *rt_rq) {} -#endif /* !CONFIG_SMP */ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { @@ -980,10 +950,8 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return &cpu_rq(cpu)->rt; } -#ifdef CONFIG_SMP static void __enable_runtime(struct rq *rq) { } static void __disable_runtime(struct rq *rq) { } -#endif #endif /* !CONFIG_RT_GROUP_SCHED */ @@ -1078,8 +1046,6 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq) cpufreq_update_util(rq, 0); } -#if defined CONFIG_SMP - static void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) { @@ -1110,16 +1076,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); } -#else /* !CONFIG_SMP: */ - -static inline -void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} -static inline -void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} - -#endif /* !CONFIG_SMP */ - -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED static void inc_rt_prio(struct rt_rq *rt_rq, int prio) { @@ -1158,13 +1114,6 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) dec_rt_prio_smp(rt_rq, prio, prev_prio); } -#else /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED): */ - -static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} -static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} - -#endif /* !(CONFIG_SMP || CONFIG_RT_GROUP_SCHED) */ - #ifdef CONFIG_RT_GROUP_SCHED static void @@ -1541,7 +1490,6 @@ static void yield_task_rt(struct rq *rq) requeue_task_rt(rq, rq->curr, 0); } -#ifdef CONFIG_SMP static int find_lowest_rq(struct task_struct *task); static int @@ -1656,7 +1604,6 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq); } -#endif /* CONFIG_SMP */ /* * Preempt the current task with a newly woken task if needed: @@ -1670,7 +1617,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) return; } -#ifdef CONFIG_SMP /* * If: * @@ -1685,7 +1631,6 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) */ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) check_preempt_equal_prio(rq, p); -#endif /* CONFIG_SMP */ } static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first) @@ -1779,8 +1724,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s enqueue_pushable_task(rq, p); } -#ifdef CONFIG_SMP - /* Only try algorithms three times */ #define RT_MAX_TRIES 3 @@ -2454,11 +2397,6 @@ void __init init_sched_rt_class(void) GFP_KERNEL, cpu_to_node(i)); } } -#else /* !CONFIG_SMP: */ -void __init init_sched_rt_class(void) -{ -} -#endif /* !CONFIG_SMP */ /* * When switching a task to RT, we may overload the runqueue @@ -2482,10 +2420,8 @@ static void 
switched_to_rt(struct rq *rq, struct task_struct *p) * then see if we can move to another run queue. */ if (task_on_rq_queued(p)) { -#ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) rt_queue_push_tasks(rq); -#endif /* CONFIG_SMP */ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } @@ -2502,7 +2438,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) return; if (task_current_donor(rq, p)) { -#ifdef CONFIG_SMP /* * If our priority decreases while running, we * may need to pull tasks to this runqueue. @@ -2516,11 +2451,6 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) */ if (p->prio > rq->rt.highest_prio.curr) resched_curr(rq); -#else /* !CONFIG_SMP: */ - /* For UP simply resched on drop of prio */ - if (oldprio < p->prio) - resched_curr(rq); -#endif /* !CONFIG_SMP */ } else { /* * This task is not running, but if it is @@ -2641,7 +2571,6 @@ DEFINE_SCHED_CLASS(rt) = { .put_prev_task = put_prev_task_rt, .set_next_task = set_next_task_rt, -#ifdef CONFIG_SMP .balance = balance_rt, .select_task_rq = select_task_rq_rt, .set_cpus_allowed = set_cpus_allowed_common, @@ -2650,7 +2579,6 @@ DEFINE_SCHED_CLASS(rt) = { .task_woken = task_woken_rt, .switched_from = switched_from_rt, .find_lock_rq = find_lock_lowest_rq, -#endif /* !CONFIG_SMP */ .task_tick = task_tick_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3e7151ed5e5e..6bc8e426f174 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -792,11 +792,9 @@ struct rt_rq { int curr; /* highest queued rt task prio */ int next; /* next highest */ } highest_prio; -#ifdef CONFIG_SMP bool overloaded; struct plist_head pushable_tasks; -#endif /* CONFIG_SMP */ int rt_queued; #ifdef CONFIG_RT_GROUP_SCHED From 6324dce8f6262ec2049494af311e5418bc733341 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:10 +0200 Subject: [PATCH 31/84] sched/smp: Use the SMP version of the deadline scheduling class Simplify the scheduler by making CONFIG_SMP=y code in prio_changed_dl() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-30-mingo@kernel.org --- kernel/sched/deadline.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index bf9b70a3ff95..0f30697ad795 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3005,7 +3005,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, if (!task_on_rq_queued(p)) return; -#ifdef CONFIG_SMP /* * This might be too much, but unfortunately * we don't have the old deadline value, and @@ -3034,13 +3033,6 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) resched_curr(rq); } -#else /* !CONFIG_SMP: */ - /* - * We don't know if p has a earlier or later deadline, so let's blindly - * set a (maybe not needed) rescheduling point. - */ - resched_curr(rq); -#endif /* !CONFIG_SMP */ } #ifdef CONFIG_SCHED_CORE From 02fb885ebdc4ad029aabff4b85dcbc540d7cdb32 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:11 +0200 Subject: [PATCH 32/84] sched/smp: Use the SMP version of scheduler debugging data Simplify the scheduler by making CONFIG_SMP=y debug output unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-31-mingo@kernel.org --- kernel/sched/debug.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 04c0354f05e4..f16d53928bb9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,8 +169,6 @@ static const struct file_operations sched_feat_fops = { .release = single_release, }; -#ifdef CONFIG_SMP - static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -217,8 +215,6 @@ static const struct file_operations sched_scaling_fops = { .release = single_release, }; -#endif /* CONFIG_SMP */ - #ifdef CONFIG_PREEMPT_DYNAMIC static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, @@ -511,7 +507,6 @@ static __init int sched_init_debug(void) debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -#ifdef CONFIG_SMP debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); @@ -519,7 +514,6 @@ static __init int sched_init_debug(void) sched_domains_mutex_lock(); update_sched_domain_debugfs(); sched_domains_mutex_unlock(); -#endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING numa = debugfs_create_dir("numa_balancing", debugfs_sched); @@ -685,11 +679,9 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } P(se->load.weight); -#ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); P(se->avg.runnable_avg); -#endif /* CONFIG_SMP */ #undef PN_SCHEDSTAT #undef PN @@ -849,7 +841,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued); SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); -#ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg", @@ -870,7 +861,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", atomic_long_read(&cfs_rq->tg->load_avg)); #endif /* CONFIG_FAIR_GROUP_SCHED */ -#endif /* CONFIG_SMP */ #ifdef CONFIG_CFS_BANDWIDTH SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); @@ -967,12 +957,10 @@ do { \ #undef P #undef PN -#ifdef CONFIG_SMP #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); P64(avg_idle); P64(max_idle_balance_cost); #undef P64 -#endif /* CONFIG_SMP */ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n)); if (schedstat_enabled()) { @@ -1242,7 +1230,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, __PS("nr_involuntary_switches", p->nivcsw); P(se.load.weight); -#ifdef CONFIG_SMP P(se.avg.load_sum); P(se.avg.runnable_sum); P(se.avg.util_sum); @@ -1251,7 +1238,6 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); PM(se.avg.util_est, 
~UTIL_AVG_UNCHANGED); -#endif /* CONFIG_SMP */ #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); __PS("uclamp.max", p->uclamp_req[UCLAMP_MAX].value); From 9d9af2372f2a46242fd5e827973235f40f31a706 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:12 +0200 Subject: [PATCH 33/84] sched/smp: Use the SMP version of schedstats Simplify the scheduler by making CONFIG_SMP=y schedstats debugging output unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-32-mingo@kernel.org --- kernel/sched/stats.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 86acd374fc4d..d1c9429a4ac5 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -115,10 +115,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "timestamp %lu\n", jiffies); } else { struct rq *rq; -#ifdef CONFIG_SMP struct sched_domain *sd; int dcount = 0; -#endif cpu = (unsigned long)(v - 2); rq = cpu_rq(cpu); @@ -133,7 +131,6 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "\n"); -#ifdef CONFIG_SMP /* domain-specific stats */ rcu_read_lock(); for_each_domain(cpu, sd) { @@ -164,7 +161,6 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->ttwu_move_balance); } rcu_read_unlock(); -#endif /* CONFIG_SMP */ } return 0; } From 8a9246ddc16c0feaa3b09ca9d2e30fbfa88d09de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:13 +0200 Subject: [PATCH 34/84] sched/smp: Use the SMP version of the scheduler syscalls Simplify the scheduler by making CONFIG_SMP=y code in idle_cpu(), __sched_setscheduler() and sched_setaffinity() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-33-mingo@kernel.org --- kernel/sched/syscalls.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index d7fccf871c7d..77ae87f36e84 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -209,10 +209,8 @@ int idle_cpu(int cpu) if (rq->nr_running) return 0; -#ifdef CONFIG_SMP if (rq->ttwu_pending) return 0; -#endif return 1; } @@ -641,7 +639,6 @@ change: goto unlock; } #endif /* CONFIG_RT_GROUP_SCHED */ -#ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(policy) && !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; @@ -657,7 +654,6 @@ change: goto unlock; } } -#endif /* CONFIG_SMP */ } /* Re-check policy now with rq lock held: */ @@ -1239,7 +1235,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); if (user_mask) { cpumask_copy(user_mask, in_mask); - } else if (IS_ENABLED(CONFIG_SMP)) { + } else { return -ENOMEM; } From 6c8d251621c131274fa05b46fc138f296b00f6e4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:14 +0200 Subject: [PATCH 35/84] sched/smp: Use the SMP version of sched_update_asym_prefer_cpu() Simplify the scheduler by making CONFIG_SMP=y code in sched_update_asym_prefer_cpu() unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-34-mingo@kernel.org --- kernel/sched/topology.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f2c10167f2bc..8e06b1d22e91 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1323,7 +1323,6 @@ next: /* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. */ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) { -#ifdef CONFIG_SMP int asym_prefer_cpu = cpu; struct sched_domain *sd; @@ -1373,7 +1372,6 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); } -#endif /* CONFIG_SMP */ } /* From 482c4dae75cbe3a3bc7109d6c2346e400facbea8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:15 +0200 Subject: [PATCH 36/84] sched/smp: Use the SMP version of the idle scheduling class Simplify the scheduler by making CONFIG_SMP=y code in the idle scheduling class unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-35-mingo@kernel.org --- kernel/sched/idle.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 43d6e006cff1..c39b089d4f09 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -432,7 +432,6 @@ void cpu_startup_entry(enum cpuhp_state state) * idle-task scheduling class. */ -#ifdef CONFIG_SMP static int select_task_rq_idle(struct task_struct *p, int cpu, int flags) { @@ -444,7 +443,6 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return WARN_ON_ONCE(1); } -#endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -531,11 +529,9 @@ DEFINE_SCHED_CLASS(idle) = { .put_prev_task = put_prev_task_idle, .set_next_task = set_next_task_idle, -#ifdef CONFIG_SMP .balance = balance_idle, .select_task_rq = select_task_rq_idle, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_idle, From caf5bde9c542f0cbdf2c91db7ea9743e33941d91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:16 +0200 Subject: [PATCH 37/84] sched/smp: Use the SMP version of the stop-CPU scheduling class Simplify the scheduler by making CONFIG_SMP=y code in the stop-CPU scheduling class unconditional.
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-36-mingo@kernel.org --- kernel/sched/stop_task.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 1c1bbc6e33b6..2d4e279f05ee 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -9,7 +9,6 @@ */ #include "sched.h" -#ifdef CONFIG_SMP static int select_task_rq_stop(struct task_struct *p, int cpu, int flags) { @@ -21,7 +20,6 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { return sched_stop_runnable(rq); } -#endif /* CONFIG_SMP */ static void wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) @@ -107,11 +105,9 @@ DEFINE_SCHED_CLASS(stop) = { .put_prev_task = put_prev_task_stop, .set_next_task = set_next_task_stop, -#ifdef CONFIG_SMP .balance = balance_stop, .select_task_rq = select_task_rq_stop, .set_cpus_allowed = set_cpus_allowed_common, -#endif .task_tick = task_tick_stop, From 172408811961d7facbd1ec9af66893b058d5d542 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:17 +0200 Subject: [PATCH 38/84] sched/smp: Use the SMP version of cpu_of() Simplify the scheduler by making CONFIG_SMP=y code in cpu_of() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-37-mingo@kernel.org --- kernel/sched/sched.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6bc8e426f174..234dd28e2bec 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1294,11 +1294,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) static inline int cpu_of(struct rq *rq) { -#ifdef CONFIG_SMP return rq->cpu; -#else - return 0; -#endif } #define MDF_PUSH 0x01 From 9fd5da7989ba6175fe71439738ddcf4c030d3d51 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:18 +0200 Subject: [PATCH 39/84] sched/smp: Use the SMP version of is_migration_disabled() Simplify the scheduler by making the CONFIG_SMP-only code in is_migration_disabled() unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-38-mingo@kernel.org --- kernel/sched/sched.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 234dd28e2bec..b184c3d61ba2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1301,11 +1301,7 @@ static inline int cpu_of(struct rq *rq) static inline bool is_migration_disabled(struct task_struct *p) { -#ifdef CONFIG_SMP return p->migration_disabled; -#else - return false; -#endif } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); From 703b8e8545c7ca88002164d6c119c49e8ee9b137 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:19 +0200 Subject: [PATCH 40/84] sched/smp: Use the SMP version of rq_pin_lock() Simplify the scheduler by making a CONFIG_SMP-only warning in rq_pin_lock() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-39-mingo@kernel.org --- kernel/sched/sched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b184c3d61ba2..f4bac9fcb36b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1745,9 +1745,7 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#ifdef CONFIG_SMP WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback); -#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) From ea100b31eed459ea32d6df4489cf70219fd83a07 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:20 +0200 Subject: [PATCH 41/84] sched/smp: Use the SMP version of task_on_cpu() Simplify the scheduler by making CONFIG_SMP=y code in task_on_cpu() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-40-mingo@kernel.org --- kernel/sched/sched.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f4bac9fcb36b..7490473a73aa 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2238,11 +2238,7 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p) static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { -#ifdef CONFIG_SMP return p->on_cpu; -#else - return task_current(rq, p); -#endif } static inline int task_on_rq_queued(struct task_struct *p) From 0203244600b2f48c7074a9d439789d8d1152d3e1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:21 +0200 Subject: [PATCH 42/84] sched/smp: Use the SMP version of WF_ and SD_ flag sanity checks Simplify the scheduler by making CONFIG_SMP=y asserts related to WF_ and SD_ flags unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-41-mingo@kernel.org --- kernel/sched/sched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7490473a73aa..d8c78ee73d12 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2261,11 +2261,9 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_CURRENT_CPU 0x40 /* Prefer to move the wakee to the current CPU. */ #define WF_RQ_SELECTED 0x80 /* ->select_task_rq() was called */ -#ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); static_assert(WF_FORK == SD_BALANCE_FORK); static_assert(WF_TTWU == SD_BALANCE_WAKE); -#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution From 241c307b05b00478bbb65bf440ece464aeac9208 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:22 +0200 Subject: [PATCH 43/84] sched/smp: Use the SMP version of ENQUEUE_MIGRATED Simplify the scheduler by making the CONFIG_SMP-only ENQUEUE_MIGRATED flag unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-42-mingo@kernel.org --- kernel/sched/sched.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d8c78ee73d12..45245f4057ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2319,11 +2319,7 @@ extern const u32 sched_prio_to_wmult[40]; #define ENQUEUE_HEAD 0x10 #define ENQUEUE_REPLENISH 0x20 -#ifdef CONFIG_SMP #define ENQUEUE_MIGRATED 0x40 -#else -#define ENQUEUE_MIGRATED 0x00 -#endif #define ENQUEUE_INITIAL 0x80 #define ENQUEUE_MIGRATING 0x100 #define ENQUEUE_DELAYED 0x200 From fc75ac3c918d512952f7c132ef635cedfc4900a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:23 +0200 Subject: [PATCH 44/84] sched/smp: Use the SMP version of add_nr_running() Simplify the scheduler by making CONFIG_SMP=y code in add_nr_running() unconditional. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-43-mingo@kernel.org --- kernel/sched/sched.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 45245f4057ce..aa08103f8584 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2673,10 +2673,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) call_trace_sched_update_nr_running(rq, count); } -#ifdef CONFIG_SMP if (prev_nr < 2 && rq->nr_running >= 2) set_rd_overloaded(rq->rd, 1); -#endif sched_update_tick_dependency(rq); } From dabe1be4e84c05db9341eb8c6c410e18a5ffeaa5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 28 May 2025 10:09:24 +0200 Subject: [PATCH 45/84] sched/smp: Use the SMP version of double_rq_clock_clear_update() Simplify the scheduler by making CONFIG_SMP=y code in double_rq_clock_clear_update() unconditional. 
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: Dietmar Eggemann Cc: Juri Lelli Cc: Linus Torvalds Cc: Mel Gorman Cc: Sebastian Andrzej Siewior Cc: Shrikanth Hegde Cc: Steven Rostedt Cc: Valentin Schneider Cc: Vincent Guittot Link: https://lore.kernel.org/r/20250528080924.2273858-44-mingo@kernel.org --- kernel/sched/sched.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index aa08103f8584..c323d015486c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2840,10 +2840,7 @@ unsigned long arch_scale_freq_capacity(int cpu) static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); - /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */ -#ifdef CONFIG_SMP rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); -#endif } #define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \ From d403a3689af5c3a3e3ac6e282958d0eaa69ca47f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:23:27 -1000 Subject: [PATCH 46/84] sched/fair: Move max_cfs_quota_period decl and default_cfs_period() def from fair.c to sched.h max_cfs_quota_period is defined in core.c but has a declaration in fair.c. Move the declaration to kernel/sched/sched.h. Also, move default_cfs_period() from fair.c to sched.h. No functional changes. Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250614012346.2358261-2-tj@kernel.org --- kernel/sched/fair.c | 11 ----------- kernel/sched/sched.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b17d3da034a..707be4570430 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5617,15 +5617,6 @@ void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} #endif /* !CONFIG_JUMP_LABEL */ -/* - * default period for cfs group bandwidth. - * default: 0.1s, units: nanoseconds - */ -static inline u64 default_cfs_period(void) -{ - return 100000000ULL; -} - static inline u64 sched_cfs_bandwidth_slice(void) { return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC; @@ -6405,8 +6396,6 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } -extern const u64 max_cfs_quota_period; - static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c323d015486c..e00b80cd998e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -402,6 +402,19 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; +#ifdef CONFIG_CFS_BANDWIDTH +extern const u64 max_cfs_quota_period; + +/* + * default period for cfs group bandwidth. + * default: 0.1s, units: nanoseconds + */ +static inline u64 default_cfs_period(void) +{ + return 100000000ULL; +} +#endif /* CONFIG_CFS_BANDWIDTH */ + struct cfs_bandwidth { #ifdef CONFIG_CFS_BANDWIDTH raw_spinlock_t lock; From de4c80c6963e130707ead16a544a387f811dbd87 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:23:28 -1000 Subject: [PATCH 47/84] sched/core: Relocate tg_get_cfs_*() and cpu_cfs_*_read_*() Collect the getters, relocate the trivial interface file wrappers, and put all of them in period, quota, burst order to prepare for future changes. Pure reordering. No functional changes. 
Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250614012346.2358261-3-tj@kernel.org --- kernel/sched/core.c | 134 ++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a03c3c1d7f50..dc9668a6592f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9404,20 +9404,14 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, return 0; } -static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) +static long tg_get_cfs_period(struct task_group *tg) { - u64 quota, period, burst; + u64 cfs_period_us; - period = ktime_to_ns(tg->cfs_bandwidth.period); - burst = tg->cfs_bandwidth.burst; - if (cfs_quota_us < 0) - quota = RUNTIME_INF; - else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) - quota = (u64)cfs_quota_us * NSEC_PER_USEC; - else - return -EINVAL; + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); + do_div(cfs_period_us, NSEC_PER_USEC); - return tg_set_cfs_bandwidth(tg, period, quota, burst); + return cfs_period_us; } static long tg_get_cfs_quota(struct task_group *tg) @@ -9433,6 +9427,16 @@ static long tg_get_cfs_quota(struct task_group *tg) return quota_us; } +static long tg_get_cfs_burst(struct task_group *tg) +{ + u64 burst_us; + + burst_us = tg->cfs_bandwidth.burst; + do_div(burst_us, NSEC_PER_USEC); + + return burst_us; +} + static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) { u64 quota, period, burst; @@ -9447,14 +9451,20 @@ static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) return tg_set_cfs_bandwidth(tg, period, quota, burst); } -static long tg_get_cfs_period(struct task_group *tg) +static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { - u64 cfs_period_us; + u64 quota, period, burst; - cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); - do_div(cfs_period_us, NSEC_PER_USEC); + period = ktime_to_ns(tg->cfs_bandwidth.period); + burst = tg->cfs_bandwidth.burst; + if (cfs_quota_us < 0) + quota = RUNTIME_INF; + else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) + quota = (u64)cfs_quota_us * NSEC_PER_USEC; + else + return -EINVAL; - return cfs_period_us; + return tg_set_cfs_bandwidth(tg, period, quota, burst); } static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) @@ -9471,52 +9481,6 @@ static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) return tg_set_cfs_bandwidth(tg, period, quota, burst); } -static long tg_get_cfs_burst(struct task_group *tg) -{ - u64 burst_us; - - burst_us = tg->cfs_bandwidth.burst; - do_div(burst_us, NSEC_PER_USEC); - - return burst_us; -} - -static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_quota(css_tg(css)); -} - -static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, - struct cftype *cftype, s64 cfs_quota_us) -{ - return tg_set_cfs_quota(css_tg(css), cfs_quota_us); -} - -static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_period(css_tg(css)); -} - -static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_period_us) -{ - return tg_set_cfs_period(css_tg(css), cfs_period_us); -} - -static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - return tg_get_cfs_burst(css_tg(css)); -} - -static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, - 
struct cftype *cftype, u64 cfs_burst_us) -{ - return tg_set_cfs_burst(css_tg(css), cfs_burst_us); -} - struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9649,6 +9613,42 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } + +static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_period(css_tg(css)); +} + +static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_quota(css_tg(css)); +} + +static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return tg_get_cfs_burst(css_tg(css)); +} + +static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_period_us) +{ + return tg_set_cfs_period(css_tg(css), cfs_period_us); +} + +static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 cfs_quota_us) +{ + return tg_set_cfs_quota(css_tg(css), cfs_quota_us); +} + +static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_burst_us) +{ + return tg_set_cfs_burst(css_tg(css), cfs_burst_us); +} #endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9710,16 +9710,16 @@ static struct cftype cpu_legacy_files[] = { }, #endif #ifdef CONFIG_CFS_BANDWIDTH - { - .name = "cfs_quota_us", - .read_s64 = cpu_cfs_quota_read_s64, - .write_s64 = cpu_cfs_quota_write_s64, - }, { .name = "cfs_period_us", .read_u64 = cpu_cfs_period_read_u64, .write_u64 = cpu_cfs_period_write_u64, }, + { + .name = "cfs_quota_us", + .read_s64 = cpu_cfs_quota_read_s64, + .write_s64 = cpu_cfs_quota_write_s64, + }, { .name = "cfs_burst_us", .read_u64 = cpu_cfs_burst_read_u64, From 43e33f53e25687ca870248d1939cfade0164426c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:23:29 -1000 Subject: [PATCH 48/84] sched/core: Reorganize cgroup bandwidth control interface file reads - Update tg_get_cfs_*() to return u64 values. These are now used as the low level accessors to the fair's bandwidth configuration parameters. Translation to usecs takes place in these functions. - Add tg_bandwidth() which reads all three bandwidth parameters using tg_get_cfs_*(). - Reimplement cgroup interface read functions using tg_bandwidth(). Drop cfs from the function names. This is to prepare for adding bandwidth control support to sched_ext. tg_bandwidth() will be used as the muxing point similar to tg_weight(). No functional changes. 
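The read-side reorganization described above boils down to one muxing accessor with optional outputs, plus the property that RUNTIME_INF (all bits set) reads back as -1 through the signed cfs_quota_us file. The stand-alone sketch below illustrates only that calling pattern; bw_sample, bw_read() and BW_RUNTIME_INF are invented names for the example and are not part of the patch, whose real accessors appear in the diff that follows.

#include <stdio.h>
#include <stddef.h>

#define BW_RUNTIME_INF (~0ULL)	/* stand-in for the kernel's RUNTIME_INF */

struct bw_sample {
	unsigned long long period_us;
	unsigned long long quota_us;
	unsigned long long burst_us;
};

/* Fill only the outputs the caller asked for, like tg_bandwidth(). */
static void bw_read(const struct bw_sample *bw,
		    unsigned long long *period_us_p,
		    unsigned long long *quota_us_p,
		    unsigned long long *burst_us_p)
{
	if (period_us_p)
		*period_us_p = bw->period_us;
	if (quota_us_p)
		*quota_us_p = bw->quota_us;
	if (burst_us_p)
		*burst_us_p = bw->burst_us;
}

int main(void)
{
	struct bw_sample bw = {
		.period_us = 100000,		/* 100 ms default period */
		.quota_us  = BW_RUNTIME_INF,	/* "no limit" */
		.burst_us  = 0,
	};
	unsigned long long quota_us;

	/* Mirrors cpu_quota_read_s64(): only the quota is requested. */
	bw_read(&bw, NULL, &quota_us, NULL);

	/* An unlimited quota reads back as -1 through the signed interface. */
	printf("cfs_quota_us = %lld\n", (long long)quota_us);
	return 0;
}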
Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250614012346.2358261-4-tj@kernel.org --- kernel/sched/core.c | 58 +++++++++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dc9668a6592f..8de93a3bfa27 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9404,7 +9404,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, return 0; } -static long tg_get_cfs_period(struct task_group *tg) +static u64 tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; @@ -9414,12 +9414,12 @@ static long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } -static long tg_get_cfs_quota(struct task_group *tg) +static u64 tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; if (tg->cfs_bandwidth.quota == RUNTIME_INF) - return -1; + return RUNTIME_INF; quota_us = tg->cfs_bandwidth.quota; do_div(quota_us, NSEC_PER_USEC); @@ -9427,7 +9427,7 @@ static long tg_get_cfs_quota(struct task_group *tg) return quota_us; } -static long tg_get_cfs_burst(struct task_group *tg) +static u64 tg_get_cfs_burst(struct task_group *tg) { u64 burst_us; @@ -9614,22 +9614,42 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } -static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) +static void tg_bandwidth(struct task_group *tg, + u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) { - return tg_get_cfs_period(css_tg(css)); + if (period_us_p) + *period_us_p = tg_get_cfs_period(tg); + if (quota_us_p) + *quota_us_p = tg_get_cfs_quota(tg); + if (burst_us_p) + *burst_us_p = tg_get_cfs_burst(tg); } -static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) +static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_quota(css_tg(css)); + u64 period_us; + + tg_bandwidth(css_tg(css), &period_us, NULL, NULL); + return period_us; } -static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) +static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_burst(css_tg(css)); + u64 quota_us; + + tg_bandwidth(css_tg(css), NULL, "a_us, NULL); + return quota_us; /* (s64)RUNTIME_INF becomes -1 */ +} + +static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + u64 burst_us; + + tg_bandwidth(css_tg(css), NULL, NULL, &burst_us); + return burst_us; } static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, @@ -9712,17 +9732,17 @@ static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_CFS_BANDWIDTH { .name = "cfs_period_us", - .read_u64 = cpu_cfs_period_read_u64, + .read_u64 = cpu_period_read_u64, .write_u64 = cpu_cfs_period_write_u64, }, { .name = "cfs_quota_us", - .read_s64 = cpu_cfs_quota_read_s64, + .read_s64 = cpu_quota_read_s64, .write_s64 = cpu_cfs_quota_write_s64, }, { .name = "cfs_burst_us", - .read_u64 = cpu_cfs_burst_read_u64, + .read_u64 = cpu_burst_read_u64, .write_u64 = cpu_cfs_burst_write_u64, }, { @@ -9944,8 +9964,10 @@ static int __maybe_unused cpu_period_quota_parse(char *buf, static int cpu_max_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); + u64 period_us, quota_us; - cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); + tg_bandwidth(tg, &period_us, "a_us, NULL); + cpu_period_quota_print(sf, period_us, quota_us); return 
0; } @@ -9996,7 +10018,7 @@ static struct cftype cpu_files[] = { { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_cfs_burst_read_u64, + .read_u64 = cpu_burst_read_u64, .write_u64 = cpu_cfs_burst_write_u64, }, #endif /* CONFIG_CFS_BANDWIDTH */ From 5bc34be478d09c4d16009e665e020ad0fcd0deea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Jun 2025 15:23:30 -1000 Subject: [PATCH 49/84] sched/core: Reorganize cgroup bandwidth control interface file writes - Move input parameter validation from tg_set_cfs_bandwidth() to the new outer function tg_set_bandwidth(). The outer function handles parameters in usecs, validates them and calls tg_set_cfs_bandwidth() which converts them into nsecs. This matches tg_bandwidth() on the read side. - max/min_cfs_* consts are now used by tg_set_bandwidth(). Relocate, convert into usecs and drop "cfs" from the names. - Reimplement cpu_cfs_{period|quote|burst}_write_*() using tg_bandwidth() and tg_set_bandwidth() and replace "cfs" in the names with "bw". - Update cpu_max_write() to use tg_set_bandiwdth(). cpu_period_quota_parse() is updated to drop nsec conversion accordingly. This aligns the behavior with cfs_period_quota_print(). - Drop now unused tg_set_cfs_{period|quota|burst}(). - While at it, for consistency, rename default_cfs_period() to default_bw_period_us() and make it return usecs. This is to prepare for adding bandwidth control support to sched_ext. tg_set_bandwidth() will be used as the muxing point. No functional changes intended. Signed-off-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20250614012346.2358261-5-tj@kernel.org --- kernel/sched/core.c | 205 +++++++++++++++++++++---------------------- kernel/sched/fair.c | 4 +- kernel/sched/sched.h | 10 +-- 3 files changed, 106 insertions(+), 113 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8de93a3bfa27..2f8caa9db78d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -9309,47 +9309,23 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, #ifdef CONFIG_CFS_BANDWIDTH static DEFINE_MUTEX(cfs_constraints_mutex); -const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ -static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ -/* More than 203 days if BW_SHIFT equals 20. */ -static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC; - static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); -static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, - u64 burst) +static int tg_set_cfs_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) { int i, ret = 0, runtime_enabled, runtime_was_enabled; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + u64 period, quota, burst; - if (tg == &root_task_group) - return -EINVAL; + period = (u64)period_us * NSEC_PER_USEC; - /* - * Ensure we have at some amount of bandwidth every period. This is - * to prevent reaching a state of large arrears when throttled via - * entity_tick() resulting in prolonged exit starvation. - */ - if (quota < min_cfs_quota_period || period < min_cfs_quota_period) - return -EINVAL; + if (quota_us == RUNTIME_INF) + quota = RUNTIME_INF; + else + quota = (u64)quota_us * NSEC_PER_USEC; - /* - * Likewise, bound things on the other side by preventing insane quota - * periods. This also allows us to normalize in computing quota - * feasibility. 
- */ - if (period > max_cfs_quota_period) - return -EINVAL; - - /* - * Bound quota to defend quota against overflow during bandwidth shift. - */ - if (quota != RUNTIME_INF && quota > max_cfs_runtime) - return -EINVAL; - - if (quota != RUNTIME_INF && (burst > quota || - burst + quota > max_cfs_runtime)) - return -EINVAL; + burst = (u64)burst_us * NSEC_PER_USEC; /* * Prevent race between setting of cfs_rq->runtime_enabled and @@ -9437,50 +9413,6 @@ static u64 tg_get_cfs_burst(struct task_group *tg) return burst_us; } -static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg->cfs_bandwidth.quota; - burst = tg->cfs_bandwidth.burst; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) -{ - u64 quota, period, burst; - - period = ktime_to_ns(tg->cfs_bandwidth.period); - burst = tg->cfs_bandwidth.burst; - if (cfs_quota_us < 0) - quota = RUNTIME_INF; - else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) - quota = (u64)cfs_quota_us * NSEC_PER_USEC; - else - return -EINVAL; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - -static int tg_set_cfs_burst(struct task_group *tg, long cfs_burst_us) -{ - u64 quota, period, burst; - - if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) - return -EINVAL; - - burst = (u64)cfs_burst_us * NSEC_PER_USEC; - period = ktime_to_ns(tg->cfs_bandwidth.period); - quota = tg->cfs_bandwidth.quota; - - return tg_set_cfs_bandwidth(tg, period, quota, burst); -} - struct cfs_schedulable_data { struct task_group *tg; u64 period, quota; @@ -9614,6 +9546,11 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) return 0; } +const u64 max_bw_quota_period_us = 1 * USEC_PER_SEC; /* 1s */ +static const u64 min_bw_quota_period_us = 1 * USEC_PER_MSEC; /* 1ms */ +/* More than 203 days if BW_SHIFT equals 20. */ +static const u64 max_bw_runtime_us = MAX_BW; + static void tg_bandwidth(struct task_group *tg, u64 *period_us_p, u64 *quota_us_p, u64 *burst_us_p) { @@ -9634,6 +9571,50 @@ static u64 cpu_period_read_u64(struct cgroup_subsys_state *css, return period_us; } +static int tg_set_bandwidth(struct task_group *tg, + u64 period_us, u64 quota_us, u64 burst_us) +{ + const u64 max_usec = U64_MAX / NSEC_PER_USEC; + + if (tg == &root_task_group) + return -EINVAL; + + /* Values should survive translation to nsec */ + if (period_us > max_usec || + (quota_us != RUNTIME_INF && quota_us > max_usec) || + burst_us > max_usec) + return -EINVAL; + + /* + * Ensure we have some amount of bandwidth every period. This is to + * prevent reaching a state of large arrears when throttled via + * entity_tick() resulting in prolonged exit starvation. + */ + if (quota_us < min_bw_quota_period_us || + period_us < min_bw_quota_period_us) + return -EINVAL; + + /* + * Likewise, bound things on the other side by preventing insane quota + * periods. This also allows us to normalize in computing quota + * feasibility. + */ + if (period_us > max_bw_quota_period_us) + return -EINVAL; + + /* + * Bound quota to defend quota against overflow during bandwidth shift. 
+ */ + if (quota_us != RUNTIME_INF && quota_us > max_bw_runtime_us) + return -EINVAL; + + if (quota_us != RUNTIME_INF && (burst_us > quota_us || + burst_us + quota_us > max_bw_runtime_us)) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, period_us, quota_us, burst_us); +} + static s64 cpu_quota_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -9652,22 +9633,37 @@ static u64 cpu_burst_read_u64(struct cgroup_subsys_state *css, return burst_us; } -static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_period_us) +static int cpu_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 period_us) { - return tg_set_cfs_period(css_tg(css), cfs_period_us); + struct task_group *tg = css_tg(css); + u64 quota_us, burst_us; + + tg_bandwidth(tg, NULL, "a_us, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); } -static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, - struct cftype *cftype, s64 cfs_quota_us) +static int cpu_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 quota_us) { - return tg_set_cfs_quota(css_tg(css), cfs_quota_us); + struct task_group *tg = css_tg(css); + u64 period_us, burst_us; + + if (quota_us < 0) + quota_us = RUNTIME_INF; + + tg_bandwidth(tg, &period_us, NULL, &burst_us); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); } -static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 cfs_burst_us) +static int cpu_burst_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 burst_us) { - return tg_set_cfs_burst(css_tg(css), cfs_burst_us); + struct task_group *tg = css_tg(css); + u64 period_us, quota_us; + + tg_bandwidth(tg, &period_us, "a_us, NULL); + return tg_set_bandwidth(tg, period_us, quota_us, burst_us); } #endif /* CONFIG_CFS_BANDWIDTH */ @@ -9733,17 +9729,17 @@ static struct cftype cpu_legacy_files[] = { { .name = "cfs_period_us", .read_u64 = cpu_period_read_u64, - .write_u64 = cpu_cfs_period_write_u64, + .write_u64 = cpu_period_write_u64, }, { .name = "cfs_quota_us", .read_s64 = cpu_quota_read_s64, - .write_s64 = cpu_cfs_quota_write_s64, + .write_s64 = cpu_quota_write_s64, }, { .name = "cfs_burst_us", .read_u64 = cpu_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .write_u64 = cpu_burst_write_u64, }, { .name = "stat", @@ -9940,22 +9936,20 @@ static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, } /* caller should put the current value in *@periodp before calling */ -static int __maybe_unused cpu_period_quota_parse(char *buf, - u64 *periodp, u64 *quotap) +static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *period_us_p, + u64 *quota_us_p) { char tok[21]; /* U64_MAX */ - if (sscanf(buf, "%20s %llu", tok, periodp) < 1) + if (sscanf(buf, "%20s %llu", tok, period_us_p) < 1) return -EINVAL; - *periodp *= NSEC_PER_USEC; - - if (sscanf(tok, "%llu", quotap)) - *quotap *= NSEC_PER_USEC; - else if (!strcmp(tok, "max")) - *quotap = RUNTIME_INF; - else - return -EINVAL; + if (sscanf(tok, "%llu", quota_us_p) < 1) { + if (!strcmp(tok, "max")) + *quota_us_p = RUNTIME_INF; + else + return -EINVAL; + } return 0; } @@ -9975,14 +9969,13 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct task_group *tg = css_tg(of_css(of)); - u64 period = tg_get_cfs_period(tg); - u64 burst = tg->cfs_bandwidth.burst; - u64 quota; + u64 period_us, quota_us, burst_us; int ret; - ret = 
cpu_period_quota_parse(buf, &period, "a); + tg_bandwidth(tg, &period_us, NULL, &burst_us); + ret = cpu_period_quota_parse(buf, &period_us, "a_us); if (!ret) - ret = tg_set_cfs_bandwidth(tg, period, quota, burst); + ret = tg_set_bandwidth(tg, period_us, quota_us, burst_us); return ret ?: nbytes; } #endif /* CONFIG_CFS_BANDWIDTH */ @@ -10019,7 +10012,7 @@ static struct cftype cpu_files[] = { .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_burst_read_u64, - .write_u64 = cpu_cfs_burst_write_u64, + .write_u64 = cpu_burst_write_u64, }, #endif /* CONFIG_CFS_BANDWIDTH */ #ifdef CONFIG_UCLAMP_TASK_GROUP diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 707be4570430..7e2963efe800 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6422,7 +6422,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) * to fail. */ new = old * 2; - if (new < max_cfs_quota_period) { + if (new < max_bw_quota_period_us * NSEC_PER_USEC) { cfs_b->period = ns_to_ktime(new); cfs_b->quota *= 2; cfs_b->burst *= 2; @@ -6456,7 +6456,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren raw_spin_lock_init(&cfs_b->lock); cfs_b->runtime = 0; cfs_b->quota = RUNTIME_INF; - cfs_b->period = ns_to_ktime(default_cfs_period()); + cfs_b->period = us_to_ktime(default_bw_period_us()); cfs_b->burst = 0; cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e00b80cd998e..105190b18020 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -403,15 +403,15 @@ static inline bool dl_server_active(struct sched_dl_entity *dl_se) extern struct list_head task_groups; #ifdef CONFIG_CFS_BANDWIDTH -extern const u64 max_cfs_quota_period; +extern const u64 max_bw_quota_period_us; /* - * default period for cfs group bandwidth. - * default: 0.1s, units: nanoseconds + * default period for group bandwidth. + * default: 0.1s, units: microseconds */ -static inline u64 default_cfs_period(void) +static inline u64 default_bw_period_us(void) { - return 100000000ULL; + return 100000ULL; } #endif /* CONFIG_CFS_BANDWIDTH */ From 3f9ebeba9878679bb43ee2db7d50a4691f55e3a5 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Mon, 24 Mar 2025 14:18:34 +0800 Subject: [PATCH 50/84] rust: sync: Mark CondVar::notify_*() inline When build the kernel using the llvm-18.1.3-rust-1.85.0-x86_64 with ARCH=arm64, the following symbols are generated: $nm vmlinux | grep ' _R'.*CondVar | rustfilt ... T ::notify_all ... T ::notify_one ... T ::notify_sync ... These notify_*() symbols are trivial wrappers around the C functions __wake_up() and __wake_up_sync(). It doesn't make sense to go through a trivial wrapper for these functions, so mark them inline. [boqun: Reword the commit title for consistency and reformat the commit log.] 
Suggested-by: Alice Ryhl Link: https://github.com/Rust-for-Linux/linux/issues/1145 Co-developed-by: Grace Deng Signed-off-by: Grace Deng Signed-off-by: Kunwu Chan Reviewed-by: Benno Lossin Reviewed-by: Alice Ryhl Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250324061835.1693125-1-kunwu.chan@linux.dev --- rust/kernel/sync/condvar.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs index caebf03f553b..c6ec64295c9f 100644 --- a/rust/kernel/sync/condvar.rs +++ b/rust/kernel/sync/condvar.rs @@ -216,6 +216,7 @@ impl CondVar { /// This method behaves like `notify_one`, except that it hints to the scheduler that the /// current thread is about to go to sleep, so it should schedule the target thread on the same /// CPU. + #[inline] pub fn notify_sync(&self) { // SAFETY: `wait_queue_head` points to valid memory. unsafe { bindings::__wake_up_sync(self.wait_queue_head.get(), TASK_NORMAL) }; @@ -225,6 +226,7 @@ impl CondVar { /// /// This is not 'sticky' in the sense that if no thread is waiting, the notification is lost /// completely (as opposed to automatically waking up the next waiter). + #[inline] pub fn notify_one(&self) { self.notify(1); } @@ -233,6 +235,7 @@ impl CondVar { /// /// This is not 'sticky' in the sense that if no thread is waiting, the notification is lost /// completely (as opposed to automatically waking up the next waiter). + #[inline] pub fn notify_all(&self) { self.notify(0); } From 11867144ff81ab98f4b11c99716c3e8b714b8755 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Mon, 17 Mar 2025 10:52:05 +0800 Subject: [PATCH 51/84] rust: sync: Mark PollCondVar::drop() inline When building the kernel using the llvm-18.1.3-rust-1.85.0-x86_64 with ARCH=arm64, the following symbols are generated: $nm vmlinux | grep ' _R'.*PollCondVar | rustfilt ... T ::drop ... This Rust symbol is trivial wrappers around the C functions __wake_up_pollfree() and synchronize_rcu(). It doesn't make sense to go through a trivial wrapper for its functions, so mark it inline. [boqun: Reword the commit title and re-format the commit log per tip tree's requirement, remove unnecessary information from "nm vmlinux" result.] Link: https://github.com/Rust-for-Linux/linux/issues/1145 Suggested-by: Alice Ryhl Co-developed-by: Grace Deng Signed-off-by: Grace Deng Signed-off-by: Kunwu Chan Reviewed-by: Alice Ryhl Reviewed-by: Benno Lossin Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250317025205.2366518-1-kunwu.chan@linux.dev --- rust/kernel/sync/poll.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/kernel/sync/poll.rs b/rust/kernel/sync/poll.rs index d7e6e59e124b..7b973d72229b 100644 --- a/rust/kernel/sync/poll.rs +++ b/rust/kernel/sync/poll.rs @@ -107,6 +107,7 @@ impl Deref for PollCondVar { #[pinned_drop] impl PinnedDrop for PollCondVar { + #[inline] fn drop(self: Pin<&mut Self>) { // Clear anything registered using `register_wait`. // From 0a41f5af19391ce55cae1f0d7a562e8694bf1fd5 Mon Sep 17 00:00:00 2001 From: Panagiotis Foliadis Date: Sat, 15 Mar 2025 12:23:01 +0000 Subject: [PATCH 52/84] rust: task: Mark Task methods inline When building the kernel using the llvm-18.1.3-rust-1.85.0-x86_64 toolchain provided by kernel.org, the following symbols are generated: $ nm vmlinux | grep ' _R'.*Task | rustfilt ... T ::get_pid_ns ... T ::tgid_nr_ns ... T ::current_pid_ns ... T ::signal_pending ... T ::uid ... T ::euid ... T ::current ... T ::wake_up ... T ::dec_ref ... 
T ::inc_ref These Rust symbols are trivial wrappers around the C functions. It doesn't make sense to go through a trivial wrapper for these functions, so mark them inline. [boqun: Capitalize the title, reword a bit to avoid listing all the C functions as the code already shows them and remove the addresses of the symbols in the commit log as they are different from build to build.] Link: https://github.com/Rust-for-Linux/linux/issues/1145 Reviewed-by: Benno Lossin Reviewed-by: Christian Schrefl Reviewed-by: Charalampos Mitrodimas Reviewed-by: Alice Ryhl Signed-off-by: Panagiotis Foliadis Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250315-inline-c-wrappers-v3-1-048e43fcef7d@posteo.net --- rust/kernel/task.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 927413d85484..834368313088 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -173,6 +173,7 @@ impl Task { /// Callers must ensure that the returned object is only used to access a [`CurrentTask`] /// within the task context that was active when this function was called. For more details, /// see the invariants section for [`CurrentTask`]. + #[inline] pub unsafe fn current() -> impl Deref { struct TaskRef { task: *const CurrentTask, @@ -222,24 +223,28 @@ impl Task { } /// Returns the UID of the given task. + #[inline] pub fn uid(&self) -> Kuid { // SAFETY: It's always safe to call `task_uid` on a valid task. Kuid::from_raw(unsafe { bindings::task_uid(self.as_ptr()) }) } /// Returns the effective UID of the given task. + #[inline] pub fn euid(&self) -> Kuid { // SAFETY: It's always safe to call `task_euid` on a valid task. Kuid::from_raw(unsafe { bindings::task_euid(self.as_ptr()) }) } /// Determines whether the given task has pending signals. + #[inline] pub fn signal_pending(&self) -> bool { // SAFETY: It's always safe to call `signal_pending` on a valid task. unsafe { bindings::signal_pending(self.as_ptr()) != 0 } } /// Returns task's pid namespace with elevated reference count + #[inline] pub fn get_pid_ns(&self) -> Option> { // SAFETY: By the type invariant, we know that `self.0` is valid. let ptr = unsafe { bindings::task_get_pid_ns(self.as_ptr()) }; @@ -255,6 +260,7 @@ impl Task { /// Returns the given task's pid in the provided pid namespace. #[doc(alias = "task_tgid_nr_ns")] + #[inline] pub fn tgid_nr_ns(&self, pidns: Option<&PidNamespace>) -> Pid { let pidns = match pidns { Some(pidns) => pidns.as_ptr(), @@ -268,6 +274,7 @@ impl Task { } /// Wakes up the task. + #[inline] pub fn wake_up(&self) { // SAFETY: It's always safe to call `wake_up_process` on a valid task, even if the task // running. @@ -341,11 +348,13 @@ impl CurrentTask { // SAFETY: The type invariants guarantee that `Task` is always refcounted. unsafe impl crate::types::AlwaysRefCounted for Task { + #[inline] fn inc_ref(&self) { // SAFETY: The existence of a shared reference means that the refcount is nonzero. unsafe { bindings::get_task_struct(self.as_ptr()) }; } + #[inline] unsafe fn dec_ref(obj: ptr::NonNull) { // SAFETY: The safety requirements guarantee that the refcount is nonzero. 
unsafe { bindings::put_task_struct(obj.cast().as_ptr()) } From 0aa2b78ce5a9eac8f3332192ea77755d63a831cd Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Sun, 15 Jun 2025 15:14:00 -0700 Subject: [PATCH 53/84] rust: Introduce file_from_location() Most of kernel debugging facilities take a nul-terminated string for file names for a callsite (generated from __FILE__), however the Rust courterpart, Location, would return a Rust string (not nul-terminated) from method .file(). And such a string cannot be passed to C debugging function directly. There is ongoing work to support a Location::file_with_nul() [1], which returns a nul-terminated string from a Location. Since it's still working in progress, and it will take some time before the feature finally gets stabilized and the kernel's minimal rustc version might also take a while to bump to a version that at least has that feature, introduce a file_from_location() function, which returns a warning string if Location::file_with_nul() is not available. This should work in most cases because as for now the known usage of Location::file_with_nul() is only in debugging code (e.g. might_sleep()) and there might be other information reported by the debugging code that could help locate the problematic function, so missing the file name is fine at the moment. Link: https://github.com/rust-lang/rust/issues/141727 [1] Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250619151007.61767-2-boqun.feng@gmail.com --- init/Kconfig | 3 +++ rust/kernel/lib.rs | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index af4c2f085455..6f4ec5633ffa 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -142,6 +142,9 @@ config RUSTC_HAS_SPAN_FILE config RUSTC_HAS_UNNECESSARY_TRANSMUTES def_bool RUSTC_VERSION >= 108800 +config RUSTC_HAS_FILE_WITH_NUL + def_bool RUSTC_VERSION >= 108900 + config PAHOLE_VERSION int default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE)) diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index 6b4774b2b1c3..717a5b6160ca 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -40,6 +40,10 @@ #![cfg_attr(not(CONFIG_RUSTC_HAS_COERCE_POINTEE), feature(coerce_unsized))] #![cfg_attr(not(CONFIG_RUSTC_HAS_COERCE_POINTEE), feature(dispatch_from_dyn))] #![cfg_attr(not(CONFIG_RUSTC_HAS_COERCE_POINTEE), feature(unsize))] +// +// `feature(file_with_nul)` is expected to become stable. Before Rust 1.89.0, it did not exist, so +// enable it conditionally. +#![cfg_attr(CONFIG_RUSTC_HAS_FILE_WITH_NUL, feature(file_with_nul))] // Ensure conditional compilation based on the kernel configuration works; // otherwise we may silently break things like initcall handling. @@ -274,3 +278,47 @@ macro_rules! asm { ::core::arch::asm!( $($asm)*, $($rest)* ) }; } + +/// Gets the C string file name of a [`Location`]. +/// +/// If `file_with_nul()` is not available, returns a string that warns about it. +/// +/// [`Location`]: core::panic::Location +/// +/// # Examples +/// +/// ``` +/// # use kernel::file_from_location; +/// +/// #[track_caller] +/// fn foo() { +/// let caller = core::panic::Location::caller(); +/// +/// // Output: +/// // - A path like "rust/kernel/example.rs" if file_with_nul() is available. +/// // - "" otherwise. +/// let caller_file = file_from_location(caller); +/// +/// // Prints out the message with caller's file name. 
+/// pr_info!("foo() called in file {caller_file:?}\n"); +/// +/// # if cfg!(CONFIG_RUSTC_HAS_FILE_WITH_NUL) { +/// # assert_eq!(Ok(caller.file()), caller_file.to_str()); +/// # } +/// } +/// +/// # foo(); +/// ``` +#[inline] +pub fn file_from_location<'a>(loc: &'a core::panic::Location<'a>) -> &'a core::ffi::CStr { + #[cfg(CONFIG_RUSTC_HAS_FILE_WITH_NUL)] + { + loc.file_with_nul() + } + + #[cfg(not(CONFIG_RUSTC_HAS_FILE_WITH_NUL))] + { + let _ = loc; + c"" + } +} From 7e611710acf966df1e14bcf4e067385e38e549a1 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Apr 2025 07:56:23 +0900 Subject: [PATCH 54/84] rust: task: Add Rust version of might_sleep() Add a helper function equivalent to the C's might_sleep(), which serves as a debugging aid and a potential scheduling point. Note that this function can only be used in a nonatomic context. This will be used by Rust version of read_poll_timeout(). [boqun: Use file_from_location() to get a C string instead of changing __might_sleep()] Signed-off-by: FUJITA Tomonori Reviewed-by: Alice Ryhl Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20250619151007.61767-3-boqun.feng@gmail.com --- rust/helpers/task.c | 6 ++++++ rust/kernel/task.rs | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/rust/helpers/task.c b/rust/helpers/task.c index 31c33ea2dce6..2c85bbc2727e 100644 --- a/rust/helpers/task.c +++ b/rust/helpers/task.c @@ -1,7 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include +void rust_helper_might_resched(void) +{ + might_resched(); +} + struct task_struct *rust_helper_get_current(void) { return current; diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 834368313088..7d0935bc325c 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -400,3 +400,27 @@ impl PartialEq for Kuid { } impl Eq for Kuid {} + +/// Annotation for functions that can sleep. +/// +/// Equivalent to the C side [`might_sleep()`], this function serves as +/// a debugging aid and a potential scheduling point. +/// +/// This function can only be used in a nonatomic context. +/// +/// [`might_sleep()`]: https://docs.kernel.org/driver-api/basics.html#c.might_sleep +#[track_caller] +#[inline] +pub fn might_sleep() { + #[cfg(CONFIG_DEBUG_ATOMIC_SLEEP)] + { + let loc = core::panic::Location::caller(); + let file = kernel::file_from_location(loc); + + // SAFETY: `file.as_ptr()` is valid for reading and guaranteed to be nul-terminated. + unsafe { crate::bindings::__might_sleep(file.as_ptr().cast(), loc.line() as i32) } + } + + // SAFETY: Always safe to call. + unsafe { crate::bindings::might_resched() } +} From 155213a2aed42c85361bf4f5c817f5cb68951c3b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 26 Jun 2025 07:39:10 -0700 Subject: [PATCH 55/84] sched/fair: Bump sd->max_newidle_lb_cost when newidle balance fails schbench (https://github.com/masoncl/schbench.git) is showing a regression from previous production kernels that bisected down to: sched/fair: Remove sysctl_sched_migration_cost condition (c5b0a7eefc) The schbench command line was: schbench -L -m 4 -M auto -t 256 -n 0 -r 0 -s 0 This creates 4 message threads pinned to CPUs 0-3, and 256x4 worker threads spread across the rest of the CPUs. Neither the worker threads or the message threads do any work, they just wake each other up and go back to sleep as soon as possible. The end result is the first 4 CPUs are pegged waking up those 1024 workers, and the rest of the CPUs are constantly banging in and out of idle. 
If I take a v6.9 Linus kernel and revert that one commit, performance goes from 3.4M RPS to 5.4M RPS. schedstat shows there are ~100x more new idle balance operations, and profiling shows the worker threads are spending ~20% of their CPU time on new idle balance. schedstats also shows that almost all of these new idle balance attemps are failing to find busy groups. The fix used here is to crank up the cost of the newidle balance whenever it fails. Since we don't want sd->max_newidle_lb_cost to grow out of control, this also changes update_newidle_cost() to use sysctl_sched_migration_cost as the upper limit on max_newidle_lb_cost. Signed-off-by: Chris Mason Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Link: https://lkml.kernel.org/r/20250626144017.1510594-2-clm@fb.com --- kernel/sched/fair.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e2963efe800..ab0822cc51c2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12064,8 +12064,14 @@ static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. + * + * sched_balance_newidle() bumps the cost whenever newidle + * balance fails, and we don't want things to grow out of + * control. Use the sysctl_sched_migration_cost as the upper + * limit, plus a litle extra to avoid off by ones. */ - sd->max_newidle_lb_cost = cost; + sd->max_newidle_lb_cost = + min(cost, sysctl_sched_migration_cost + 200); sd->last_decay_max_lb_cost = jiffies; } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { /* @@ -12757,10 +12763,17 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t1 = sched_clock_cpu(this_cpu); domain_cost = t1 - t0; - update_newidle_cost(sd, domain_cost); - curr_cost += domain_cost; t0 = t1; + + /* + * Failing newidle means it is not effective; + * bump the cost so we end up doing less of it. + */ + if (!pulled_task) + domain_cost = (3 * sd->max_newidle_lb_cost) / 2; + + update_newidle_cost(sd, domain_cost); } /* From 570c8efd5eb79c3725ba439ce105ed1bedc5acd9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 May 2025 17:28:00 +0200 Subject: [PATCH 56/84] sched/psi: Optimize psi_group_change() cpu_clock() usage Dietmar reported that commit 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race") caused a regression for him on a high context switch rate benchmark (schbench) due to the now repeating cpu_clock() calls. In particular the problem is that get_recent_times() will extrapolate the current state to 'now'. But if an update uses a timestamp from before the start of the update, it is possible to get two reads with inconsistent results. It is effectively back-dating an update. (note that this all hard-relies on the clock being synchronized across CPUs -- if this is not the case, all bets are off). Combine this problem with the fact that there are per-group-per-cpu seqcounts, the commit in question pushed the clock read into the group iteration, causing tree-depth cpu_clock() calls. On architectures where cpu_clock() has appreciable overhead, this hurts. Instead move to a per-cpu seqcount, which allows us to have a single clock read for all group updates, increasing internal consistency and lowering update overhead. This comes at the cost of a longer update side (proportional to the tree depth) which can cause the read side to retry more often. 
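Condensed, the new locking shape looks like the sketch below. It uses the same per-CPU and seqcount primitives the patch uses (psi_seq, psi_group_change(), write/read_seqcount_*()), but example_task_change() and example_read_state() are illustrative stand-ins for psi_task_change() and get_recent_times(), and the snippet omits details such as the runqueue lock the write side runs under; the full diff follows.

/* One seqcount per CPU replaces the per-group-per-CPU seqcounts. */
static DEFINE_PER_CPU(seqcount_t, psi_seq);

/*
 * Write side: a single clock read and one write-side critical section
 * now cover the whole cgroup hierarchy, so every group is updated with
 * the same timestamp.
 */
static void example_task_change(struct psi_group *group, int cpu,
				unsigned int clear, unsigned int set)
{
	u64 now;

	write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
	now = cpu_clock(cpu);
	for (; group; group = group->parent)
		psi_group_change(group, cpu, clear, set, now, true);
	write_seqcount_end(per_cpu_ptr(&psi_seq, cpu));
}

/*
 * Read side: retry the snapshot if an update raced on that CPU.  The
 * write section is longer than before (it spans the whole hierarchy
 * walk), so retries can happen more often -- the trade-off described
 * in the changelog above.
 */
static u32 example_read_state(struct psi_group *group, int cpu)
{
	u32 state_mask;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu));
		state_mask = READ_ONCE(per_cpu_ptr(group->pcpu, cpu)->state_mask);
	} while (read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq));

	return state_mask;
}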
Fixes: 3840cbe24cf0 ("sched: psi: fix bogus pressure spikes from aggregation race") Reported-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Tested-by: Dietmar Eggemann , Link: https://lkml.kernel.org/20250522084844.GC31726@noisy.programming.kicks-ass.net --- include/linux/psi_types.h | 6 +- kernel/sched/psi.c | 121 +++++++++++++++++++++----------------- 2 files changed, 68 insertions(+), 59 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index f1fd3a8044e0..dd10c22299ab 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -84,11 +84,9 @@ enum psi_aggregators { struct psi_group_cpu { /* 1st cacheline updated by the scheduler */ - /* Aggregator needs to know of concurrent changes */ - seqcount_t seq ____cacheline_aligned_in_smp; - /* States of the tasks belonging to this group */ - unsigned int tasks[NR_PSI_TASK_COUNTS]; + unsigned int tasks[NR_PSI_TASK_COUNTS] + ____cacheline_aligned_in_smp; /* Aggregate pressure state derived from the tasks */ u32 state_mask; diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 5837cd8e7b97..2024c1d36402 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -176,6 +176,28 @@ struct psi_group psi_system = { .pcpu = &system_group_pcpu, }; +static DEFINE_PER_CPU(seqcount_t, psi_seq); + +static inline void psi_write_begin(int cpu) +{ + write_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline void psi_write_end(int cpu) +{ + write_seqcount_end(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline u32 psi_read_begin(int cpu) +{ + return read_seqcount_begin(per_cpu_ptr(&psi_seq, cpu)); +} + +static inline bool psi_read_retry(int cpu, u32 seq) +{ + return read_seqcount_retry(per_cpu_ptr(&psi_seq, cpu), seq); +} + static void psi_avgs_work(struct work_struct *work); static void poll_timer_fn(struct timer_list *t); @@ -186,7 +208,7 @@ static void group_init(struct psi_group *group) group->enabled = true; for_each_possible_cpu(cpu) - seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); + seqcount_init(per_cpu_ptr(&psi_seq, cpu)); group->avg_last_update = sched_clock(); group->avg_next_update = group->avg_last_update + psi_period; mutex_init(&group->avgs_lock); @@ -266,14 +288,14 @@ static void get_recent_times(struct psi_group *group, int cpu, /* Snapshot a coherent view of the CPU state */ do { - seq = read_seqcount_begin(&groupc->seq); + seq = psi_read_begin(cpu); now = cpu_clock(cpu); memcpy(times, groupc->times, sizeof(groupc->times)); state_mask = groupc->state_mask; state_start = groupc->state_start; if (cpu == current_cpu) memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); - } while (read_seqcount_retry(&groupc->seq, seq)); + } while (psi_read_retry(cpu, seq)); /* Calculate state time deltas against the previous snapshot */ for (s = 0; s < NR_PSI_STATES; s++) { @@ -772,30 +794,20 @@ static void record_times(struct psi_group_cpu *groupc, u64 now) groupc->times[PSI_NONIDLE] += delta; } +#define for_each_group(iter, group) \ + for (typeof(group) iter = group; iter; iter = iter->parent) + static void psi_group_change(struct psi_group *group, int cpu, unsigned int clear, unsigned int set, - bool wake_clock) + u64 now, bool wake_clock) { struct psi_group_cpu *groupc; unsigned int t, m; u32 state_mask; - u64 now; lockdep_assert_rq_held(cpu_rq(cpu)); groupc = per_cpu_ptr(group->pcpu, cpu); - /* - * First we update the task counts according to the state - * change requested through the @clear and @set bits. 
- * - * Then if the cgroup PSI stats accounting enabled, we - * assess the aggregate resource states this CPU's tasks - * have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - */ - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - /* * Start with TSK_ONCPU, which doesn't have a corresponding * task count - it's just a boolean flag directly encoded in @@ -847,7 +859,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); return; } @@ -868,8 +879,6 @@ static void psi_group_change(struct psi_group *group, int cpu, groupc->state_mask = state_mask; - write_seqcount_end(&groupc->seq); - if (state_mask & group->rtpoll_states) psi_schedule_rtpoll_work(group, 1, false); @@ -904,24 +913,29 @@ static void psi_flags_change(struct task_struct *task, int clear, int set) void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); - struct psi_group *group; + u64 now; if (!task->pid) return; psi_flags_change(task, clear, set); - group = task_psi_group(task); - do { - psi_group_change(group, cpu, clear, set, true); - } while ((group = group->parent)); + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(task)) + psi_group_change(group, cpu, clear, set, now, true); + psi_write_end(cpu); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep) { - struct psi_group *group, *common = NULL; + struct psi_group *common = NULL; int cpu = task_cpu(prev); + u64 now; + + psi_write_begin(cpu); + now = cpu_clock(cpu); if (next->pid) { psi_flags_change(next, 0, TSK_ONCPU); @@ -930,16 +944,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, * ancestors with @prev, those will already have @prev's * TSK_ONCPU bit set, and we can stop the iteration there. */ - group = task_psi_group(next); - do { - if (per_cpu_ptr(group->pcpu, cpu)->state_mask & - PSI_ONCPU) { + for_each_group(group, task_psi_group(next)) { + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + + if (groupc->state_mask & PSI_ONCPU) { common = group; break; } - - psi_group_change(group, cpu, 0, TSK_ONCPU, true); - } while ((group = group->parent)); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); + } } if (prev->pid) { @@ -972,12 +985,11 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_flags_change(prev, clear, set); - group = task_psi_group(prev); - do { + for_each_group(group, task_psi_group(prev)) { if (group == common) break; - psi_group_change(group, cpu, clear, set, wake_clock); - } while ((group = group->parent)); + psi_group_change(group, cpu, clear, set, now, wake_clock); + } /* * TSK_ONCPU is handled up to the common ancestor. 
If there are @@ -987,20 +999,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, */ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = group->parent) - psi_group_change(group, cpu, clear, set, wake_clock); + for_each_group(group, common) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } + psi_write_end(cpu); } #ifdef CONFIG_IRQ_TIME_ACCOUNTING void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev) { int cpu = task_cpu(curr); - struct psi_group *group; struct psi_group_cpu *groupc; s64 delta; u64 irq; + u64 now; if (static_branch_likely(&psi_disabled) || !irqtime_enabled()) return; @@ -1009,8 +1022,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; lockdep_assert_rq_held(rq); - group = task_psi_group(curr); - if (prev && task_psi_group(prev) == group) + if (prev && task_psi_group(prev) == task_psi_group(curr)) return; irq = irq_time_read(cpu); @@ -1019,25 +1031,22 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st return; rq->psi_irq_time = irq; - do { - u64 now; + psi_write_begin(cpu); + now = cpu_clock(cpu); + for_each_group(group, task_psi_group(curr)) { if (!group->enabled) continue; groupc = per_cpu_ptr(group->pcpu, cpu); - write_seqcount_begin(&groupc->seq); - now = cpu_clock(cpu); - record_times(groupc, now); groupc->times[PSI_IRQ_FULL] += delta; - write_seqcount_end(&groupc->seq); - if (group->rtpoll_states & (1 << PSI_IRQ_FULL)) psi_schedule_rtpoll_work(group, 1, false); - } while ((group = group->parent)); + } + psi_write_end(cpu); } #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ @@ -1225,12 +1234,14 @@ void psi_cgroup_restart(struct psi_group *group) return; for_each_possible_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; + u64 now; - rq_lock_irq(rq, &rf); - psi_group_change(group, cpu, 0, 0, true); - rq_unlock_irq(rq, &rf); + guard(rq_lock_irq)(cpu_rq(cpu)); + + psi_write_begin(cpu); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + psi_write_end(cpu); } } #endif /* CONFIG_CGROUPS */ From cccb45d7c4295bbfeba616582d0249f2d21e6df5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 May 2025 11:19:30 +0200 Subject: [PATCH 57/84] sched/deadline: Less agressive dl_server handling Chris reported that commit 5f6bd380c7bd ("sched/rt: Remove default bandwidth control") caused a significant dip in his favourite benchmark of the day. Simply disabling dl_server cured things. His workload hammers the 0->1, 1->0 transitions, and the dl_server_{start,stop}() overhead kills it -- fairly obviously a bad idea in hind sight and all that. Change things around to only disable the dl_server when there has not been a fair task around for a whole period. Since the default period is 1 second, this ensures the benchmark never trips this, overhead gone. 
Fixes: 557a6bfc662c ("sched/fair: Add trivial fair server") Reported-by: Chris Mason Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Juri Lelli Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20250702121158.465086194@infradead.org --- include/linux/sched.h | 1 + kernel/sched/deadline.c | 25 ++++++++++++++++++++++--- kernel/sched/fair.c | 9 --------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index eec6b225e9d1..4802fcf738cd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -698,6 +698,7 @@ struct sched_dl_entity { unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; + unsigned int dl_server_idle : 1; /* * Bandwidth enforcement timer. Each -deadline task has its diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0f30697ad795..23668fc60bd3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1150,6 +1150,8 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; +static bool dl_server_stopped(struct sched_dl_entity *dl_se); + static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) { struct rq *rq = rq_of_dl_se(dl_se); @@ -1169,6 +1171,7 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ if (!dl_se->server_has_tasks(dl_se)) { replenish_dl_entity(dl_se); + dl_server_stopped(dl_se); return HRTIMER_NORESTART; } @@ -1572,8 +1575,10 @@ void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) + if (dl_se->dl_runtime) { + dl_se->dl_server_idle = 0; update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + } } void dl_server_start(struct sched_dl_entity *dl_se) @@ -1596,7 +1601,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) setup_new_dl_entity(dl_se); } - if (!dl_se->dl_runtime) + if (!dl_se->dl_runtime || dl_se->dl_server_active) return; dl_se->dl_server_active = 1; @@ -1617,6 +1622,20 @@ void dl_server_stop(struct sched_dl_entity *dl_se) dl_se->dl_server_active = 0; } +static bool dl_server_stopped(struct sched_dl_entity *dl_se) +{ + if (!dl_se->dl_server_active) + return false; + + if (dl_se->dl_server_idle) { + dl_server_stop(dl_se); + return true; + } + + dl_se->dl_server_idle = 1; + return false; +} + void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task) @@ -2354,7 +2373,7 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - if (dl_server_active(dl_se)) { + if (!dl_server_stopped(dl_se)) { dl_se->dl_yielded = 1; update_curr_dl_se(rq, dl_se, 0); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ab0822cc51c2..a1350c513a87 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5802,7 +5802,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long queued_delta, runnable_delta, idle_delta, dequeue = 1; - long rq_h_nr_queued = rq->cfs.h_nr_queued; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5886,10 +5885,6 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ 
sub_nr_running(rq, queued_delta); - - /* Stop the fair server if throttling resulted in no runnable tasks */ - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); done: /* * Note: distribution will already see us throttled via the @@ -6966,7 +6961,6 @@ static void set_next_buddy(struct sched_entity *se); static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { bool was_sched_idle = sched_idle_rq(rq); - int rq_h_nr_queued = rq->cfs.h_nr_queued; bool task_sleep = flags & DEQUEUE_SLEEP; bool task_delayed = flags & DEQUEUE_DELAYED; struct task_struct *p = NULL; @@ -7050,9 +7044,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) sub_nr_running(rq, h_nr_queued); - if (rq_h_nr_queued && !rq->cfs.h_nr_queued) - dl_server_stop(&rq->fair_server); - /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; From 9cdb4fe20cd239c848b5c3f5753d83a9443ba329 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:25 +0200 Subject: [PATCH 58/84] sched/fair: Use protect_slice() instead of direct comparison Replace the test by the relevant protect_slice() function. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dhaval Giani (AMD) Link: https://lkml.kernel.org/r/20250708165630.1948751-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a1350c513a87..43fe5c831dd5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1161,7 +1161,7 @@ static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity if (!sched_feat(PREEMPT_SHORT)) return false; - if (curr->vlag == curr->deadline) + if (protect_slice(curr)) return false; return !entity_eligible(cfs_rq, curr); From 74eec63661d46a7153d04c2e0249eeb76cc76d44 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:26 +0200 Subject: [PATCH 59/84] sched/fair: Fix NO_RUN_TO_PARITY case EEVDF expects the scheduler to allocate a time quantum to the selected entity and then pick a new entity for next quantum. Although this notion of time quantum is not strictly doable in our case, we can ensure a minimum runtime for each task most of the time and pick a new entity after a minimum time has elapsed. Reuse the slice protection of run to parity to ensure such runtime quantum. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-3-vincent.guittot@linaro.org --- include/linux/sched.h | 10 +++++++++- kernel/sched/fair.c | 31 ++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4802fcf738cd..55921385927d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -583,7 +583,15 @@ struct sched_entity { u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; - s64 vlag; + union { + /* + * When !@on_rq this field is vlag. + * When cfs_rq->curr == se (which implies @on_rq) + * this field is vprot. See protect_slice(). 
+ */ + s64 vlag; + u64 vprot; + }; u64 slice; u64 nr_migrations; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 43fe5c831dd5..8d288df79a6b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -882,23 +882,35 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) } /* - * HACK, stash a copy of deadline at the point of pick in vlag, - * which isn't used until dequeue. + * Set the vruntime up to which an entity can run before looking + * for another entity to pick. + * In case of run to parity, we protect the entity up to its deadline. + * When run to parity is disabled, we give a minimum quantum to the running + * entity to ensure progress. */ static inline void set_protect_slice(struct sched_entity *se) { - se->vlag = se->deadline; + u64 slice = se->slice; + u64 vprot = se->deadline; + + if (!sched_feat(RUN_TO_PARITY)) + slice = min(slice, normalized_sysctl_sched_base_slice); + + if (slice != se->slice) + vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se)); + + se->vprot = vprot; } static inline bool protect_slice(struct sched_entity *se) { - return se->vlag == se->deadline; + return ((s64)(se->vprot - se->vruntime) > 0); } static inline void cancel_protect_slice(struct sched_entity *se) { if (protect_slice(se)) - se->vlag = se->deadline + 1; + se->vprot = se->vruntime; } /* @@ -937,7 +949,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) + if (curr && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -1156,11 +1168,8 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) cgroup_account_cputime(p, delta_exec); } -static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) +static inline bool resched_next_slice(struct cfs_rq *cfs_rq, struct sched_entity *curr) { - if (!sched_feat(PREEMPT_SHORT)) - return false; - if (protect_slice(curr)) return false; @@ -1248,7 +1257,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (cfs_rq->nr_queued == 1) return; - if (resched || did_preempt_short(cfs_rq, curr)) { + if (resched || resched_next_slice(cfs_rq, curr)) { resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } From 9de74a9850b9468ac2f515bfbe0844e0bfae869d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:27 +0200 Subject: [PATCH 60/84] sched/fair: Remove spurious shorter slice preemption Even if the waking task can preempt current, it might not be the one selected by pick_task_fair. Check that the waking task will be selected if we cancel the slice protection before doing so. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-4-vincent.guittot@linaro.org --- kernel/sched/fair.c | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8d288df79a6b..96718b3b4bd1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -932,7 +932,7 @@ static inline void cancel_protect_slice(struct sched_entity *se) * * Which allows tree pruning through eligibility. 
*/ -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; struct sched_entity *se = __pick_first_entity(cfs_rq); @@ -949,7 +949,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - if (curr && protect_slice(curr)) + if (curr && protect && protect_slice(curr)) return curr; /* Pick the leftmost entity if it's eligible */ @@ -993,6 +993,11 @@ found: return best; } +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) +{ + return __pick_eevdf(cfs_rq, true); +} + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); @@ -1176,27 +1181,6 @@ static inline bool resched_next_slice(struct cfs_rq *cfs_rq, struct sched_entity return !entity_eligible(cfs_rq, curr); } -static inline bool do_preempt_short(struct cfs_rq *cfs_rq, - struct sched_entity *pse, struct sched_entity *se) -{ - if (!sched_feat(PREEMPT_SHORT)) - return false; - - if (pse->slice >= se->slice) - return false; - - if (!entity_eligible(cfs_rq, pse)) - return false; - - if (entity_before(pse, se)) - return true; - - if (!entity_eligible(cfs_rq, se)) - return true; - - return false; -} - /* * Used by other classes to account runtime. */ @@ -8658,6 +8642,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; + bool do_preempt_short = false; if (unlikely(se == pse)) return; @@ -8706,7 +8691,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * When non-idle entity preempt an idle entity, * don't give idle entity slice protection. */ - cancel_protect_slice(se); + do_preempt_short = true; goto preempt; } @@ -8724,22 +8709,21 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int /* * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. - * - * Note that even if @p does not turn out to be the most eligible - * task at this moment, current's slice protection will be lost. */ - if (do_preempt_short(cfs_rq, pse, se)) - cancel_protect_slice(se); + do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); /* * If @p has become the most eligible task, force preemption. */ - if (pick_eevdf(cfs_rq) == pse) + if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; return; preempt: + if (do_preempt_short) + cancel_protect_slice(se); + resched_curr_lazy(rq); } From 052c3d87c82ea4ee83232b747512847b4e8c9976 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:28 +0200 Subject: [PATCH 61/84] sched/fair: Limit run to parity to the min slice of enqueued entities Run to parity ensures that current will get a chance to run its full slice in one go but this can create large latency and/or lag for entities with shorter slice that have exhausted their previous slice and wait to run their next slice. Clamp the run to parity to the shortest slice of all enqueued entities. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-5-vincent.guittot@linaro.org --- kernel/sched/fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96718b3b4bd1..45e057fc2354 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -884,18 +884,20 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) /* * Set the vruntime up to which an entity can run before looking * for another entity to pick. - * In case of run to parity, we protect the entity up to its deadline. + * In case of run to parity, we use the shortest slice of the enqueued + * entities to set the protected period. * When run to parity is disabled, we give a minimum quantum to the running * entity to ensure progress. */ static inline void set_protect_slice(struct sched_entity *se) { - u64 slice = se->slice; + u64 slice = normalized_sysctl_sched_base_slice; u64 vprot = se->deadline; - if (!sched_feat(RUN_TO_PARITY)) - slice = min(slice, normalized_sysctl_sched_base_slice); + if (sched_feat(RUN_TO_PARITY)) + slice = cfs_rq_min_slice(cfs_rq_of(se)); + slice = min(slice, se->slice); if (slice != se->slice) vprot = min_vruntime(vprot, se->vruntime + calc_delta_fair(slice, se)); From 3a0baa8e6c570c252999cb651398a88f8f990b4a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:29 +0200 Subject: [PATCH 62/84] sched/fair: Fix entity's lag with run to parity When an entity is enqueued without preempting current, we must ensure that the slice protection is updated to take into account the slice duration of the newly enqueued task so that its lag will not exceed its slice (+ tick). Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-6-vincent.guittot@linaro.org --- kernel/sched/fair.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 45e057fc2354..1660960d64af 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -889,13 +889,13 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) * When run to parity is disabled, we give a minimum quantum to the running * entity to ensure progress. 
*/ -static inline void set_protect_slice(struct sched_entity *se) +static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { u64 slice = normalized_sysctl_sched_base_slice; u64 vprot = se->deadline; if (sched_feat(RUN_TO_PARITY)) - slice = cfs_rq_min_slice(cfs_rq_of(se)); + slice = cfs_rq_min_slice(cfs_rq); slice = min(slice, se->slice); if (slice != se->slice) @@ -904,6 +904,13 @@ static inline void set_protect_slice(struct sched_entity *se) se->vprot = vprot; } +static inline void update_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 slice = cfs_rq_min_slice(cfs_rq); + + se->vprot = min_vruntime(se->vprot, se->vruntime + calc_delta_fair(slice, se)); +} + static inline bool protect_slice(struct sched_entity *se) { return ((s64)(se->vprot - se->vruntime) > 0); @@ -5467,7 +5474,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) __dequeue_entity(cfs_rq, se); update_load_avg(cfs_rq, se, UPDATE_TG); - set_protect_slice(se); + set_protect_slice(cfs_rq, se); } update_stats_curr_start(cfs_rq, se); @@ -8720,6 +8727,9 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) goto preempt; + if (sched_feat(RUN_TO_PARITY) && do_preempt_short) + update_protect_slice(cfs_rq, se); + return; preempt: From 0b9ca2dcabc3c8816a6ee75599cab7bef3330609 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 8 Jul 2025 18:56:30 +0200 Subject: [PATCH 63/84] sched/fair: Always trigger resched at the end of a protected period Always trigger a resched after a protected period even if the entity is still eligible. It can happen that an entity remains eligible at the end of the protected period but must let an entity with a shorter slice to run in order to keep its lag shorter than slice. This is particulalry true with run to parity which tries to maximize the lag. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250708165630.1948751-7-vincent.guittot@linaro.org --- kernel/sched/fair.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1660960d64af..20a845697c1d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1182,14 +1182,6 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) cgroup_account_cputime(p, delta_exec); } -static inline bool resched_next_slice(struct cfs_rq *cfs_rq, struct sched_entity *curr) -{ - if (protect_slice(curr)) - return false; - - return !entity_eligible(cfs_rq, curr); -} - /* * Used by other classes to account runtime. */ @@ -1250,7 +1242,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (cfs_rq->nr_queued == 1) return; - if (resched || resched_next_slice(cfs_rq, curr)) { + if (resched || !protect_slice(curr)) { resched_curr_lazy(rq); clear_buddies(cfs_rq, curr); } From 2885daf47081dd1aaf1a588e9d001eb343df1f90 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 10 Jul 2025 10:27:48 +0200 Subject: [PATCH 64/84] lib/smp_processor_id: Make migration check unconditional of SMP Commit cac5cefbade90 ("sched/smp: Make SMP unconditional") migrate_disable() even on UP builds. Commit 06ddd17521bf1 ("sched/smp: Always define is_percpu_thread() and scheduler_ipi()") made is_percpu_thread() check the affinity mask instead replying always true for UP mask. 
As a consequence smp_processor_id() now complains if invoked within a migrate_disable() section because is_percpu_thread() checks its mask and the migration check is left out. Make migration check unconditional of SMP. Fixes: cac5cefbade90 ("sched/smp: Make SMP unconditional") Closes: https://lore.kernel.org/oe-lkp/202507100448.6b88d6f1-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Chen Yu Link: https://lore.kernel.org/r/20250710082748.-DPO1rjO@linutronix.de --- lib/smp_processor_id.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index a2bb7738c373..94b3f6b19538 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -22,10 +22,8 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) if (is_percpu_thread()) goto out; -#ifdef CONFIG_SMP if (current->migration_disabled) goto out; -#endif /* * It is valid to assume CPU-locality during early bootup: From 9f239df55546ee1d28f0976130136ffd1cad0fd7 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 27 Jun 2025 13:51:14 +0200 Subject: [PATCH 65/84] sched/deadline: Initialize dl_servers after SMP dl-servers are currently initialized too early at boot when CPUs are not fully up (only boot CPU is). This results in miscalculation of per runqueue DEADLINE variables like extra_bw (which needs a stable CPU count). Move initialization of dl-servers later on after SMP has been initialized and CPUs are all online, so that CPU count is stable and DEADLINE variables can be computed correctly. Fixes: d741f297bceaf ("sched/fair: Fair server interface") Reported-by: Marcel Ziswiler Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Tested-by: Marcel Ziswiler # nuc & rock5b Link: https://lore.kernel.org/r/20250627115118.438797-2-juri.lelli@redhat.com --- kernel/sched/core.c | 2 ++ kernel/sched/deadline.c | 48 +++++++++++++++++++++++++---------------- kernel/sched/sched.h | 1 + 3 files changed, 33 insertions(+), 18 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2f8caa9db78d..89b3ed637465 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8371,6 +8371,8 @@ void __init sched_init_smp(void) init_sched_rt_class(); init_sched_dl_class(); + sched_init_dl_servers(); + sched_smp_initialized = true; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 23668fc60bd3..0d25553f2c17 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -761,6 +761,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); + update_rq_clock(rq); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); @@ -1585,23 +1587,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) { struct rq *rq = dl_se->rq; - /* - * XXX: the apply do not work fine at the init phase for the - * fair server because things are not yet set. We need to improve - * this before getting generic. 
- */ - if (!dl_server(dl_se)) { - u64 runtime = 50 * NSEC_PER_MSEC; - u64 period = 1000 * NSEC_PER_MSEC; - - dl_server_apply_params(dl_se, runtime, period, 1); - - dl_se->dl_server = 1; - dl_se->dl_defer = 1; - setup_new_dl_entity(dl_se); - } - - if (!dl_se->dl_runtime || dl_se->dl_server_active) + if (!dl_server(dl_se) || dl_se->dl_server_active) return; dl_se->dl_server_active = 1; @@ -1612,7 +1598,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) void dl_server_stop(struct sched_dl_entity *dl_se) { - if (!dl_se->dl_runtime) + if (!dl_server(dl_se) || !dl_server_active(dl_se)) return; dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); @@ -1645,6 +1631,32 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_se->server_pick_task = pick_task; } +void sched_init_dl_servers(void) +{ + int cpu; + struct rq *rq; + struct sched_dl_entity *dl_se; + + for_each_online_cpu(cpu) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + rq = cpu_rq(cpu); + + guard(rq_lock_irq)(rq); + + dl_se = &rq->fair_server; + + WARN_ON(dl_server(dl_se)); + + dl_server_apply_params(dl_se, runtime, period, 1); + + dl_se->dl_server = 1; + dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } +} + void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) { u64 new_bw = dl_se->dl_bw; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 105190b18020..3058fb6246da 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -385,6 +385,7 @@ extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, dl_server_pick_f pick_task); +extern void sched_init_dl_servers(void); extern void dl_server_update_idle_time(struct rq *rq, struct task_struct *p); From fcc9276c4d331cd1fe9319d793e80b02e09727f5 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 27 Jun 2025 13:51:15 +0200 Subject: [PATCH 66/84] sched/deadline: Reset extra_bw to max_bw when clearing root domains dl_clear_root_domain() doesn't take into account the fact that per-rq extra_bw variables retain values computed before root domain changes, resulting in broken accounting. Fix it by resetting extra_bw to max_bw before restoring back dl-servers contributions. Fixes: 2ff899e351643 ("sched/deadline: Rebuild root domain accounting after every update") Reported-by: Marcel Ziswiler Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # nuc & rock5b Link: https://lore.kernel.org/r/20250627115118.438797-3-juri.lelli@redhat.com --- kernel/sched/deadline.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0d25553f2c17..0abffe3860e3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2924,7 +2924,14 @@ void dl_clear_root_domain(struct root_domain *rd) int i; guard(raw_spinlock_irqsave)(&rd->dl_bw.lock); + + /* + * Reset total_bw to zero and extra_bw to max_bw so that next + * loop will add dl-servers contributions back properly, + */ rd->dl_bw.total_bw = 0; + for_each_cpu(i, rd->span) + cpu_rq(i)->dl.extra_bw = cpu_rq(i)->dl.max_bw; /* * dl_servers are not tasks. 
Since dl_add_task_root_domain ignores From 440989c10f4e32620e9e2717ca52c3ed7ae11048 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 27 Jun 2025 13:51:16 +0200 Subject: [PATCH 67/84] sched/deadline: Fix accounting after global limits change A global limits change (sched_rt_handler() logic) currently leaves stale and/or incorrect values in variables related to accounting (e.g. extra_bw). Properly clean up per runqueue variables before implementing the change and rebuild scheduling domains (so that accounting is also properly restored) after such a change is complete. Reported-by: Marcel Ziswiler Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # nuc & rock5b Link: https://lore.kernel.org/r/20250627115118.438797-4-juri.lelli@redhat.com --- kernel/sched/deadline.c | 4 +++- kernel/sched/rt.c | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0abffe3860e3..9c7d9528d5fa 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -3183,6 +3183,9 @@ void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); + for_each_possible_cpu(cpu) + init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); + for_each_possible_cpu(cpu) { rcu_read_lock_sched(); @@ -3198,7 +3201,6 @@ void sched_dl_do_global(void) raw_spin_unlock_irqrestore(&dl_b->lock, flags); rcu_read_unlock_sched(); - init_dl_rq_bw_ratio(&cpu_rq(cpu)->dl); } } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 15d5855c542c..be6e9bcbe82b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2886,6 +2886,12 @@ undo: sched_domains_mutex_unlock(); mutex_unlock(&mutex); + /* + * After changing maximum available bandwidth for DEADLINE, we need to + * recompute per root domain and per cpus variables accordingly. + */ + rebuild_sched_domains(); + return ret; } From 9fdb12c88e9ba75e2d831fb397dd27f03a534968 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 27 Jun 2025 13:51:17 +0200 Subject: [PATCH 68/84] tools/sched: Add root_domains_dump.py which dumps root domains info Root domains information is somewhat hard to access at runtime. Even with sched_debug and sched_verbose, such information is only printed on kernel console when domains are modified. Add a simple drgn script to more easily retrieve root domains information at runtime. Since tools/sched is a new directory, add it to MAINTAINERS as well. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # nuc & rock5b Link: https://lore.kernel.org/r/20250627115118.438797-5-juri.lelli@redhat.com --- MAINTAINERS | 1 + tools/sched/root_domains_dump.py | 68 ++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 tools/sched/root_domains_dump.py diff --git a/MAINTAINERS b/MAINTAINERS index a92290fffa16..b986a49383c9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22165,6 +22165,7 @@ F: include/linux/wait.h F: include/uapi/linux/sched.h F: kernel/fork.c F: kernel/sched/ +F: tools/sched/ SCHEDULER - SCHED_EXT R: Tejun Heo diff --git a/tools/sched/root_domains_dump.py b/tools/sched/root_domains_dump.py new file mode 100644 index 000000000000..56dc91f017b2 --- /dev/null +++ b/tools/sched/root_domains_dump.py @@ -0,0 +1,68 @@ +#!/usr/bin/env drgn +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 Juri Lelli +# Copyright (C) 2025 Red Hat, Inc. + +desc = """ +This is a drgn script to show the current root domains configuration. 
For more +info on drgn, visit https://github.com/osandov/drgn. + +Root domains are only printed once, as multiple CPUs might be attached to the +same root domain. +""" + +import os +import argparse + +import drgn +from drgn import FaultError +from drgn.helpers.common import * +from drgn.helpers.linux import * + +def print_root_domains_info(): + + # To store unique root domains found + seen_root_domains = set() + + print("Retrieving (unique) Root Domain Information:") + + runqueues = prog['runqueues'] + def_root_domain = prog['def_root_domain'] + + for cpu_id in for_each_possible_cpu(prog): + try: + rq = per_cpu(runqueues, cpu_id) + + root_domain = rq.rd + + # Check if we've already processed this root domain to avoid duplicates + # Use the memory address of the root_domain as a unique identifier + root_domain_cast = int(root_domain) + if root_domain_cast in seen_root_domains: + continue + seen_root_domains.add(root_domain_cast) + + if root_domain_cast == int(def_root_domain.address_): + print(f"\n--- Root Domain @ def_root_domain ---") + else: + print(f"\n--- Root Domain @ 0x{root_domain_cast:x} ---") + + print(f" From CPU: {cpu_id}") # This CPU belongs to this root domain + + # Access and print relevant fields from struct root_domain + print(f" Span : {cpumask_to_cpulist(root_domain.span[0])}") + print(f" Online : {cpumask_to_cpulist(root_domain.span[0])}") + + except drgn.FaultError as fe: + print(f" (CPU {cpu_id}: Fault accessing kernel memory: {fe})") + except AttributeError as ae: + print(f" (CPU {cpu_id}: Missing attribute for root_domain (kernel struct change?): {ae})") + except Exception as e: + print(f" (CPU {cpu_id}: An unexpected error occurred: {e})") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) + args = parser.parse_args() + + print_root_domains_info() From 634c24068abf8f325e520e663250e4a32a95ea0e Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 27 Jun 2025 13:51:18 +0200 Subject: [PATCH 69/84] tools/sched: Add dl_bw_dump.py for printing bandwidth accounting info dl_rq bandwidth accounting information is crucial for the correct functioning of SCHED_DEADLINE. Add a drgn script for accessing that information at runtime, so that it's easier to check and debug issues related to it. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Marcel Ziswiler # nuc & rock5b Link: https://lore.kernel.org/r/20250627115118.438797-6-juri.lelli@redhat.com --- tools/sched/dl_bw_dump.py | 57 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 tools/sched/dl_bw_dump.py diff --git a/tools/sched/dl_bw_dump.py b/tools/sched/dl_bw_dump.py new file mode 100644 index 000000000000..aae4e42b1769 --- /dev/null +++ b/tools/sched/dl_bw_dump.py @@ -0,0 +1,57 @@ +#!/usr/bin/env drgn +# SPDX-License-Identifier: GPL-2.0 +# Copyright (C) 2025 Juri Lelli +# Copyright (C) 2025 Red Hat, Inc. + +desc = """ +This is a drgn script to show dl_rq bandwidth accounting information. For more +info on drgn, visit https://github.com/osandov/drgn. + +Only online CPUs are reported. 
+""" + +import os +import argparse + +import drgn +from drgn import FaultError +from drgn.helpers.common import * +from drgn.helpers.linux import * + +def print_dl_bws_info(): + + print("Retrieving dl_rq bandwidth accounting information:") + + runqueues = prog['runqueues'] + + for cpu_id in for_each_possible_cpu(prog): + try: + rq = per_cpu(runqueues, cpu_id) + + if rq.online == 0: + continue + + dl_rq = rq.dl + + print(f" From CPU: {cpu_id}") + + # Access and print relevant fields from struct dl_rq + print(f" running_bw : {dl_rq.running_bw}") + print(f" this_bw : {dl_rq.this_bw}") + print(f" extra_bw : {dl_rq.extra_bw}") + print(f" max_bw : {dl_rq.max_bw}") + print(f" bw_ratio : {dl_rq.bw_ratio}") + + except drgn.FaultError as fe: + print(f" (CPU {cpu_id}: Fault accessing kernel memory: {fe})") + except AttributeError as ae: + print(f" (CPU {cpu_id}: Missing attribute for root_domain (kernel struct change?): {ae})") + except Exception as e: + print(f" (CPU {cpu_id}: An unexpected error occurred: {e})") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) + args = parser.parse_args() + + print_dl_bws_info() From e075f4360931263f5ec006ea5dadc065e5e98eb8 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 10 Jul 2025 18:57:07 +0800 Subject: [PATCH 70/84] smpboot: introduce SDTL_INIT() helper to tidy sched topology setup Define a small SDTL_INIT(maskfn, flagsfn, name) macro and use it to build the sched_domain_topology_level array. Purely a cleanup; behaviour is unchanged. Suggested-by: Thomas Gleixner Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-2-me@linux.beauty --- arch/powerpc/kernel/smp.c | 25 ++++++++++--------------- arch/s390/kernel/topology.c | 10 +++++----- arch/x86/kernel/smpboot.c | 21 ++++++--------------- include/linux/sched/topology.h | 4 ++-- kernel/sched/topology.c | 24 ++++++++---------------- 5 files changed, 31 insertions(+), 53 deletions(-) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 5ac7084eebc0..f59e4b9cc207 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1700,28 +1700,23 @@ static void __init build_sched_topology(void) #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); - powerpc_topology[i++] = (struct sched_domain_topology_level){ - smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) - }; + powerpc_topology[i++] = + SDTL_INIT(smallcore_smt_mask, powerpc_smt_flags, SMT); } else { - powerpc_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) - }; + powerpc_topology[i++] = SDTL_INIT(cpu_smt_mask, powerpc_smt_flags, SMT); } #endif if (shared_caches) { - powerpc_topology[i++] = (struct sched_domain_topology_level){ - shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) - }; + powerpc_topology[i++] = + SDTL_INIT(shared_cache_mask, powerpc_shared_cache_flags, CACHE); } + if (has_coregroup_support()) { - powerpc_topology[i++] = (struct sched_domain_topology_level){ - cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) - }; + powerpc_topology[i++] = + SDTL_INIT(cpu_mc_mask, powerpc_shared_proc_flags, MC); } - powerpc_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) - }; + + powerpc_topology[i++] = SDTL_INIT(cpu_cpu_mask, 
powerpc_shared_proc_flags, PKG); /* There must be one trailing NULL entry left. */ BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 3df048e190b1..46569b8e47dd 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -531,11 +531,11 @@ static const struct cpumask *cpu_drawer_mask(int cpu) } static struct sched_domain_topology_level s390_topology[] = { - { cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, - { cpu_book_mask, SD_INIT_NAME(BOOK) }, - { cpu_drawer_mask, SD_INIT_NAME(DRAWER) }, - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(cpu_thread_mask, cpu_smt_flags, SMT), + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), + SDTL_INIT(cpu_book_mask, NULL, BOOK), + SDTL_INIT(cpu_drawer_mask, NULL, DRAWER), + SDTL_INIT(cpu_cpu_mask, NULL, PKG), { NULL, }, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fc78c2325fd2..e0adf75f617a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -485,35 +485,26 @@ static void __init build_sched_topology(void) int i = 0; #ifdef CONFIG_SCHED_SMT - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) - }; + x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT); #endif #ifdef CONFIG_SCHED_CLUSTER - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) - }; + x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS); #endif #ifdef CONFIG_SCHED_MC - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) - }; + x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC); #endif /* * When there is NUMA topology inside the package skip the PKG domain * since the NUMA domains will auto-magically create the right spanning * domains based on the SLIT. */ - if (!x86_has_numa_in_package) { - x86_topology[i++] = (struct sched_domain_topology_level){ - cpu_cpu_mask, x86_sched_itmt_flags, SD_INIT_NAME(PKG) - }; - } + if (!x86_has_numa_in_package) + x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG); /* * There must be one trailing NULL entry left. 
*/ - BUG_ON(i >= ARRAY_SIZE(x86_topology)-1); + BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1); set_sched_topology(x86_topology); } diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index e54e7fa76ba6..0d5daaa277b7 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -196,8 +196,8 @@ struct sched_domain_topology_level { extern void __init set_sched_topology(struct sched_domain_topology_level *tl); extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); - -# define SD_INIT_NAME(type) .name = #type +#define SDTL_INIT(maskfn, flagsfn, dname) ((struct sched_domain_topology_level) \ + { .mask = maskfn, .sd_flags = flagsfn, .name = #dname }) #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern void rebuild_sched_domains_energy(void); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8e06b1d22e91..d01f5a49f2e7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1737,17 +1737,17 @@ sd_init(struct sched_domain_topology_level *tl, */ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), #endif #ifdef CONFIG_SCHED_CLUSTER - { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, + SDTL_INIT(cpu_clustergroup_mask, cpu_cluster_flags, CLS), #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + SDTL_INIT(cpu_coregroup_mask, cpu_core_flags, MC), #endif - { cpu_cpu_mask, SD_INIT_NAME(PKG) }, + SDTL_INIT(cpu_cpu_mask, NULL, PKG), { NULL, }, }; @@ -2008,23 +2008,15 @@ void sched_init_numa(int offline_node) /* * Add the NUMA identity distance, aka single NODE. */ - tl[i++] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .numa_level = 0, - SD_INIT_NAME(NODE) - }; + tl[i++] = SDTL_INIT(sd_numa_mask, NULL, NODE); /* * .. and append 'j' levels of NUMA goodness. */ for (j = 1; j < nr_levels; i++, j++) { - tl[i] = (struct sched_domain_topology_level){ - .mask = sd_numa_mask, - .sd_flags = cpu_numa_flags, - .flags = SDTL_OVERLAP, - .numa_level = j, - SD_INIT_NAME(NUMA) - }; + tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); + tl[i].numa_level = j; + tl[i].flags = SDTL_OVERLAP; } sched_domain_topology_saved = sched_domain_topology; From 992de2b02509bed68f693ea5a68b07cd586197b7 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 10 Jul 2025 18:57:08 +0800 Subject: [PATCH 71/84] x86/smpboot: remove redundant CONFIG_SCHED_SMT On x86 CONFIG_SCHED_SMT is default y if SMP is enabled, so let's simply drop CONFIG_SCHED_SMT. 
Suggested-by: Thomas Gleixner Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-3-me@linux.beauty --- arch/x86/kernel/smpboot.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e0adf75f617a..30e6df2b4db6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -484,9 +484,7 @@ static void __init build_sched_topology(void) { int i = 0; -#ifdef CONFIG_SCHED_SMT x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT); -#endif #ifdef CONFIG_SCHED_CLUSTER x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS); #endif From fbc2010d92e595dc13d8048db2419f963c8cb25e Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 10 Jul 2025 18:57:09 +0800 Subject: [PATCH 72/84] x86/smpboot: moves x86_topology to static initialize and truncate The #ifdeffery and the initializers in build_sched_topology() are just disgusting. Statically initialize the domain levels in the topology array and let build_sched_topology() invalidate the package domain level when NUMA in package is available. Suggested-by: Thomas Gleixner Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-4-me@linux.beauty --- arch/x86/kernel/smpboot.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 30e6df2b4db6..8038286454cf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -478,32 +478,30 @@ static int x86_cluster_flags(void) */ static bool x86_has_numa_in_package; -static struct sched_domain_topology_level x86_topology[6]; +static struct sched_domain_topology_level x86_topology[] = { + SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT), +#ifdef CONFIG_SCHED_CLUSTER + SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS), +#endif +#ifdef CONFIG_SCHED_MC + SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC), +#endif + SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG), + { NULL }, +}; static void __init build_sched_topology(void) { - int i = 0; - - x86_topology[i++] = SDTL_INIT(cpu_smt_mask, cpu_smt_flags, SMT); -#ifdef CONFIG_SCHED_CLUSTER - x86_topology[i++] = SDTL_INIT(cpu_clustergroup_mask, x86_cluster_flags, CLS); -#endif -#ifdef CONFIG_SCHED_MC - x86_topology[i++] = SDTL_INIT(cpu_coregroup_mask, x86_core_flags, MC); -#endif /* - * When there is NUMA topology inside the package skip the PKG domain - * since the NUMA domains will auto-magically create the right spanning - * domains based on the SLIT. + * When there is NUMA topology inside the package invalidate the + * PKG domain since the NUMA domains will auto-magically create the + * right spanning domains based on the SLIT. */ - if (!x86_has_numa_in_package) - x86_topology[i++] = SDTL_INIT(cpu_cpu_mask, x86_sched_itmt_flags, PKG); - - /* - * There must be one trailing NULL entry left. 
- */ - BUG_ON(i >= ARRAY_SIZE(x86_topology) - 1); + if (x86_has_numa_in_package) { + unsigned int pkgdom = ARRAY_SIZE(x86_topology) - 2; + memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); + } set_sched_topology(x86_topology); } From f79c9aa446d638190578515afcd06d6c9d72da55 Mon Sep 17 00:00:00 2001 From: Li Chen Date: Thu, 10 Jul 2025 18:57:10 +0800 Subject: [PATCH 73/84] x86/smpboot: avoid SMT domain attach/destroy if SMT is not enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the SMT domain is added into sched_domain_topology by default. If cpu_attach_domain() finds that the CPU SMT domain’s cpumask_weight is just 1, it will destroy it. On a large machine, such as one with 512 cores, this results in 512 redundant domain attach/destroy operations. Avoid these unnecessary operations by simply checking cpu_smt_num_threads and skip SMT domain if the SMT domain is not enabled. Suggested-by: K Prateek Nayak Signed-off-by: Li Chen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: K Prateek Nayak Tested-by: K Prateek Nayak Link: https://lore.kernel.org/r/20250710105715.66594-5-me@linux.beauty --- arch/x86/kernel/smpboot.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8038286454cf..412c813cd268 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -492,6 +492,8 @@ static struct sched_domain_topology_level x86_topology[] = { static void __init build_sched_topology(void) { + struct sched_domain_topology_level *topology = x86_topology; + /* * When there is NUMA topology inside the package invalidate the * PKG domain since the NUMA domains will auto-magically create the @@ -502,7 +504,15 @@ static void __init build_sched_topology(void) memset(&x86_topology[pkgdom], 0, sizeof(x86_topology[pkgdom])); } - set_sched_topology(x86_topology); + + /* + * Drop the SMT domains if there is only one thread per-core + * since it'll get degenerated by the scheduler anyways. + */ + if (cpu_smt_num_threads <= 1) + ++topology; + + set_sched_topology(topology); } void set_cpu_sibling_map(int cpu) From 1eec89a671413ce38df9fe9e70f5130a9eb79a59 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Fri, 11 Jul 2025 11:20:30 +0530 Subject: [PATCH 74/84] sched/topology: Remove sched_domain_topology_level::flags Support for overlapping domains added in commit e3589f6c81e4 ("sched: Allow for overlapping sched_domain spans") also allowed forcefully setting SD_OVERLAP for !NUMA domains via FORCE_SD_OVERLAP sched_feat(). Since NUMA domains had to be presumed overlapping to ensure correct behavior, "sched_domain_topology_level::flags" was introduced. NUMA domains added the SDTL_OVERLAP flag would ensure SD_OVERLAP was always added during build_sched_domains() for these domains, even when FORCE_SD_OVERLAP was off. Condition for adding the SD_OVERLAP flag at the aforementioned commit was as follows: if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) sd->flags |= SD_OVERLAP; The FORCE_SD_OVERLAP debug feature was removed in commit af85596c74de ("sched/topology: Remove FORCE_SD_OVERLAP") which left the NUMA domains as the exclusive users of SDTL_OVERLAP, SD_OVERLAP, and SD_NUMA flags. Get rid of SDTL_OVERLAP and SD_OVERLAP as they have become redundant and instead rely on SD_NUMA to detect the only overlapping domain currently supported. 
Since SDTL_OVERLAP was the only user of "tl->flags", get rid of "sched_domain_topology_level::flags" too. Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/ba4dbdf8-bc37-493d-b2e0-2efb00ea3e19@amd.com --- include/linux/sched/sd_flags.h | 8 -------- include/linux/sched/topology.h | 3 --- kernel/sched/fair.c | 6 +++--- kernel/sched/topology.c | 19 ++++++++++--------- 4 files changed, 13 insertions(+), 23 deletions(-) diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index b04a5d04dee9..42839cfa2778 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -153,14 +153,6 @@ SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS) */ SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS) -/* - * sched_groups of this level overlap - * - * SHARED_PARENT: Set for all NUMA levels above NODE. - * NEEDS_GROUPS: Overlaps can only exist with more than one group. - */ -SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) - /* * Cross-node balancing * diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 0d5daaa277b7..5263746b63e8 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -175,8 +175,6 @@ bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); -#define SDTL_OVERLAP 0x01 - struct sd_data { struct sched_domain *__percpu *sd; struct sched_domain_shared *__percpu *sds; @@ -187,7 +185,6 @@ struct sd_data { struct sched_domain_topology_level { sched_domain_mask_f mask; sched_domain_flags_f sd_flags; - int flags; int numa_level; struct sd_data data; char *name; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20a845697c1d..b9b4bbbf0af6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9926,9 +9926,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu) min_capacity = ULONG_MAX; max_capacity = 0; - if (child->flags & SD_OVERLAP) { + if (child->flags & SD_NUMA) { /* - * SD_OVERLAP domains cannot assume that child groups + * SD_NUMA domains cannot assume that child groups * span the current group. */ @@ -9941,7 +9941,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } } else { /* - * !SD_OVERLAP domains can assume that child groups + * !SD_NUMA domains can assume that child groups * span the current group. */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index d01f5a49f2e7..977e133bb8a4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -89,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!(sd->flags & SD_OVERLAP) && + if (!(sd->flags & SD_NUMA) && cpumask_intersects(groupmask, sched_group_span(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: repeated CPUs\n"); @@ -102,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, group->sgc->id, cpumask_pr_args(sched_group_span(group))); - if ((sd->flags & SD_OVERLAP) && + if ((sd->flags & SD_NUMA) && !cpumask_equal(group_balance_mask(group), sched_group_span(group))) { printk(KERN_CONT " mask=%*pbl", cpumask_pr_args(group_balance_mask(group))); @@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" * which is shared by all the overlapping groups. 
*/ - WARN_ON_ONCE(sd->flags & SD_OVERLAP); + WARN_ON_ONCE(sd->flags & SD_NUMA); sg = sd->groups; if (cpu != sg->asym_prefer_cpu) { @@ -2016,7 +2016,6 @@ void sched_init_numa(int offline_node) for (j = 1; j < nr_levels; i++, j++) { tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA); tl[i].numa_level = j; - tl[i].flags = SDTL_OVERLAP; } sched_domain_topology_saved = sched_domain_topology; @@ -2327,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map) if (sdd->sd) { sd = *per_cpu_ptr(sdd->sd, j); - if (sd && (sd->flags & SD_OVERLAP)) + if (sd && (sd->flags & SD_NUMA)) free_sched_groups(sd->groups, 0); kfree(*per_cpu_ptr(sdd->sd, j)); } @@ -2393,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map) id_seen = sched_domains_tmpmask2; for_each_sd_topology(tl) { + int tl_common_flags = 0; + + if (tl->sd_flags) + tl_common_flags = (*tl->sd_flags)(); /* NUMA levels are allowed to overlap */ - if (tl->flags & SDTL_OVERLAP) + if (tl_common_flags & SD_NUMA) continue; cpumask_clear(covered); @@ -2466,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att if (tl == sched_domain_topology) *per_cpu_ptr(d.sd, i) = sd; - if (tl->flags & SDTL_OVERLAP) - sd->flags |= SD_OVERLAP; if (cpumask_equal(cpu_map, sched_domain_span(sd))) break; } @@ -2480,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att for_each_cpu(i, cpu_map) { for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { sd->span_weight = cpumask_weight(sched_domain_span(sd)); - if (sd->flags & SD_OVERLAP) { + if (sd->flags & SD_NUMA) { if (build_overlap_sched_groups(sd, i)) goto error; } else { From 25c411fce735dda29de26f58d3fce52d4824380c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 12 Jul 2025 03:33:42 +0000 Subject: [PATCH 75/84] sched: Add CONFIG_SCHED_PROXY_EXEC & boot argument to enable/disable Add a CONFIG_SCHED_PROXY_EXEC option, along with a boot argument sched_proxy_exec= that can be used to disable the feature at boot time if CONFIG_SCHED_PROXY_EXEC was enabled. Also uses this option to allow the rq->donor to be different from rq->curr. Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-2-jstultz@google.com --- .../admin-guide/kernel-parameters.txt | 5 ++++ include/linux/sched.h | 13 +++++++++ init/Kconfig | 12 ++++++++ kernel/sched/core.c | 29 +++++++++++++++++++ kernel/sched/sched.h | 12 ++++++++ 5 files changed, 71 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 07e22ba5bfe3..00b835717860 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6387,6 +6387,11 @@ sa1100ir [NET] See drivers/net/irda/sa1100_ir.c. + sched_proxy_exec= [KNL] + Enables or disables "proxy execution" style + solution to mutex-based priority inversion. + Format: + sched_verbose [KNL,EARLY] Enables verbose scheduler debug messages. schedstats= [KNL,X86] Enable or disable scheduled statistics. 
diff --git a/include/linux/sched.h b/include/linux/sched.h index 54a91261e99b..f225b6b1baa3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1656,6 +1656,19 @@ struct task_struct { randomized_struct_fields_end } __attribute__ ((aligned (64))); +#ifdef CONFIG_SCHED_PROXY_EXEC +DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); +static inline bool sched_proxy_exec(void) +{ + return static_branch_likely(&__sched_proxy_exec); +} +#else +static inline bool sched_proxy_exec(void) +{ + return false; +} +#endif + #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) diff --git a/init/Kconfig b/init/Kconfig index 965699c0a6d9..24dd42d3808d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -878,6 +878,18 @@ config UCLAMP_BUCKETS_COUNT If in doubt, use the default value. +config SCHED_PROXY_EXEC + bool "Proxy Execution" + # Avoid some build failures w/ PREEMPT_RT until it can be fixed + depends on !PREEMPT_RT + # Need to investigate how to inform sched_ext of split contexts + depends on !SCHED_CLASS_EXT + # Not particularly useful until we get to multi-rq proxying + depends on EXPERT + help + This option enables proxy execution, a mechanism for mutex-owning + tasks to inherit the scheduling context of higher priority waiters. + endmenu # diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e9c8bda84d80..dd9f5c08b563 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -119,6 +119,35 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#ifdef CONFIG_SCHED_PROXY_EXEC +DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); +static int __init setup_proxy_exec(char *str) +{ + bool proxy_enable = true; + + if (*str && kstrtobool(str + 1, &proxy_enable)) { + pr_warn("Unable to parse sched_proxy_exec=\n"); + return 0; + } + + if (proxy_enable) { + pr_info("sched_proxy_exec enabled via boot arg\n"); + static_branch_enable(&__sched_proxy_exec); + } else { + pr_info("sched_proxy_exec disabled via boot arg\n"); + static_branch_disable(&__sched_proxy_exec); + } + return 1; +} +#else +static int __init setup_proxy_exec(char *str) +{ + pr_warn("CONFIG_SCHED_PROXY_EXEC=n, so it cannot be enabled or disabled at boot time\n"); + return 0; +} +#endif +__setup("sched_proxy_exec", setup_proxy_exec); + /* * Debugging: various feature bits * diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ac953fad8c21..e53d0b87f780 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1142,10 +1142,15 @@ struct rq { */ unsigned long nr_uninterruptible; +#ifdef CONFIG_SCHED_PROXY_EXEC + struct task_struct __rcu *donor; /* Scheduling context */ + struct task_struct __rcu *curr; /* Execution context */ +#else union { struct task_struct __rcu *donor; /* Scheduler context */ struct task_struct __rcu *curr; /* Execution context */ }; +#endif struct sched_dl_entity *dl_server; struct task_struct *idle; struct task_struct *stop; @@ -1326,10 +1331,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() raw_cpu_ptr(&runqueues) +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline void rq_set_donor(struct rq *rq, struct task_struct *t) +{ + rcu_assign_pointer(rq->donor, t); +} +#else static inline void rq_set_donor(struct rq *rq, struct task_struct *t) { /* Do nothing */ } +#endif #ifdef CONFIG_SCHED_CORE static inline struct cpumask *sched_group_span(struct sched_group *sg); From 44e4e0297c3c01987399bb9973f4d22a096a62c2 Mon Sep 17 00:00:00 2001 From: Peter 
Zijlstra Date: Sat, 12 Jul 2025 03:33:43 +0000 Subject: [PATCH 76/84] locking/mutex: Rework task_struct::blocked_on Track the blocked-on relation for mutexes, to allow following this relation at schedule time. task | blocked-on v mutex | owner v task This all will be used for tracking blocked-task/mutex chains with the prox-execution patch in a similar fashion to how priority inheritance is done with rt_mutexes. For serialization, blocked-on is only set by the task itself (current). And both when setting or clearing (potentially by others), is done while holding the mutex::wait_lock. [minor changes while rebasing] [jstultz: Fix blocked_on tracking in __mutex_lock_common in error paths] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Juri Lelli Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-3-jstultz@google.com --- include/linux/sched.h | 5 +---- kernel/fork.c | 3 +-- kernel/locking/mutex-debug.c | 9 +++++---- kernel/locking/mutex.c | 22 ++++++++++++++++++++++ kernel/locking/ww_mutex.h | 18 ++++++++++++++++-- 5 files changed, 45 insertions(+), 12 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index f225b6b1baa3..33ad240ec900 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1230,10 +1230,7 @@ struct task_struct { struct rt_mutex_waiter *pi_blocked_on; #endif -#ifdef CONFIG_DEBUG_MUTEXES - /* Mutex deadlock detection: */ - struct mutex_waiter *blocked_on; -#endif + struct mutex *blocked_on; /* lock we're blocked on */ #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* diff --git a/kernel/fork.c b/kernel/fork.c index 1ee8eb11f38b..5f87f05aff4a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2123,9 +2123,8 @@ __latent_entropy struct task_struct *copy_process( lockdep_init_task(p); #endif -#ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ -#endif + #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 6e6f6071cfa2..758b7a6792b0 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -53,17 +53,18 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, { lockdep_assert_held(&lock->wait_lock); - /* Mark the current thread as blocked on the lock: */ - task->blocked_on = waiter; + /* Current thread can't be already blocked (since it's executing!) 
*/ + DEBUG_LOCKS_WARN_ON(task->blocked_on); } void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { + struct mutex *blocked_on = READ_ONCE(task->blocked_on); + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); - DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); - task->blocked_on = NULL; + DEBUG_LOCKS_WARN_ON(blocked_on && blocked_on != lock); INIT_LIST_HEAD(&waiter->list); waiter->task = NULL; diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a39ecccbd106..e2f59863a866 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -644,6 +644,8 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } + WARN_ON(current->blocked_on); + current->blocked_on = lock; set_current_state(state); trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { @@ -680,6 +682,12 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas first = __mutex_waiter_is_first(lock, &waiter); + /* + * As we likely have been woken up by task + * that has cleared our blocked_on state, re-set + * it to the lock we are trying to aquire. + */ + current->blocked_on = lock; set_current_state(state); /* * Here we order against unlock; we must either see it change @@ -691,8 +699,11 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (first) { trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); + /* clear blocked_on as mutex_optimistic_spin may schedule() */ + current->blocked_on = NULL; if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) break; + current->blocked_on = lock; trace_contention_begin(lock, LCB_F_MUTEX); } @@ -700,6 +711,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } raw_spin_lock_irqsave(&lock->wait_lock, flags); acquired: + current->blocked_on = NULL; __set_current_state(TASK_RUNNING); if (ww_ctx) { @@ -729,9 +741,11 @@ skip_wait: return 0; err: + current->blocked_on = NULL; __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: + WARN_ON(current->blocked_on); trace_contention_end(lock, ret); raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); @@ -942,6 +956,14 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne next = waiter->task; debug_mutex_wake_waiter(lock, waiter); + /* + * Unlock wakeups can be happening in parallel + * (when optimistic spinners steal and release + * the lock), so blocked_on may already be + * cleared here. + */ + WARN_ON(next->blocked_on && next->blocked_on != lock); + next->blocked_on = NULL; wake_q_add(&wake_q, next); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 37f025a096c9..45fe05e51db1 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -283,7 +283,15 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) { #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); + /* + * When waking up the task to die, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. 
+ */ + WARN_ON(waiter->task->blocked_on && + waiter->task->blocked_on != lock); #endif + waiter->task->blocked_on = NULL; wake_q_add(wake_q, waiter->task); } @@ -331,9 +339,15 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * it's wounded in __ww_mutex_check_kill() or has a * wakeup pending to re-read the wounded state. */ - if (owner != current) + if (owner != current) { + /* + * When waking up the task to wound, be sure to clear the + * blocked_on pointer. Otherwise we can see circular + * blocked_on relationships that can't resolve. + */ + owner->blocked_on = NULL; wake_q_add(wake_q, owner); - + } return true; } From a4f0b6fef4b08e9928449206390133e48ac185a7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Sat, 12 Jul 2025 03:33:44 +0000 Subject: [PATCH 77/84] locking/mutex: Add p->blocked_on wrappers for correctness checks This lets us assert mutex::wait_lock is held whenever we access p->blocked_on, as well as warn us for unexpected state changes. [fix conflicts, call in more places] [jstultz: tweaked commit subject, reworked a good bit] Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-4-jstultz@google.com --- include/linux/sched.h | 64 ++++++++++++++++++++++++++++++++++-- kernel/locking/mutex-debug.c | 4 +-- kernel/locking/mutex.c | 32 ++++++++---------- kernel/locking/ww_mutex.h | 8 ++--- 4 files changed, 81 insertions(+), 27 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 33ad240ec900..5b4e1cd52e27 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -2129,6 +2130,67 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock); __cond_resched_rwlock_write(lock); \ }) +#ifndef CONFIG_PREEMPT_RT +static inline struct mutex *__get_task_blocked_on(struct task_struct *p) +{ + struct mutex *m = p->blocked_on; + + if (m) + lockdep_assert_held_once(&m->wait_lock); + return m; +} + +static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + WARN_ON_ONCE(!m); + /* The task should only be setting itself as blocked */ + WARN_ON_ONCE(p != current); + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * Check ensure we don't overwrite existing mutex value + * with a different mutex. Note, setting it to the same + * lock repeatedly is ok. + */ + WARN_ON_ONCE(p->blocked_on && p->blocked_on != m); + p->blocked_on = m; +} + +static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + guard(raw_spinlock_irqsave)(&m->wait_lock); + __set_task_blocked_on(p, m); +} + +static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + WARN_ON_ONCE(!m); + /* Currently we serialize blocked_on under the mutex::wait_lock */ + lockdep_assert_held_once(&m->wait_lock); + /* + * There may be cases where we re-clear already cleared + * blocked_on relationships, but make sure we are not + * clearing the relationship with a different lock. 
+ */ + WARN_ON_ONCE(m && p->blocked_on && p->blocked_on != m); + p->blocked_on = NULL; +} + +static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) +{ + guard(raw_spinlock_irqsave)(&m->wait_lock); + __clear_task_blocked_on(p, m); +} +#else +static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) +{ +} + +static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) +{ +} +#endif /* !CONFIG_PREEMPT_RT */ + static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); @@ -2168,8 +2230,6 @@ extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); -#include - /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 758b7a6792b0..949103fd8e9b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -54,13 +54,13 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, lockdep_assert_held(&lock->wait_lock); /* Current thread can't be already blocked (since it's executing!) */ - DEBUG_LOCKS_WARN_ON(task->blocked_on); + DEBUG_LOCKS_WARN_ON(__get_task_blocked_on(task)); } void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { - struct mutex *blocked_on = READ_ONCE(task->blocked_on); + struct mutex *blocked_on = __get_task_blocked_on(task); DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); DEBUG_LOCKS_WARN_ON(waiter->task != task); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e2f59863a866..80d778fedd60 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -644,8 +644,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas goto err_early_kill; } - WARN_ON(current->blocked_on); - current->blocked_on = lock; + __set_task_blocked_on(current, lock); set_current_state(state); trace_contention_begin(lock, LCB_F_MUTEX); for (;;) { @@ -685,9 +684,9 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas /* * As we likely have been woken up by task * that has cleared our blocked_on state, re-set - * it to the lock we are trying to aquire. + * it to the lock we are trying to acquire. */ - current->blocked_on = lock; + set_task_blocked_on(current, lock); set_current_state(state); /* * Here we order against unlock; we must either see it change @@ -699,11 +698,15 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas if (first) { trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN); - /* clear blocked_on as mutex_optimistic_spin may schedule() */ - current->blocked_on = NULL; + /* + * mutex_optimistic_spin() can call schedule(), so + * clear blocked on so we don't become unselectable + * to run. 
+ */ + clear_task_blocked_on(current, lock); if (mutex_optimistic_spin(lock, ww_ctx, &waiter)) break; - current->blocked_on = lock; + set_task_blocked_on(current, lock); trace_contention_begin(lock, LCB_F_MUTEX); } @@ -711,7 +714,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas } raw_spin_lock_irqsave(&lock->wait_lock, flags); acquired: - current->blocked_on = NULL; + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); if (ww_ctx) { @@ -741,11 +744,11 @@ skip_wait: return 0; err: - current->blocked_on = NULL; + __clear_task_blocked_on(current, lock); __set_current_state(TASK_RUNNING); __mutex_remove_waiter(lock, &waiter); err_early_kill: - WARN_ON(current->blocked_on); + WARN_ON(__get_task_blocked_on(current)); trace_contention_end(lock, ret); raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q); debug_mutex_free_waiter(&waiter); @@ -956,14 +959,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - /* - * Unlock wakeups can be happening in parallel - * (when optimistic spinners steal and release - * the lock), so blocked_on may already be - * cleared here. - */ - WARN_ON(next->blocked_on && next->blocked_on != lock); - next->blocked_on = NULL; + __clear_task_blocked_on(next, lock); wake_q_add(&wake_q, next); } diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h index 45fe05e51db1..086fd5487ca7 100644 --- a/kernel/locking/ww_mutex.h +++ b/kernel/locking/ww_mutex.h @@ -283,15 +283,13 @@ __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, if (waiter->ww_ctx->acquired > 0 && __ww_ctx_less(waiter->ww_ctx, ww_ctx)) { #ifndef WW_RT debug_mutex_wake_waiter(lock, waiter); +#endif /* * When waking up the task to die, be sure to clear the * blocked_on pointer. Otherwise we can see circular * blocked_on relationships that can't resolve. */ - WARN_ON(waiter->task->blocked_on && - waiter->task->blocked_on != lock); -#endif - waiter->task->blocked_on = NULL; + __clear_task_blocked_on(waiter->task, lock); wake_q_add(wake_q, waiter->task); } @@ -345,7 +343,7 @@ static bool __ww_mutex_wound(struct MUTEX *lock, * blocked_on pointer. Otherwise we can see circular * blocked_on relationships that can't resolve. */ - owner->blocked_on = NULL; + __clear_task_blocked_on(owner, lock); wake_q_add(wake_q, owner); } return true; From 865d8cfb1672089e4b628d6899ac5c6e49787150 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 12 Jul 2025 03:33:45 +0000 Subject: [PATCH 78/84] sched: Move update_curr_task logic into update_curr_se Absorb update_curr_task() into update_curr_se(), and in the process simplify update_curr_common(). This will make the next step a bit easier. 
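As a rough, stand-alone model of that consolidation (the names and fields below are simplified stand-ins, not the kernel's): the per-task hooks run inside the single update helper, so a caller modelling update_curr_common() can just return its result.

#include <stdio.h>

struct sched_entity_model {
	long long exec_start;
	long long sum_exec_runtime;
	int is_task;	/* group entities would have this clear */
};

/* One helper now advances the runtime sums and, for task entities,
 * performs the per-task accounting that used to live in a second helper. */
static long long update_se_model(struct sched_entity_model *se, long long now)
{
	long long delta = now - se->exec_start;

	if (delta <= 0)
		return delta;

	se->exec_start = now;
	se->sum_exec_runtime += delta;

	if (se->is_task)
		printf("per-task accounting hooks charged %lld ns\n", delta);

	return delta;
}

int main(void)
{
	struct sched_entity_model se = { .exec_start = 100, .is_task = 1 };

	/* A caller modelling update_curr_common() simply returns the delta. */
	printf("delta = %lld\n", update_se_model(&se, 350));
	return 0;
}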
Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-5-jstultz@google.com --- kernel/sched/fair.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b9b4bbbf0af6..8334580ed3a3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1164,6 +1164,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) curr->exec_start = now; curr->sum_exec_runtime += delta_exec; + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); + cgroup_account_cputime(p, delta_exec); + } + if (schedstat_enabled()) { struct sched_statistics *stats; @@ -1175,26 +1183,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) return delta_exec; } -static inline void update_curr_task(struct task_struct *p, s64 delta_exec) -{ - trace_sched_stat_runtime(p, delta_exec); - account_group_exec_runtime(p, delta_exec); - cgroup_account_cputime(p, delta_exec); -} - /* * Used by other classes to account runtime. */ s64 update_curr_common(struct rq *rq) { struct task_struct *donor = rq->donor; - s64 delta_exec; - delta_exec = update_curr_se(rq, &donor->se); - if (likely(delta_exec > 0)) - update_curr_task(donor, delta_exec); - - return delta_exec; + return update_curr_se(rq, &donor->se); } /* @@ -1219,10 +1215,6 @@ static void update_curr(struct cfs_rq *cfs_rq) update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { - struct task_struct *p = task_of(curr); - - update_curr_task(p, delta_exec); - /* * If the fair_server is active, we need to account for the * fair_server time whether or not the task is running on From aa4f74dfd42ba4399f785fb9c460a11bd1756f0a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 12 Jul 2025 03:33:46 +0000 Subject: [PATCH 79/84] sched: Fix runtime accounting w/ split exec & sched contexts Without proxy-exec, we normally charge the "current" task for both its vruntime as well as its sum_exec_runtime. With proxy, however, we have two "current" contexts: the scheduler context and the execution context. We want to charge the execution context rq->curr (ie: the proxy/lock holder) execution time to its sum_exec_runtime (so it's clear to userland the rq->curr task *is* running), as well as its thread group. However, the rest of the time accounting (such as vruntime and cgroup accounting), we charge against the scheduler context (rq->donor) task, because it is from that task that the time is being "donated". If the donor and curr tasks are the same, then it's the same as without proxy.
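To make the accounting rule concrete, a small stand-alone sketch follows (the struct and field names are simplified stand-ins, not the kernel code in the diff below): execution time goes to the task that is physically running, while the remaining accounting is charged to the donor; when they are the same task, both charges land on it.

#include <stdio.h>

struct task_model {
	const char *name;
	long long sum_exec_runtime;	/* what userland sees as "time run" */
	long long charged_time;		/* stand-in for vruntime/cgroup charges */
};

/* Charge one slice of 'delta' per the split-accounting rule: the
 * execution context (curr) owns sum_exec_runtime, the scheduling
 * context (donor) owns the rest. They may be the same task. */
static void charge(struct task_model *donor, struct task_model *curr, long long delta)
{
	curr->sum_exec_runtime += delta;	/* curr *is* running */
	donor->charged_time += delta;		/* time is "donated" by donor */
}

int main(void)
{
	struct task_model donor = { "donor" };
	struct task_model proxy = { "proxy" };

	charge(&donor, &proxy, 1000);
	printf("%s: sum_exec=%lld charged=%lld\n", proxy.name, proxy.sum_exec_runtime, proxy.charged_time);
	printf("%s: sum_exec=%lld charged=%lld\n", donor.name, donor.sum_exec_runtime, donor.charged_time);
	return 0;
}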
Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-6-jstultz@google.com --- kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8334580ed3a3..97176458f2be 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1152,30 +1152,40 @@ void post_init_entity_util_avg(struct task_struct *p) sa->runnable_avg = sa->util_avg; } -static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) +static s64 update_se(struct rq *rq, struct sched_entity *se) { u64 now = rq_clock_task(rq); s64 delta_exec; - delta_exec = now - curr->exec_start; + delta_exec = now - se->exec_start; if (unlikely(delta_exec <= 0)) return delta_exec; - curr->exec_start = now; - curr->sum_exec_runtime += delta_exec; + se->exec_start = now; + if (entity_is_task(se)) { + struct task_struct *donor = task_of(se); + struct task_struct *running = rq->curr; + /* + * If se is a task, we account the time against the running + * task, as w/ proxy-exec they may not be the same. + */ + running->se.exec_start = now; + running->se.sum_exec_runtime += delta_exec; - if (entity_is_task(curr)) { - struct task_struct *p = task_of(curr); + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); - trace_sched_stat_runtime(p, delta_exec); - account_group_exec_runtime(p, delta_exec); - cgroup_account_cputime(p, delta_exec); + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); + } else { + /* If not task, account the time against donor se */ + se->sum_exec_runtime += delta_exec; } if (schedstat_enabled()) { struct sched_statistics *stats; - stats = __schedstats_from_se(curr); + stats = __schedstats_from_se(se); __schedstat_set(stats->exec_max, max(delta_exec, stats->exec_max)); } @@ -1188,9 +1198,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) */ s64 update_curr_common(struct rq *rq) { - struct task_struct *donor = rq->donor; - - return update_curr_se(rq, &donor->se); + return update_se(rq, &rq->donor->se); } /* @@ -1198,6 +1206,12 @@ s64 update_curr_common(struct rq *rq) */ static void update_curr(struct cfs_rq *cfs_rq) { + /* + * Note: cfs_rq->curr corresponds to the task picked to + * run (ie: rq->donor.se) which due to proxy-exec may + * not necessarily be the actual task running + * (rq->curr.se). This is easy to confuse! + */ struct sched_entity *curr = cfs_rq->curr; struct rq *rq = rq_of(cfs_rq); s64 delta_exec; @@ -1206,7 +1220,7 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(!curr)) return; - delta_exec = update_curr_se(rq, curr); + delta_exec = update_se(rq, curr); if (unlikely(delta_exec <= 0)) return; From be41bde4c3a86de4be5cd3d1ca613e24664e68dc Mon Sep 17 00:00:00 2001 From: John Stultz Date: Sat, 12 Jul 2025 03:33:47 +0000 Subject: [PATCH 80/84] sched: Add an initial sketch of the find_proxy_task() function Add a find_proxy_task() function which doesn't do much. When we select a blocked task to run, we will just deactivate it and pick again. The exception being if it has become unblocked after find_proxy_task() was called. This allows us to validate keeping blocked tasks on the runqueue and later deactivating them is working ok, stressing the failure cases for when a proxy isn't found. 
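A hedged, stand-alone model of the control flow being introduced (not the scheduler code itself; the task fields and names below are invented for illustration): the pick loop simply deactivates a blocked pick and tries again until it finds something runnable.

#include <stdbool.h>
#include <stdio.h>

struct toy_task {
	const char *name;
	bool blocked;	/* stand-in for p->blocked_on being set */
	bool queued;
};

/* Pretend pick: return the first still-queued task, or NULL. */
static struct toy_task *pick_next(struct toy_task *tasks, int n)
{
	for (int i = 0; i < n; i++)
		if (tasks[i].queued)
			return &tasks[i];
	return NULL;
}

int main(void)
{
	struct toy_task tasks[] = {
		{ "blocked-donor", true,  true },
		{ "runnable",      false, true },
	};
	struct toy_task *next;

	/* Model of the pick_again loop: a blocked pick is deactivated
	 * (dequeued) and we simply pick again. */
	for (;;) {
		next = pick_next(tasks, 2);
		if (next && next->blocked) {
			printf("deactivating %s, picking again\n", next->name);
			next->queued = false;
			continue;
		}
		break;
	}

	printf("running %s\n", next ? next->name : "idle");
	return 0;
}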
Greatly simplified from patch by: Peter Zijlstra (Intel) Juri Lelli Valentin Schneider Connor O'Brien [jstultz: Split out from larger proxy patch and simplified for review and testing.] Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-7-jstultz@google.com --- kernel/sched/core.c | 117 +++++++++++++++++++++++++++++++++++++++++-- kernel/sched/sched.h | 10 +++- 2 files changed, 121 insertions(+), 6 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dd9f5c08b563..cb55d4247e65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6522,11 +6522,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Helper function for __schedule() * - * If a task does not have signals pending, deactivate it - * Otherwise marks the task's __state as RUNNING + * Tries to deactivate the task, unless the should_block arg + * is false or if a signal is pending. In the case a signal + * is pending, marks the task's __state as RUNNING (and clear + * blocked_on). */ static bool try_to_block_task(struct rq *rq, struct task_struct *p, - unsigned long *task_state_p) + unsigned long *task_state_p, bool should_block) { unsigned long task_state = *task_state_p; int flags = DEQUEUE_NOCLOCK; @@ -6537,6 +6539,16 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return false; } + /* + * We check should_block after signal_pending because we + * will want to wake the task in that case. But if + * should_block is false, its likely due to the task being + * blocked on a mutex, and we want to keep it on the runqueue + * to be selectable for proxy-execution. + */ + if (!should_block) + return false; + p->sched_contributes_to_load = (task_state & TASK_UNINTERRUPTIBLE) && !(task_state & TASK_NOLOAD) && @@ -6560,6 +6572,88 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, return true; } +#ifdef CONFIG_SCHED_PROXY_EXEC +static inline void proxy_resched_idle(struct rq *rq) +{ + put_prev_set_next_task(rq, rq->donor, rq->idle); + rq_set_donor(rq, rq->idle); + set_tsk_need_resched(rq->idle); +} + +static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + unsigned long state = READ_ONCE(donor->__state); + + /* Don't deactivate if the state has been changed to TASK_RUNNING */ + if (state == TASK_RUNNING) + return false; + /* + * Because we got donor from pick_next_task(), it is *crucial* + * that we call proxy_resched_idle() before we deactivate it. + * As once we deactivate donor, donor->on_rq is set to zero, + * which allows ttwu() to immediately try to wake the task on + * another rq. So we cannot use *any* references to donor + * after that point. So things like cfs_rq->curr or rq->donor + * need to be changed from next *before* we deactivate. + */ + proxy_resched_idle(rq); + return try_to_block_task(rq, donor, &state, true); +} + +static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *donor) +{ + if (!__proxy_deactivate(rq, donor)) { + /* + * XXX: For now, if deactivation failed, set donor + * as unblocked, as we aren't doing proxy-migrations + * yet (more logic will be needed then). + */ + donor->blocked_on = NULL; + } + return NULL; +} + +/* + * Initial simple sketch that just deactivates the blocked task + * chosen by pick_next_task() so we can then pick something that + * isn't blocked. 
+ */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + struct mutex *mutex; + + mutex = donor->blocked_on; + /* Something changed in the chain, so pick again */ + if (!mutex) + return NULL; + /* + * By taking mutex->wait_lock we hold off concurrent mutex_unlock() + * and ensure @owner sticks around. + */ + guard(raw_spinlock)(&mutex->wait_lock); + + /* Check again that donor is blocked with blocked_lock held */ + if (!task_is_blocked(donor) || mutex != __get_task_blocked_on(donor)) { + /* + * Something changed in the blocked_on chain and + * we don't know if only at this level. So, let's + * just bail out completely and let __schedule() + * figure things out (pick_again loop). + */ + return NULL; /* do pick_next_task() again */ + } + return proxy_deactivate(rq, donor); +} +#else /* SCHED_PROXY_EXEC */ +static struct task_struct * +find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) +{ + WARN_ONCE(1, "This should never be called in the !SCHED_PROXY_EXEC case\n"); + return donor; +} +#endif /* SCHED_PROXY_EXEC */ + /* * __schedule() is the main scheduler function. * @@ -6672,12 +6766,25 @@ static void __sched notrace __schedule(int sched_mode) goto picked; } } else if (!preempt && prev_state) { - try_to_block_task(rq, prev, &prev_state); + /* + * We pass task_is_blocked() as the should_block arg + * in order to keep mutex-blocked tasks on the runqueue + * for slection with proxy-exec (without proxy-exec + * task_is_blocked() will always be false). + */ + try_to_block_task(rq, prev, &prev_state, + !task_is_blocked(prev)); switch_count = &prev->nvcsw; } - next = pick_next_task(rq, prev, &rf); +pick_again: + next = pick_next_task(rq, rq->donor, &rf); rq_set_donor(rq, next); + if (unlikely(task_is_blocked(next))) { + next = find_proxy_task(rq, next, &rf); + if (!next) + goto pick_again; + } picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e53d0b87f780..d3f33d10c58c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2262,6 +2262,14 @@ static inline int task_current_donor(struct rq *rq, struct task_struct *p) return rq->donor == p; } +static inline bool task_is_blocked(struct task_struct *p) +{ + if (!sched_proxy_exec()) + return false; + + return !!p->blocked_on; +} + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { return p->on_cpu; @@ -2459,7 +2467,7 @@ static inline void put_prev_set_next_task(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - WARN_ON_ONCE(rq->curr != prev); + WARN_ON_ONCE(rq->donor != prev); __put_prev_set_next_dl_server(rq, prev, next); From be39617e38e0b1939a6014d77ee6f14707d59b1b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Sat, 12 Jul 2025 03:33:48 +0000 Subject: [PATCH 81/84] sched: Fix proxy/current (push,pull)ability Proxy execution forms atomic pairs of tasks: The waiting donor task (scheduling context) and a proxy (execution context). The donor task, along with the rest of the blocked chain, follows the proxy wrt CPU placement. They can be the same task, in which case push/pull doesn't need any modification. When they are different, however, FIFO1 & FIFO42: ,-> RT42 | | blocked-on | v blocked_donor | mutex | | owner | v `-- RT1 RT1 RT42 CPU0 CPU1 ^ ^ | | overloaded !overloaded rq prio = 42 rq prio = 0 RT1 is eligible to be pushed to CPU1, but should that happen it will "carry" RT42 along. 
Clearly here neither RT1 nor RT42 should be seen as push/pullable. Unfortunately, only the donor task is usually dequeued from the rq, and the proxy'ed execution context (rq->curr) remains on the rq. This can cause RT1 to be selected for migration from logic like the rt pushable_list. Thus, add a dequeue/enqueue cycle on the proxy task before __schedule() returns, which allows the sched class logic to avoid adding the now current task to the pushable_list. Furthermore, tasks becoming blocked on a mutex don't need an explicit dequeue/enqueue cycle to be made (push/pull)able: they have to be running to block on a mutex, thus they will eventually hit put_prev_task(). Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-8-jstultz@google.com --- kernel/sched/core.c | 25 +++++++++++++++++++++++++ kernel/sched/deadline.c | 7 +++++++ kernel/sched/rt.c | 5 +++++ 3 files changed, 37 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cb55d4247e65..a0b11201a7b4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6654,6 +6654,23 @@ find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) } #endif /* SCHED_PROXY_EXEC */ +static inline void proxy_tag_curr(struct rq *rq, struct task_struct *owner) +{ + if (!sched_proxy_exec()) + return; + /* + * pick_next_task() calls set_next_task() on the chosen task + * at some point, which ensures it is not push/pullable. + * However, the chosen/donor task *and* the mutex owner form an + * atomic pair wrt push/pull. + * + * Make sure owner we run is not pushable. Unfortunately we can + * only deal with that by means of a dequeue/enqueue cycle. :-/ + */ + dequeue_task(rq, owner, DEQUEUE_NOCLOCK | DEQUEUE_SAVE); + enqueue_task(rq, owner, ENQUEUE_NOCLOCK | ENQUEUE_RESTORE); +} + /* * __schedule() is the main scheduler function. * @@ -6798,6 +6815,10 @@ picked: * changes to task_struct made by pick_next_task().
*/ RCU_INIT_POINTER(rq->curr, next); + + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + /* * The membarrier system call requires each architecture * to have a full memory barrier after updating @@ -6832,6 +6853,10 @@ picked: /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + /* In case next was already curr but just got blocked_donor */ + if (!task_current_donor(rq, next)) + proxy_tag_curr(rq, next); + rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1af06e48227d..e2d51f4306b3 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2121,6 +2121,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) if (dl_server(&p->dl)) return; + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && !p->dl.dl_throttled && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } @@ -2415,6 +2418,10 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_s update_curr_dl(rq); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); + + if (task_is_blocked(p)) + return; + if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index be6e9bcbe82b..7936d4333731 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1440,6 +1440,9 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags); + if (task_is_blocked(p)) + return; + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1716,6 +1719,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1); + if (task_is_blocked(p)) + return; /* * The previous task needs to be made eligible for pushing * if it is still active From 7de9d4f946383f48ec393b6e9ad0c20e49e174e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 12 Jul 2025 03:33:49 +0000 Subject: [PATCH 82/84] sched: Start blocked_on chain processing in find_proxy_task() Start to flesh out the real find_proxy_task() implementation, but avoid the migration cases for now, in those cases just deactivate the donor task and pick again. To ensure the donor task or other blocked tasks in the chain aren't migrated away while we're running the proxy, also tweak the fair class logic to avoid migrating donor or mutex blocked tasks. 
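For orientation, a stand-alone toy model of the chain walk described above (plain C, no locking or cross-CPU handling; all names below are stand-ins for the real structures): follow task->blocked_on to the mutex owner, repeating until a task that is not blocked is found, and run that task.

#include <stdio.h>

struct toy_mutex;

struct toy_task {
	const char *name;
	struct toy_mutex *blocked_on;	/* NULL when runnable */
};

struct toy_mutex {
	struct toy_task *owner;
};

/* Follow task->blocked_on -> mutex->owner -> task ... until we reach a
 * task that is not blocked; that task is the execution context to run.
 * The real code does this under mutex::wait_lock and bails out or
 * deactivates the donor for the cases not modelled here. */
static struct toy_task *find_proxy(struct toy_task *donor)
{
	struct toy_task *p = donor;

	while (p->blocked_on && p->blocked_on->owner)
		p = p->blocked_on->owner;

	return p;
}

int main(void)
{
	struct toy_task owner = { "lock-owner", NULL };
	struct toy_mutex m    = { &owner };
	struct toy_task donor = { "donor", &m };

	printf("proxy for %s is %s\n", donor.name, find_proxy(&donor)->name);
	return 0;
}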
[jstultz: This change was split out from the larger proxy patch] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Juri Lelli Signed-off-by: Valentin Schneider Signed-off-by: Connor O'Brien Signed-off-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20250712033407.2383110-9-jstultz@google.com --- kernel/locking/mutex.h | 3 +- kernel/sched/core.c | 148 ++++++++++++++++++++++++++++++++++------- kernel/sched/fair.c | 12 +++- 3 files changed, 136 insertions(+), 27 deletions(-) diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index cbff35b9b7ae..2e8080a9bee3 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -6,7 +6,7 @@ * * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar */ - +#ifndef CONFIG_PREEMPT_RT /* * This is the control structure for tasks blocked on mutex, which resides * on the blocked task's kernel stack: @@ -70,3 +70,4 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, # define debug_mutex_unlock(lock) do { } while (0) # define debug_mutex_init(lock, name, key) do { } while (0) #endif /* !CONFIG_DEBUG_MUTEXES */ +#endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a0b11201a7b4..f7f576ad9b22 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -96,6 +96,7 @@ #include "../workqueue_internal.h" #include "../../io_uring/io-wq.h" #include "../smpboot.h" +#include "../locking/mutex.h" EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); @@ -2933,8 +2934,15 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag struct set_affinity_pending my_pending = { }, *pending = NULL; bool stop_pending, complete = false; - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + /* + * Can the task run on the task's current CPU? If so, we're done + * + * We are also done if the task is the current donor, boosting a lock- + * holding proxy, (and potentially has been migrated outside its + * current or previous affinity mask) + */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask) || + (task_current_donor(rq, p) && !task_current(rq, p))) { struct task_struct *push_task = NULL; if ((flags & SCA_MIGRATE_ENABLE) && @@ -6573,11 +6581,12 @@ static bool try_to_block_task(struct rq *rq, struct task_struct *p, } #ifdef CONFIG_SCHED_PROXY_EXEC -static inline void proxy_resched_idle(struct rq *rq) +static inline struct task_struct *proxy_resched_idle(struct rq *rq) { put_prev_set_next_task(rq, rq->donor, rq->idle); rq_set_donor(rq, rq->idle); set_tsk_need_resched(rq->idle); + return rq->idle; } static bool __proxy_deactivate(struct rq *rq, struct task_struct *donor) @@ -6614,36 +6623,124 @@ static struct task_struct *proxy_deactivate(struct rq *rq, struct task_struct *d } /* - * Initial simple sketch that just deactivates the blocked task - * chosen by pick_next_task() so we can then pick something that - * isn't blocked. + * Find runnable lock owner to proxy for mutex blocked donor + * + * Follow the blocked-on relation: + * task->blocked_on -> mutex->owner -> task... + * + * Lock order: + * + * p->pi_lock + * rq->lock + * mutex->wait_lock + * + * Returns the task that is going to be used as execution context (the one + * that is actually going to be run on cpu_of(rq)). 
*/ static struct task_struct * find_proxy_task(struct rq *rq, struct task_struct *donor, struct rq_flags *rf) { + struct task_struct *owner = NULL; + int this_cpu = cpu_of(rq); + struct task_struct *p; struct mutex *mutex; - mutex = donor->blocked_on; - /* Something changed in the chain, so pick again */ - if (!mutex) - return NULL; - /* - * By taking mutex->wait_lock we hold off concurrent mutex_unlock() - * and ensure @owner sticks around. - */ - guard(raw_spinlock)(&mutex->wait_lock); - - /* Check again that donor is blocked with blocked_lock held */ - if (!task_is_blocked(donor) || mutex != __get_task_blocked_on(donor)) { + /* Follow blocked_on chain. */ + for (p = donor; task_is_blocked(p); p = owner) { + mutex = p->blocked_on; + /* Something changed in the chain, so pick again */ + if (!mutex) + return NULL; /* - * Something changed in the blocked_on chain and - * we don't know if only at this level. So, let's - * just bail out completely and let __schedule() - * figure things out (pick_again loop). + * By taking mutex->wait_lock we hold off concurrent mutex_unlock() + * and ensure @owner sticks around. + */ + guard(raw_spinlock)(&mutex->wait_lock); + + /* Check again that p is blocked with wait_lock held */ + if (mutex != __get_task_blocked_on(p)) { + /* + * Something changed in the blocked_on chain and + * we don't know if only at this level. So, let's + * just bail out completely and let __schedule() + * figure things out (pick_again loop). + */ + return NULL; + } + + owner = __mutex_owner(mutex); + if (!owner) { + __clear_task_blocked_on(p, mutex); + return p; + } + + if (!READ_ONCE(owner->on_rq) || owner->se.sched_delayed) { + /* XXX Don't handle blocked owners/delayed dequeue yet */ + return proxy_deactivate(rq, donor); + } + + if (task_cpu(owner) != this_cpu) { + /* XXX Don't handle migrations yet */ + return proxy_deactivate(rq, donor); + } + + if (task_on_rq_migrating(owner)) { + /* + * One of the chain of mutex owners is currently migrating to this + * CPU, but has not yet been enqueued because we are holding the + * rq lock. As a simple solution, just schedule rq->idle to give + * the migration a chance to complete. Much like the migrate_task + * case we should end up back in find_proxy_task(), this time + * hopefully with all relevant tasks already enqueued. + */ + return proxy_resched_idle(rq); + } + + /* + * Its possible to race where after we check owner->on_rq + * but before we check (owner_cpu != this_cpu) that the + * task on another cpu was migrated back to this cpu. In + * that case it could slip by our checks. So double check + * we are still on this cpu and not migrating. If we get + * inconsistent results, try again. + */ + if (!task_on_rq_queued(owner) || task_cpu(owner) != this_cpu) + return NULL; + + if (owner == p) { + /* + * It's possible we interleave with mutex_unlock like: + * + * lock(&rq->lock); + * find_proxy_task() + * mutex_unlock() + * lock(&wait_lock); + * donor(owner) = current->blocked_donor; + * unlock(&wait_lock); + * + * wake_up_q(); + * ... + * ttwu_runnable() + * __task_rq_lock() + * lock(&wait_lock); + * owner == p + * + * Which leaves us to finish the ttwu_runnable() and make it go. + * + * So schedule rq->idle so that ttwu_runnable() can get the rq + * lock and mark owner as running. + */ + return proxy_resched_idle(rq); + } + /* + * OK, now we're absolutely sure @owner is on this + * rq, therefore holding @rq->lock is sufficient to + * guarantee its existence, as per ttwu_remote(). 
*/ - return NULL; /* do pick_next_task() again */ } - return proxy_deactivate(rq, donor); + + WARN_ON_ONCE(owner && !owner->on_rq); + return owner; } #else /* SCHED_PROXY_EXEC */ static struct task_struct * @@ -6801,10 +6898,13 @@ pick_again: next = find_proxy_task(rq, next, &rf); if (!next) goto pick_again; + if (next == rq->idle) + goto keep_resched; } picked: clear_tsk_need_resched(prev); clear_preempt_need_resched(); +keep_resched: rq->last_seen_need_resched_ns = 0; is_switch = prev != next; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 97176458f2be..b173a059315c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9291,7 +9291,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) throttled_lb_pair, or * 3) cannot be migrated to this CPU due to cpus_ptr, or * 4) running (obviously), or - * 5) are cache-hot on their current CPU. + * 5) are cache-hot on their current CPU, or + * 6) are blocked on mutexes (if SCHED_PROXY_EXEC is enabled) */ if ((p->se.sched_delayed) && (env->migration_type != migrate_load)) return 0; @@ -9313,6 +9314,9 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (kthread_is_per_cpu(p)) return 0; + if (task_is_blocked(p)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { int cpu; @@ -9348,7 +9352,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_on_cpu(env->src_rq, p)) { + if (task_on_cpu(env->src_rq, p) || + task_current_donor(env->src_rq, p)) { schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -9392,6 +9397,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env) schedstat_inc(p->stats.nr_forced_migrations); } + WARN_ON(task_current(env->src_rq, p)); + WARN_ON(task_current_donor(env->src_rq, p)); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } From 8671bad873ebeb082afcf7b4501395c374da6023 Mon Sep 17 00:00:00 2001 From: "Luis Claudio R. Goncalves" Date: Mon, 7 Jul 2025 11:03:59 -0300 Subject: [PATCH 83/84] sched: Do not call __put_task_struct() on rt if pi_blocked_on is set With PREEMPT_RT enabled, some of the calls to put_task_struct() coming from rt_mutex_adjust_prio_chain() could happen in preemptible context and with a mutex enqueued. That could lead to this sequence: rt_mutex_adjust_prio_chain() put_task_struct() __put_task_struct() sched_ext_free() spin_lock_irqsave() rtlock_lock() ---> TRIGGERS lockdep_assert(!current->pi_blocked_on); This is not a SCHED_EXT bug. The first cleanup function called by __put_task_struct() is sched_ext_free() and it happens to take a (RT) spin_lock, which in the scenario described above, would trigger the lockdep assertion of "!current->pi_blocked_on". Crystal Wood was able to identify the problem as __put_task_struct() being called during rt_mutex_adjust_prio_chain(), in the context of a process with a mutex enqueued. Instead of adding more complex conditions to decide when to directly call __put_task_struct() and when to defer the call, unconditionally resort to the deferred call on PREEMPT_RT to simplify the code. Fixes: 893cdaaa3977 ("sched: avoid false lockdep splat in put_task_struct()") Suggested-by: Crystal Wood Signed-off-by: Luis Claudio R. 
Goncalves Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Wander Lairson Costa Reviewed-by: Valentin Schneider Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/aGvTz5VaPFyj0pBV@uudg.org --- include/linux/sched/task.h | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index c517dbc242f7..ea41795a352b 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -131,24 +131,17 @@ static inline void put_task_struct(struct task_struct *t) return; /* - * In !RT, it is always safe to call __put_task_struct(). - * Under RT, we can only call it in preemptible context. - */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { - static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); - - lock_map_acquire_try(&put_task_map); - __put_task_struct(t); - lock_map_release(&put_task_map); - return; - } - - /* - * under PREEMPT_RT, we can't call put_task_struct + * Under PREEMPT_RT, we can't call __put_task_struct * in atomic context because it will indirectly - * acquire sleeping locks. + * acquire sleeping locks. The same is true if the + * current process has a mutex enqueued (blocked on + * a PI chain). * - * call_rcu() will schedule delayed_put_task_struct_rcu() + * In !RT, it is always safe to call __put_task_struct(). + * Though, in order to simplify the code, resort to the + * deferred call too. + * + * call_rcu() will schedule __put_task_struct_rcu_cb() * to be called in process context. * * __put_task_struct() is called when @@ -161,7 +154,7 @@ static inline void put_task_struct(struct task_struct *t) * * delayed_free_task() also uses ->rcu, but it is only called * when it fails to fork a process. Therefore, there is no - * way it can conflict with put_task_struct(). + * way it can conflict with __put_task_struct(). */ call_rcu(&t->rcu, __put_task_struct_rcu_cb); } From 1b5f1454091e9e9fb5c944b3161acf4ec0894d0d Mon Sep 17 00:00:00 2001 From: Feng Lee <379943137@qq.com> Date: Mon, 21 Jul 2025 16:04:35 +0800 Subject: [PATCH 84/84] sched/idle: Remove play_idle() play_idle() is no longer in use, so delete it. Signed-off-by: Feng Lee <379943137@qq.com> Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/tencent_C3E0BD9B812C27A30FC49F1EA6A4B1352707@qq.com --- include/linux/cpu.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 6378370a952f..8b1abbf5b6d2 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -187,11 +187,6 @@ static inline void arch_cpu_finalize_init(void) { } void play_idle_precise(u64 duration_ns, u64 latency_ns); -static inline void play_idle(unsigned long duration_us) -{ - play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX); -} - #ifdef CONFIG_HOTPLUG_CPU void cpuhp_report_idle_dead(void); #else