From d5cf4d34a3331aa1dba954949ea71b5eafcbf0a8 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Sun, 19 Oct 2025 22:32:06 -0400
Subject: [PATCH 01/21] cgroup/cpuset: Don't track # of local child partitions

The cpuset structure has a nr_subparts field which tracks the number of
child local partitions underneath a particular cpuset. Right now,
nr_subparts is only used in partition_is_populated() to skip iterating
over the child cpusets in some cases.

By always performing the child iteration, we can avoid tracking the
number of child partitions and simplify the code a bit.

Signed-off-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset-internal.h |  3 ---
 kernel/cgroup/cpuset.c          | 41 +++++++++++----------------------
 2 files changed, 13 insertions(+), 31 deletions(-)

diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 337608f408ce..5cac42c5fd97 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -155,9 +155,6 @@ struct cpuset {
 	/* for custom sched domain */
 	int relax_domain_level;

-	/* number of valid local child partitions */
-	int nr_subparts;
-
 	/* partition root state */
 	int partition_root_state;

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 52468d2c178a..7aef59ea9627 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -358,8 +358,13 @@ static inline bool is_in_v2_mode(void)
  * @excluded_child: a child cpuset to be excluded in task checking
  * Return: true if there are tasks, false otherwise
  *
- * It is assumed that @cs is a valid partition root. @excluded_child should
- * be non-NULL when this cpuset is going to become a partition itself.
+ * @cs should be a valid partition root or going to become a partition root.
+ * @excluded_child should be non-NULL when this cpuset is going to become a
+ * partition itself.
+ *
+ * Note that a remote partition is not allowed underneath a valid local
+ * or remote partition. So if a non-partition root child is populated,
+ * the whole partition is considered populated.
  */
 static inline bool partition_is_populated(struct cpuset *cs,
 					  struct cpuset *excluded_child)
@@ -369,8 +374,6 @@ static inline bool partition_is_populated(struct cpuset *cs,

 	if (cs->css.cgroup->nr_populated_csets)
 		return true;
-	if (!excluded_child && !cs->nr_subparts)
-		return cgroup_is_populated(cs->css.cgroup);

 	rcu_read_lock();
 	cpuset_for_each_child(child, css, cs) {
@@ -1302,7 +1305,6 @@ static void reset_partition_data(struct cpuset *cs)

 	lockdep_assert_held(&callback_lock);

-	cs->nr_subparts = 0;
 	if (cpumask_empty(cs->exclusive_cpus)) {
 		cpumask_clear(cs->effective_xcpus);
 		if (is_cpu_exclusive(cs))
@@ -1746,7 +1748,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 	int deleting;	/* Deleting cpus from parent's effective_cpus */
 	int old_prs, new_prs;
 	int part_error = PERR_NONE;	/* Partition error?
*/ - int subparts_delta = 0; int isolcpus_updated = 0; struct cpumask *xcpus = user_xcpus(cs); bool nocpu; @@ -1771,10 +1772,9 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(parent)) adding = cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus); - if (old_prs > 0) { + if (old_prs > 0) new_prs = -old_prs; - subparts_delta--; - } + goto write_error; } @@ -1829,7 +1829,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus)); deleting = true; - subparts_delta++; } else if (cmd == partcmd_disable) { /* * May need to add cpus back to parent's effective_cpus @@ -1840,7 +1839,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, if (is_partition_valid(cs)) { cpumask_copy(tmp->addmask, cs->effective_xcpus); adding = true; - subparts_delta--; } new_prs = PRS_MEMBER; } else if (newmask) { @@ -1963,17 +1961,13 @@ write_error: switch (cs->partition_root_state) { case PRS_ROOT: case PRS_ISOLATED: - if (part_error) { + if (part_error) new_prs = -old_prs; - subparts_delta--; - } break; case PRS_INVALID_ROOT: case PRS_INVALID_ISOLATED: - if (!part_error) { + if (!part_error) new_prs = -old_prs; - subparts_delta++; - } break; } } @@ -2002,11 +1996,9 @@ write_error: * newly deleted ones will be added back to effective_cpus. */ spin_lock_irq(&callback_lock); - if (old_prs != new_prs) { + if (old_prs != new_prs) cs->partition_root_state = new_prs; - if (new_prs <= 0) - cs->nr_subparts = 0; - } + /* * Adding to parent's effective_cpus means deletion CPUs from cs * and vice versa. @@ -2018,10 +2010,6 @@ write_error: isolcpus_updated += partition_xcpus_add(new_prs, parent, tmp->delmask); - if (is_partition_valid(parent)) { - parent->nr_subparts += subparts_delta; - WARN_ON_ONCE(parent->nr_subparts < 0); - } spin_unlock_irq(&callback_lock); update_unbound_workqueue_cpumask(isolcpus_updated); @@ -2105,8 +2093,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs, */ spin_lock_irq(&callback_lock); make_partition_invalid(child); - cs->nr_subparts--; - child->nr_subparts = 0; spin_unlock_irq(&callback_lock); notify_partition_change(child, old_prs); continue; @@ -4021,7 +4007,6 @@ static void cpuset_handle_hotplug(void) */ if (!cpumask_empty(subpartitions_cpus)) { if (cpumask_subset(&new_cpus, subpartitions_cpus)) { - top_cpuset.nr_subparts = 0; cpumask_clear(subpartitions_cpus); } else { cpumask_andnot(&new_cpus, &new_cpus, From 16dad7801aad73138a2dff5ea950130646914d1f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Oct 2025 20:19:15 -1000 Subject: [PATCH 02/21] cgroup: Rename cgroup lifecycle hooks to cgroup_task_*() The current names cgroup_exit(), cgroup_release(), and cgroup_free() are confusing because they look like they're operating on cgroups themselves when they're actually task lifecycle hooks. For example, cgroup_init() initializes the cgroup subsystem while cgroup_exit() is a task exit notification to cgroup. Rename them to cgroup_task_exit(), cgroup_task_release(), and cgroup_task_free() to make it clear that these operate on tasks. 
Cc: Dan Schatzberg Cc: Peter Zijlstra Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 12 ++++++------ kernel/cgroup/cgroup.c | 11 ++++++----- kernel/exit.c | 4 ++-- kernel/fork.c | 2 +- kernel/sched/autogroup.c | 4 ++-- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6ed477338b16..4068035176c4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -137,9 +137,9 @@ extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); -void cgroup_exit(struct task_struct *p); -void cgroup_release(struct task_struct *p); -void cgroup_free(struct task_struct *p); +void cgroup_task_exit(struct task_struct *p); +void cgroup_task_release(struct task_struct *p); +void cgroup_task_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); @@ -680,9 +680,9 @@ static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} -static inline void cgroup_exit(struct task_struct *p) {} -static inline void cgroup_release(struct task_struct *p) {} -static inline void cgroup_free(struct task_struct *p) {} +static inline void cgroup_task_exit(struct task_struct *p) {} +static inline void cgroup_task_release(struct task_struct *p) {} +static inline void cgroup_task_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6ae5f48cf64e..826b7fd2f85d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -944,7 +944,8 @@ static void css_set_move_task(struct task_struct *task, /* * We are synchronized through cgroup_threadgroup_rwsem * against PF_EXITING setting such that we can't race - * against cgroup_exit()/cgroup_free() dropping the css_set. + * against cgroup_task_exit()/cgroup_task_free() dropping + * the css_set. */ WARN_ON_ONCE(task->flags & PF_EXITING); @@ -6972,13 +6973,13 @@ void cgroup_post_fork(struct task_struct *child, } /** - * cgroup_exit - detach cgroup from exiting task + * cgroup_task_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process * * Description: Detach cgroup from @tsk. * */ -void cgroup_exit(struct task_struct *tsk) +void cgroup_task_exit(struct task_struct *tsk) { struct cgroup_subsys *ss; struct css_set *cset; @@ -7010,7 +7011,7 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_release(struct task_struct *task) +void cgroup_task_release(struct task_struct *task) { struct cgroup_subsys *ss; int ssid; @@ -7027,7 +7028,7 @@ void cgroup_release(struct task_struct *task) } } -void cgroup_free(struct task_struct *task) +void cgroup_task_free(struct task_struct *task) { struct css_set *cset = task_css_set(task); put_css_set(cset); diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..46173461e8de 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -257,7 +257,7 @@ repeat: rcu_read_unlock(); pidfs_exit(p); - cgroup_release(p); + cgroup_task_release(p); /* Retrieve @thread_pid before __unhash_process() may set it to NULL. 
*/ thread_pid = task_pid(p); @@ -967,7 +967,7 @@ void __noreturn do_exit(long code) exit_thread(tsk); sched_autogroup_exit_task(tsk); - cgroup_exit(tsk); + cgroup_task_exit(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..960c39c9c264 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -738,7 +738,7 @@ void __put_task_struct(struct task_struct *tsk) unwind_task_free(tsk); sched_ext_free(tsk); io_uring_free(tsk); - cgroup_free(tsk); + cgroup_task_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index cdea931aae30..954137775f38 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -178,8 +178,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) * this process can already run with task_group() == prev->tg or we can * race with cgroup code which can read autogroup = prev under rq->lock. * In the latter case for_each_thread() can not miss a migrating thread, - * cpu_cgroup_attach() must not be possible after cgroup_exit() and it - * can't be removed from thread list, we hold ->siglock. + * cpu_cgroup_attach() must not be possible after cgroup_task_exit() + * and it can't be removed from thread list, we hold ->siglock. * * If an exiting thread was already removed from thread list we rely on * sched_autogroup_exit_task(). From 260fbcb92bbeacfcd050410fdc2d24ab15044400 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Oct 2025 20:19:16 -1000 Subject: [PATCH 03/21] cgroup: Move dying_tasks cleanup from cgroup_task_release() to cgroup_task_free() Currently, cgroup_task_exit() adds thread group leaders with live member threads to their css_set's dying_tasks list (so cgroup.procs iteration can still see the leader), and cgroup_task_release() later removes them with list_del_init(&task->cg_list). An upcoming patch will defer the dying_tasks list addition, moving it from cgroup_task_exit() (called from do_exit()) to a new function called from finish_task_switch(). However, release_task() (which calls cgroup_task_release()) can run either before or after finish_task_switch(), creating a race where cgroup_task_release() might try to remove the task from dying_tasks before or while it's being added. Move the list_del_init() from cgroup_task_release() to cgroup_task_free() to fix this race. cgroup_task_free() runs from __put_task_struct(), which is always after both paths, making the cleanup safe. 
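The moved cleanup keeps relying on the list_empty()/list_del_init()
idiom, which makes a repeated unlink attempt harmless. Below is a
minimal userspace model of that idiom; the helpers are standalone
re-implementations that mirror include/linux/list.h, not the kernel
code itself:

  #include <stdbool.h>
  #include <stdio.h>

  struct list_head { struct list_head *next, *prev; };

  static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
  static bool list_empty(const struct list_head *h) { return h->next == h; }

  static void list_add(struct list_head *n, struct list_head *h)
  {
  	n->next = h->next;
  	n->prev = h;
  	h->next->prev = n;
  	h->next = n;
  }

  /* unlink and re-point the entry at itself: a second unlink is a no-op */
  static void list_del_init(struct list_head *e)
  {
  	e->prev->next = e->next;
  	e->next->prev = e->prev;
  	INIT_LIST_HEAD(e);
  }

  int main(void)
  {
  	struct list_head dying_tasks, cg_list;

  	INIT_LIST_HEAD(&dying_tasks);
  	INIT_LIST_HEAD(&cg_list);
  	list_add(&cg_list, &dying_tasks);	/* the dying_tasks addition */
  	if (!list_empty(&cg_list))		/* the cgroup_task_free() side */
  		list_del_init(&cg_list);
  	if (!list_empty(&cg_list))		/* a repeated pass is harmless */
  		list_del_init(&cg_list);
  	printf("dying_tasks empty: %d\n", list_empty(&dying_tasks));
  	return 0;
  }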
Cc: Dan Schatzberg Cc: Peter Zijlstra Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 826b7fd2f85d..b3c27900c5d2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -7019,6 +7019,11 @@ void cgroup_task_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); +} + +void cgroup_task_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); if (!list_empty(&task->cg_list)) { spin_lock_irq(&css_set_lock); @@ -7026,11 +7031,7 @@ void cgroup_task_release(struct task_struct *task) list_del_init(&task->cg_list); spin_unlock_irq(&css_set_lock); } -} -void cgroup_task_free(struct task_struct *task) -{ - struct css_set *cset = task_css_set(task); put_css_set(cset); } From d245698d727ab8f5420b3e28d1243f96a5234851 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Oct 2025 20:19:17 -1000 Subject: [PATCH 04/21] cgroup: Defer task cgroup unlink until after the task is done switching out When a task exits, css_set_move_task(tsk, cset, NULL, false) unlinks the task from its cgroup. From the cgroup's perspective, the task is now gone. If this makes the cgroup empty, it can be removed, triggering ->css_offline() callbacks that notify controllers the cgroup is going offline resource-wise. However, the exiting task can still run, perform memory operations, and schedule until the final context switch in finish_task_switch(). This creates a confusing situation where controllers are told a cgroup is offline while resource activities are still happening in it. While this hasn't broken existing controllers, it has caused direct confusion for sched_ext schedulers. Split cgroup_task_exit() into two functions. cgroup_task_exit() now only calls the subsystem exit callbacks and continues to be called from do_exit(). The css_set cleanup is moved to the new cgroup_task_dead() which is called from finish_task_switch() after the final context switch, so that the cgroup only appears empty after the task is truly done running. This also reorders operations so that subsys->exit() is now called before unlinking from the cgroup, which shouldn't break anything. 
Cc: Dan Schatzberg
Cc: Peter Zijlstra
Signed-off-by: Tejun Heo
---
 include/linux/cgroup.h |  2 ++
 kernel/cgroup/cgroup.c | 23 ++++++++++++++---------
 kernel/sched/core.c    |  2 ++
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4068035176c4..bc892e3b37ee 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -138,6 +138,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
 extern void cgroup_post_fork(struct task_struct *p,
 			     struct kernel_clone_args *kargs);
 void cgroup_task_exit(struct task_struct *p);
+void cgroup_task_dead(struct task_struct *p);
 void cgroup_task_release(struct task_struct *p);
 void cgroup_task_free(struct task_struct *p);

@@ -681,6 +682,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
 static inline void cgroup_post_fork(struct task_struct *p,
 				    struct kernel_clone_args *kargs) {}
 static inline void cgroup_task_exit(struct task_struct *p) {}
+static inline void cgroup_task_dead(struct task_struct *p) {}
 static inline void cgroup_task_release(struct task_struct *p) {}
 static inline void cgroup_task_free(struct task_struct *p) {}

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b3c27900c5d2..aae180d56c8c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -944,7 +944,7 @@ static void css_set_move_task(struct task_struct *task,
 		/*
 		 * We are synchronized through cgroup_threadgroup_rwsem
 		 * against PF_EXITING setting such that we can't race
-		 * against cgroup_task_exit()/cgroup_task_free() dropping
+		 * against cgroup_task_dead()/cgroup_task_free() dropping
 		 * the css_set.
 		 */
 		WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -6982,10 +6982,20 @@ void cgroup_post_fork(struct task_struct *child,
 void cgroup_task_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
-	struct css_set *cset;
 	int i;

-	spin_lock_irq(&css_set_lock);
+	/* see cgroup_post_fork() for details */
+	do_each_subsys_mask(ss, i, have_exit_callback) {
+		ss->exit(tsk);
+	} while_each_subsys_mask();
+}
+
+void cgroup_task_dead(struct task_struct *tsk)
+{
+	struct css_set *cset;
+	unsigned long flags;
+
+	spin_lock_irqsave(&css_set_lock, flags);

 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
@@ -7003,12 +7013,7 @@ void cgroup_task_exit(struct task_struct *tsk)
 		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
 		cgroup_update_frozen(task_dfl_cgroup(tsk));

-	spin_unlock_irq(&css_set_lock);
-
-	/* see cgroup_post_fork() for details */
-	do_each_subsys_mask(ss, i, have_exit_callback) {
-		ss->exit(tsk);
-	} while_each_subsys_mask();
+	spin_unlock_irqrestore(&css_set_lock, flags);
 }

 void cgroup_task_release(struct task_struct *task)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1ebf67b48e2..40f12e37f60f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5222,6 +5222,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 		if (prev->sched_class->task_dead)
 			prev->sched_class->task_dead(prev);

+		cgroup_task_dead(prev);
+
 		/* Task is done with its stack. */
 		put_task_stack(prev);

From 55939cf28a48dad27d6906e57d3b45905bb0d001 Mon Sep 17 00:00:00 2001
From: Gabriele Monaco
Date: Tue, 4 Nov 2025 23:38:44 -0500
Subject: [PATCH 05/21] cgroup/cpuset: Rename update_unbound_workqueue_cpumask()
 to update_isolation_cpumasks()

update_unbound_workqueue_cpumask() updates unbound workqueue settings
when there's a change in isolated CPUs, but it can be used by other
subsystems requiring updates when isolated CPUs change.
Generalise the name to update_isolation_cpumasks() to prepare for other
functions unrelated to workqueues to be called in that spot.

[longman: Change the function name to update_isolation_cpumasks()]

Acked-by: Frederic Weisbecker
Acked-by: Waiman Long
Signed-off-by: Gabriele Monaco
Signed-off-by: Waiman Long
Reviewed-by: Chen Ridong
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 7aef59ea9627..da770dac955e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1393,7 +1393,7 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
 	return isolcpus_updated;
 }

-static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
+static void update_isolation_cpumasks(bool isolcpus_updated)
 {
 	int ret;

@@ -1557,7 +1557,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
 	list_add(&cs->remote_sibling, &remote_children);
 	cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
 	spin_unlock_irq(&callback_lock);
-	update_unbound_workqueue_cpumask(isolcpus_updated);
+	update_isolation_cpumasks(isolcpus_updated);
 	cpuset_force_rebuild();
 	cs->prs_err = 0;

@@ -1598,7 +1598,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
 	compute_excpus(cs, cs->effective_xcpus);
 	reset_partition_data(cs);
 	spin_unlock_irq(&callback_lock);
-	update_unbound_workqueue_cpumask(isolcpus_updated);
+	update_isolation_cpumasks(isolcpus_updated);
 	cpuset_force_rebuild();

 	/*
@@ -1667,7 +1667,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
 	if (xcpus)
 		cpumask_copy(cs->exclusive_cpus, xcpus);
 	spin_unlock_irq(&callback_lock);
-	update_unbound_workqueue_cpumask(isolcpus_updated);
+	update_isolation_cpumasks(isolcpus_updated);
 	if (adding || deleting)
 		cpuset_force_rebuild();

@@ -2011,7 +2011,7 @@ write_error:
 				  tmp->delmask);

 	spin_unlock_irq(&callback_lock);
-	update_unbound_workqueue_cpumask(isolcpus_updated);
+	update_isolation_cpumasks(isolcpus_updated);

 	if ((old_prs != new_prs) && (cmd == partcmd_update))
 		update_partition_exclusive_flag(cs, new_prs);
@@ -3029,7 +3029,7 @@ out:
 	else if (isolcpus_updated)
 		isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
 	spin_unlock_irq(&callback_lock);
-	update_unbound_workqueue_cpumask(isolcpus_updated);
+	update_isolation_cpumasks(isolcpus_updated);

 	/* Force update if switching back to member & update effective_xcpus */
 	update_cpumasks_hier(cs, &tmpmask, !new_prs);

From 103b08709e8a59876980a8edddf4e68f3a23e34e Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 4 Nov 2025 23:38:45 -0500
Subject: [PATCH 06/21] cgroup/cpuset: Fail if isolated and nohz_full don't
 leave any housekeeping

Currently the user can set up isolated cpus via cpuset and nohz_full in
such a way that leaves no housekeeping CPU (i.e. no CPU that is neither
domain isolated nor nohz full). This can be a problem for other
subsystems (e.g. the timer wheel migration).

Prevent this configuration by blocking any assignment that would cause
the union of domain isolated cpus and nohz_full to cover all CPUs.
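A rough userspace model of the new check may help: the sketch below
stands in uint64_t bitmasks for struct cpumask on a hypothetical 8-CPU
system (nohz_full=4-7, no "isolcpus=" at boot) and mirrors the logic of
the new isolated_cpus_can_update() helper; the housekeeping_enabled()
fast path and the cpumask allocation are omitted, and all names are
illustrative:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  static const uint64_t hk_kernel_noise = 0x0f;	/* CPUs not in nohz_full */
  static const uint64_t hk_domain = 0xff;	/* CPUs not isolated at boot */
  static const uint64_t cpu_active = 0xff;	/* online CPUs */
  static uint64_t isolated_cpus;		/* cpuset-isolated CPUs */

  /* false if isolating add_cpus would leave no housekeeping CPU */
  static bool isolated_cpus_can_update(uint64_t add_cpus, uint64_t del_cpus)
  {
  	uint64_t full_hk;

  	/* CPUs coming out of isolation replenish housekeeping: no conflict */
  	if (del_cpus & hk_kernel_noise)
  		return true;

  	/* "full" housekeeping CPUs: neither nohz_full nor domain isolated */
  	full_hk = hk_kernel_noise & hk_domain & ~isolated_cpus & cpu_active;

  	/* at least one of them must survive the new isolation request */
  	return (full_hk & ~add_cpus) != 0;
  }

  int main(void)
  {
  	/* CPU 3 survives as housekeeping: allowed */
  	printf("isolate 0-2: %s\n",
  	       isolated_cpus_can_update(0x07, 0) ? "ok" : "rejected");
  	/* CPUs 0-3 were the last housekeeping CPUs: rejected */
  	printf("isolate 0-3: %s\n",
  	       isolated_cpus_can_update(0x0f, 0) ? "ok" : "rejected");
  	return 0;
  }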
[longman: Remove isolated_cpus_should_update() and rewrite the checking in update_prstate() and update_parent_effective_cpumask()] Originally-by: Gabriele Monaco Signed-off-by: Waiman Long Reviewed-by: Chen Ridong Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 74 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index da770dac955e..99622e90991a 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1393,6 +1393,45 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, return isolcpus_updated; } +/* + * isolated_cpus_can_update - check for isolated & nohz_full conflicts + * @add_cpus: cpu mask for cpus that are going to be isolated + * @del_cpus: cpu mask for cpus that are no longer isolated, can be NULL + * Return: false if there is conflict, true otherwise + * + * If nohz_full is enabled and we have isolated CPUs, their combination must + * still leave housekeeping CPUs. + * + * TBD: Should consider merging this function into + * prstate_housekeeping_conflict(). + */ +static bool isolated_cpus_can_update(struct cpumask *add_cpus, + struct cpumask *del_cpus) +{ + cpumask_var_t full_hk_cpus; + int res = true; + + if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE)) + return true; + + if (del_cpus && cpumask_weight_and(del_cpus, + housekeeping_cpumask(HK_TYPE_KERNEL_NOISE))) + return true; + + if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL)) + return false; + + cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE), + housekeeping_cpumask(HK_TYPE_DOMAIN)); + cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus); + cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask); + if (!cpumask_weight_andnot(full_hk_cpus, add_cpus)) + res = false; + + free_cpumask_var(full_hk_cpus); + return res; +} + static void update_isolation_cpumasks(bool isolcpus_updated) { int ret; @@ -1551,6 +1590,9 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) return PERR_INVCPUS; + if ((new_prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->new_cpus, NULL)) + return PERR_HKEEPING; spin_lock_irq(&callback_lock); isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); @@ -1650,6 +1692,9 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) || cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)) cs->prs_err = PERR_NOCPUS; + else if ((prs == PRS_ISOLATED) && + !isolated_cpus_can_update(tmp->addmask, tmp->delmask)) + cs->prs_err = PERR_HKEEPING; if (cs->prs_err) goto invalidate; } @@ -1750,6 +1795,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int part_error = PERR_NONE; /* Partition error? 
 */
 	int isolcpus_updated = 0;
 	struct cpumask *xcpus = user_xcpus(cs);
+	int parent_prs = parent->partition_root_state;
 	bool nocpu;

 	lockdep_assert_held(&cpuset_mutex);
@@ -1813,6 +1859,10 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 		if (prstate_housekeeping_conflict(new_prs, xcpus))
 			return PERR_HKEEPING;

+		if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) &&
+		    !isolated_cpus_can_update(xcpus, NULL))
+			return PERR_HKEEPING;
+
 		if (tasks_nocpu_error(parent, cs, xcpus))
 			return PERR_NOCPUS;

@@ -1866,6 +1916,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 		 *
 		 * For invalid partition:
 		 *   delmask = newmask & parent->effective_xcpus
+		 * The partition may become valid soon.
 		 */
 		if (is_partition_invalid(cs)) {
 			adding = false;
@@ -1880,6 +1931,23 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 			deleting = cpumask_and(tmp->delmask, tmp->delmask,
 					       parent->effective_xcpus);
 		}
+
+		/*
+		 * TBD: Invalidating a currently valid child root partition may
+		 * still break the isolated_cpus_can_update() rule if parent is
+		 * an isolated partition.
+		 */
+		if (is_partition_valid(cs) && (old_prs != parent_prs)) {
+			if ((parent_prs == PRS_ROOT) &&
+			    /* Adding to parent means removing isolated CPUs */
+			    !isolated_cpus_can_update(tmp->delmask, tmp->addmask))
+				part_error = PERR_HKEEPING;
+			if ((parent_prs == PRS_ISOLATED) &&
+			    /* Adding to parent means adding isolated CPUs */
+			    !isolated_cpus_can_update(tmp->addmask, tmp->delmask))
+				part_error = PERR_HKEEPING;
+		}
+
 		/*
 		 * The new CPUs to be removed from parent's effective CPUs
 		 * must be present.
@@ -2994,7 +3062,11 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		 * A change in load balance state only, no change in cpumasks.
 		 * Need to update isolated_cpus.
 		 */
-		isolcpus_updated = true;
+		if ((new_prs == PRS_ISOLATED) &&
+		    !isolated_cpus_can_update(cs->effective_xcpus, NULL))
+			err = PERR_HKEEPING;
+		else
+			isolcpus_updated = true;
 	} else {
 		/*
 		 * Switching back to member is always allowed even if it

From 6cfeddbf4ade9202849d75c27c4d0c82b42c73d1 Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 4 Nov 2025 23:38:46 -0500
Subject: [PATCH 07/21] cgroup/cpuset: Move up prstate_housekeeping_conflict()
 helper

Move up the prstate_housekeeping_conflict() helper so that it can be
used in remote partition code.

Signed-off-by: Waiman Long
Reviewed-by: Chen Ridong
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 99622e90991a..cc9c3402f16b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1432,6 +1432,26 @@ static bool isolated_cpus_can_update(struct cpumask *add_cpus,
 	return res;
 }

+/*
+ * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
+ * @prstate: partition root state to be checked
+ * @new_cpus: cpu mask
+ * Return: true if there is conflict, false otherwise
+ *
+ * CPUs outside of boot_hk_cpus, if defined, can only be used in an
+ * isolated partition.
+ */ +static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) +{ + if (!have_boot_isolcpus) + return false; + + if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) + return true; + + return false; +} + static void update_isolation_cpumasks(bool isolcpus_updated) { int ret; @@ -1727,26 +1747,6 @@ invalidate: remote_partition_disable(cs, tmp); } -/* - * prstate_housekeeping_conflict - check for partition & housekeeping conflicts - * @prstate: partition root state to be checked - * @new_cpus: cpu mask - * Return: true if there is conflict, false otherwise - * - * CPUs outside of boot_hk_cpus, if defined, can only be used in an - * isolated partition. - */ -static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) -{ - if (!have_boot_isolcpus) - return false; - - if ((prstate != PRS_ISOLATED) && !cpumask_subset(new_cpus, boot_hk_cpus)) - return true; - - return false; -} - /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state From b1034a690129acd8995137bf4462470b4a2aa690 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 4 Nov 2025 23:38:47 -0500 Subject: [PATCH 08/21] cgroup/cpuset: Ensure domain isolated CPUs stay in root or isolated partition Commit 4a74e418881f ("cgroup/cpuset: Check partition conflict with housekeeping setup") is supposed to ensure that domain isolated CPUs designated by the "isolcpus" boot command line option stay either in root partition or in isolated partitions. However, the required check wasn't implemented when a remote partition was created or when an existing partition changed type from "root" to "isolated". Even though this is a relatively minor issue, we still need to add the required prstate_housekeeping_conflict() call in the right places to ensure that the rule is strictly followed. The following steps can be used to reproduce the problem before this fix. 
# fmt -1 /proc/cmdline | grep isolcpus
isolcpus=9
# cd /sys/fs/cgroup/
# echo +cpuset > cgroup.subtree_control
# mkdir test
# echo 9 > test/cpuset.cpus
# echo isolated > test/cpuset.cpus.partition
# cat test/cpuset.cpus.partition
isolated
# cat test/cpuset.cpus.effective
9
# echo root > test/cpuset.cpus.partition
# cat test/cpuset.cpus.effective
9
# cat test/cpuset.cpus.partition
root

With this fix, the last few steps will become:

# echo root > test/cpuset.cpus.partition
# cat test/cpuset.cpus.effective
0-8,10-95
# cat test/cpuset.cpus.partition
root invalid (partition config conflicts with housekeeping setup)

Reported-by: Chen Ridong
Signed-off-by: Waiman Long
Reviewed-by: Chen Ridong
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index cc9c3402f16b..2daf58bf0bbb 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1610,8 +1610,9 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
 	if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
 	    cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
 		return PERR_INVCPUS;
-	if ((new_prs == PRS_ISOLATED) &&
-	    !isolated_cpus_can_update(tmp->new_cpus, NULL))
+	if (((new_prs == PRS_ISOLATED) &&
+	     !isolated_cpus_can_update(tmp->new_cpus, NULL)) ||
+	    prstate_housekeeping_conflict(new_prs, tmp->new_cpus))
 		return PERR_HKEEPING;

 	spin_lock_irq(&callback_lock);
@@ -3062,8 +3063,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		 * A change in load balance state only, no change in cpumasks.
 		 * Need to update isolated_cpus.
 		 */
-		if ((new_prs == PRS_ISOLATED) &&
-		    !isolated_cpus_can_update(cs->effective_xcpus, NULL))
+		if (((new_prs == PRS_ISOLATED) &&
+		     !isolated_cpus_can_update(cs->effective_xcpus, NULL)) ||
+		    prstate_housekeeping_conflict(new_prs, cs->effective_xcpus))
 			err = PERR_HKEEPING;
 		else
 			isolcpus_updated = true;

From be04e96ba911fac1dc4c7f89ebb42018d167043f Mon Sep 17 00:00:00 2001
From: Waiman Long
Date: Tue, 4 Nov 2025 23:38:48 -0500
Subject: [PATCH 09/21] cgroup/cpuset: Globally track isolated_cpus update

The current cpuset code passes a local isolcpus_updated flag around in
a number of functions to determine if external isolation related
cpumasks like wq_unbound_cpumask should be updated. It is a bit
cumbersome and makes the code more complex.

Simplify the code by using a global boolean flag "isolated_cpus_updating"
to track this. This flag will be set in isolated_cpus_update() and
cleared in update_isolation_cpumasks(). No functional change is
expected.

Signed-off-by: Waiman Long
Reviewed-by: Chen Ridong
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 73 ++++++++++++++++++++----------------------
 1 file changed, 35 insertions(+), 38 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2daf58bf0bbb..90288efe5367 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -81,6 +81,13 @@ static cpumask_var_t subpartitions_cpus;
  */
 static cpumask_var_t isolated_cpus;

+/*
+ * isolated_cpus updating flag (protected by cpuset_mutex)
+ * Set if isolated_cpus is going to be updated in the current
+ * cpuset_mutex critical section.
+ */ +static bool isolated_cpus_updating; + /* * Housekeeping (HK_TYPE_DOMAIN) CPUs at boot */ @@ -1327,6 +1334,8 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus cpumask_or(isolated_cpus, isolated_cpus, xcpus); else cpumask_andnot(isolated_cpus, isolated_cpus, xcpus); + + isolated_cpus_updating = true; } /* @@ -1334,15 +1343,12 @@ static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus * @new_prs: new partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be added - * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static bool partition_xcpus_add(int new_prs, struct cpuset *parent, +static void partition_xcpus_add(int new_prs, struct cpuset *parent, struct cpumask *xcpus) { - bool isolcpus_updated; - WARN_ON_ONCE(new_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1352,13 +1358,11 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus); - isolcpus_updated = (new_prs != parent->partition_root_state); - if (isolcpus_updated) + if (new_prs != parent->partition_root_state) isolated_cpus_update(parent->partition_root_state, new_prs, xcpus); cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus); - return isolcpus_updated; } /* @@ -1366,15 +1370,12 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent, * @old_prs: old partition_root_state * @parent: parent cpuset * @xcpus: exclusive CPUs to be removed - * Return: true if isolated_cpus modified, false otherwise * * Remote partition if parent == NULL */ -static bool partition_xcpus_del(int old_prs, struct cpuset *parent, +static void partition_xcpus_del(int old_prs, struct cpuset *parent, struct cpumask *xcpus) { - bool isolcpus_updated; - WARN_ON_ONCE(old_prs < 0); lockdep_assert_held(&callback_lock); if (!parent) @@ -1383,14 +1384,12 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent, if (parent == &top_cpuset) cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus); - isolcpus_updated = (old_prs != parent->partition_root_state); - if (isolcpus_updated) + if (old_prs != parent->partition_root_state) isolated_cpus_update(old_prs, parent->partition_root_state, xcpus); cpumask_and(xcpus, xcpus, cpu_active_mask); cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus); - return isolcpus_updated; } /* @@ -1452,17 +1451,24 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus) return false; } -static void update_isolation_cpumasks(bool isolcpus_updated) +/* + * update_isolation_cpumasks - Update external isolation related CPU masks + * + * The following external CPU masks will be updated if necessary: + * - workqueue unbound cpumask + */ +static void update_isolation_cpumasks(void) { int ret; - lockdep_assert_cpus_held(); - - if (!isolcpus_updated) + if (!isolated_cpus_updating) return; + lockdep_assert_cpus_held(); + ret = workqueue_unbound_exclude_cpumask(isolated_cpus); WARN_ON_ONCE(ret < 0); + isolated_cpus_updating = false; } /** @@ -1587,8 +1593,6 @@ static inline bool is_local_partition(struct cpuset *cs) static int remote_partition_enable(struct cpuset *cs, int new_prs, struct tmpmasks *tmp) { - bool isolcpus_updated; - /* * The user must have sysadmin privilege. 
*/ @@ -1616,11 +1620,11 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, return PERR_HKEEPING; spin_lock_irq(&callback_lock); - isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus); + partition_xcpus_add(new_prs, NULL, tmp->new_cpus); list_add(&cs->remote_sibling, &remote_children); cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); cpuset_force_rebuild(); cs->prs_err = 0; @@ -1643,15 +1647,12 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, */ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) { - bool isolcpus_updated; - WARN_ON_ONCE(!is_remote_partition(cs)); WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); list_del_init(&cs->remote_sibling); - isolcpus_updated = partition_xcpus_del(cs->partition_root_state, - NULL, cs->effective_xcpus); + partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus); if (cs->prs_err) cs->partition_root_state = -cs->partition_root_state; else @@ -1661,7 +1662,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) compute_excpus(cs, cs->effective_xcpus); reset_partition_data(cs); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); cpuset_force_rebuild(); /* @@ -1686,7 +1687,6 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, { bool adding, deleting; int prs = cs->partition_root_state; - int isolcpus_updated = 0; if (WARN_ON_ONCE(!is_remote_partition(cs))) return; @@ -1722,9 +1722,9 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, spin_lock_irq(&callback_lock); if (adding) - isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask); + partition_xcpus_add(prs, NULL, tmp->addmask); if (deleting) - isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask); + partition_xcpus_del(prs, NULL, tmp->delmask); /* * Need to update effective_xcpus and exclusive_cpus now as * update_sibling_cpumasks() below may iterate back to the same cs. @@ -1733,7 +1733,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus, if (xcpus) cpumask_copy(cs->exclusive_cpus, xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); if (adding || deleting) cpuset_force_rebuild(); @@ -1794,7 +1794,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, int deleting; /* Deleting cpus from parent's effective_cpus */ int old_prs, new_prs; int part_error = PERR_NONE; /* Partition error? */ - int isolcpus_updated = 0; struct cpumask *xcpus = user_xcpus(cs); int parent_prs = parent->partition_root_state; bool nocpu; @@ -2073,14 +2072,12 @@ write_error: * and vice versa. 
*/ if (adding) - isolcpus_updated += partition_xcpus_del(old_prs, parent, - tmp->addmask); + partition_xcpus_del(old_prs, parent, tmp->addmask); if (deleting) - isolcpus_updated += partition_xcpus_add(new_prs, parent, - tmp->delmask); + partition_xcpus_add(new_prs, parent, tmp->delmask); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); if ((old_prs != new_prs) && (cmd == partcmd_update)) update_partition_exclusive_flag(cs, new_prs); @@ -3103,7 +3100,7 @@ out: else if (isolcpus_updated) isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus); spin_unlock_irq(&callback_lock); - update_isolation_cpumasks(isolcpus_updated); + update_isolation_cpumasks(); /* Force update if switching back to member & update effective_xcpus */ update_cpumasks_hier(cs, &tmpmask, !new_prs); From 9311e6c29b348b005e79228ef6facd38ebcc73f9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Nov 2025 08:12:36 -1000 Subject: [PATCH 10/21] cgroup: Fix sleeping from invalid context warning on PREEMPT_RT cgroup_task_dead() is called from finish_task_switch() which runs with preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The function needs to acquire css_set_lock which is a regular spinlock that can sleep on RT kernels, leading to "sleeping function called from invalid context" warnings. css_set_lock is too large in scope to convert to a raw_spinlock. However, the unlinking operations don't need to run synchronously - they just need to complete after the task is done running. On PREEMPT_RT, defer the work through irq_work. While the work doesn't need to happen immediately, it can't be delayed indefinitely either as the dead task pins the cgroup and task_struct can be pinned indefinitely. Use the lazy version of irq_work to allow batching and lower impact while ensuring timely completion. v2: Use IRQ_WORK_INIT_LAZY instead of immediate irq_work and add explanation for why the work can't be delayed indefinitely (Sebastian Andrzej Siewior). 
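The shape of the deferral can be modeled in plain userspace C, with a
C11 atomic pointer standing in for the kernel's per-CPU llist and a
direct drain() call standing in for the lazy irq_work callback; a
single consumer is assumed, as with the per-CPU irq_work, and all
names are illustrative:

  #include <stdatomic.h>
  #include <stdio.h>

  struct node {
  	struct node *next;
  	int id;
  };

  static _Atomic(struct node *) dead_list;	/* models the per-CPU llist */

  /* models llist_add(): a lock-free push that is safe from any context */
  static void defer(struct node *n)
  {
  	struct node *head = atomic_load(&dead_list);

  	do {
  		n->next = head;
  	} while (!atomic_compare_exchange_weak(&dead_list, &head, n));
  	/* the kernel would now irq_work_queue() to schedule the drain */
  }

  /* models the irq_work callback: llist_del_all(), then process the batch */
  static void drain(void)
  {
  	struct node *n = atomic_exchange(&dead_list, NULL);

  	while (n) {
  		struct node *next = n->next;

  		printf("deferred cleanup for task %d\n", n->id);
  		n = next;
  	}
  }

  int main(void)
  {
  	struct node a = { .id = 1 }, b = { .id = 2 };

  	defer(&a);
  	defer(&b);
  	drain();	/* the batch runs later, outside the restricted context */
  	return 0;
  }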
Fixes: d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out")
Reported-by: Calvin Owens
Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
Signed-off-by: Tejun Heo
---
 include/linux/sched.h  |  5 +++-
 kernel/cgroup/cgroup.c | 55 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index cbb7340c5866..5e80d48488ef 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1324,7 +1324,10 @@ struct task_struct {
 	struct css_set __rcu *cgroups;
 	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
 	struct list_head cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+	struct llist_node cg_dead_lnode;
+#endif /* CONFIG_PREEMPT_RT */
+#endif /* CONFIG_CGROUPS */
 #ifdef CONFIG_X86_CPU_RESCTRL
 	u32 closid;
 	u32 rmid;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index aae180d56c8c..48019a661c08 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static void cgroup_rt_init(void);

 #ifdef CONFIG_DEBUG_CGROUP_REF
 #define CGROUP_REF_FN_ATTRS	noinline
@@ -6360,6 +6361,7 @@ int __init cgroup_init(void)
 	BUG_ON(ss_rstat_init(NULL));

 	get_user_ns(init_cgroup_ns.user_ns);
+	cgroup_rt_init();

 	cgroup_lock();

@@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct *tsk)
 	} while_each_subsys_mask();
 }

-void cgroup_task_dead(struct task_struct *tsk)
+static void do_cgroup_task_dead(struct task_struct *tsk)
 {
 	struct css_set *cset;
 	unsigned long flags;
@@ -7016,6 +7018,57 @@ void cgroup_task_dead(struct task_struct *tsk)
 	spin_unlock_irqrestore(&css_set_lock, flags);
 }

+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this leads to a "sleeping function called from invalid context" warning.
+ * css_set_lock is too big to become a raw_spinlock. The task_dead path doesn't
+ * need to run synchronously but can't be delayed indefinitely either as the
+ * dead task pins the cgroup and task_struct can be pinned indefinitely. Bounce
+ * through lazy irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+	struct llist_node *lnode;
+	struct task_struct *task, *next;
+
+	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+		do_cgroup_task_dead(task);
+		put_task_struct(task);
+	}
+}
+
+static void __init cgroup_rt_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+		per_cpu(cgrp_dead_tasks_iwork, cpu) =
+			IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+	}
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	get_task_struct(task);
+	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+	irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
 void cgroup_task_release(struct task_struct *task)
 {
 	struct cgroup_subsys *ss;

From 01a743550b46eba1dacbd593ccc094781b882d76 Mon Sep 17 00:00:00 2001
From: Bert Karwatzki
Date: Tue, 11 Nov 2025 18:01:43 +0100
Subject: [PATCH 11/21] cgroup: include missing header for struct irq_work

To compile cgroup.c with PREEMPT_RT=y, include <linux/irq_work.h>,
which declares struct irq_work.

Fixes: 9311e6c29b34 ("cgroup: Fix sleeping from invalid context warning on PREEMPT_RT")
Signed-off-by: Bert Karwatzki
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 48019a661c08..f6cc504dfe1c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -60,6 +60,7 @@
 #include
 #include
 #include
+#include <linux/irq_work.h>
 #include

 #define CREATE_TRACE_POINTS

From 0241e9e2bda3077ca6ec90f0a22120ba7a73e43b Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Tue, 11 Nov 2025 13:24:27 +0000
Subject: [PATCH 12/21] cpuset: simplify node setting on error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There is no need to jump to the 'done' label upon failure, as no
cleanup is required. Return the error code directly instead.

Signed-off-by: Chen Ridong
Reviewed-by: Waiman Long
Reviewed-by: Michal Koutný
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 90288efe5367..7830c1b68205 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2897,21 +2897,19 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	 */
 	retval = nodelist_parse(buf, trialcs->mems_allowed);
 	if (retval < 0)
-		goto done;
+		return retval;

 	if (!nodes_subset(trialcs->mems_allowed,
-			  top_cpuset.mems_allowed)) {
-		retval = -EINVAL;
-		goto done;
-	}
+			  top_cpuset.mems_allowed))
+		return -EINVAL;
+
+	/* No change?
nothing to do */ + if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) + return 0; - if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { - retval = 0; /* Too easy - nothing to do */ - goto done; - } retval = validate_change(cs, trialcs); if (retval < 0) - goto done; + return retval; check_insane_mems_config(&trialcs->mems_allowed); @@ -2921,8 +2919,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); -done: - return retval; + return 0; } bool current_cpuset_is_being_rebound(void) From 648d43da64f0221ab1050825b28995c17ce091a4 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 11 Nov 2025 13:24:28 +0000 Subject: [PATCH 13/21] cpuset: remove global remote_children list The remote_children list is used to track all remote partitions attached to a cpuset. However, it serves no other purpose. Using a boolean flag to indicate whether a cpuset is a remote partition is a more direct approach, making remote_children unnecessary. This patch replaces the list with a remote_partition flag in the cpuset structure and removes remote_children entirely. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-internal.h | 10 +++++++--- kernel/cgroup/cpuset.c | 13 ++++--------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h index 5cac42c5fd97..01976c8e7d49 100644 --- a/kernel/cgroup/cpuset-internal.h +++ b/kernel/cgroup/cpuset-internal.h @@ -158,6 +158,13 @@ struct cpuset { /* partition root state */ int partition_root_state; + /* + * Whether cpuset is a remote partition. + * It used to be a list anchoring all remote partitions — we can switch back + * to a list if we need to iterate over the remote partitions. + */ + bool remote_partition; + /* * number of SCHED_DEADLINE tasks attached to this cpuset, so that we * know when to rebuild associated root domain bandwidth information. @@ -172,9 +179,6 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; - /* Remote partition silbling list anchored at remote_children */ - struct list_head remote_sibling; - /* Used to merge intersecting subsets for generate_sched_domains */ struct uf_node node; }; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 7830c1b68205..ca3d3f2450ae 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -94,9 +94,6 @@ static bool isolated_cpus_updating; static cpumask_var_t boot_hk_cpus; static bool have_boot_isolcpus; -/* List of remote partition root children */ -static struct list_head remote_children; - /* * A flag to force sched domain rebuild at the end of an operation. 
* It can be set in @@ -219,7 +216,7 @@ static struct cpuset top_cpuset = { BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, .relax_domain_level = -1, - .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), + .remote_partition = false, }; /* @@ -1572,7 +1569,7 @@ static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) static inline bool is_remote_partition(struct cpuset *cs) { - return !list_empty(&cs->remote_sibling); + return cs->remote_partition; } static inline bool is_local_partition(struct cpuset *cs) @@ -1621,7 +1618,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs, spin_lock_irq(&callback_lock); partition_xcpus_add(new_prs, NULL, tmp->new_cpus); - list_add(&cs->remote_sibling, &remote_children); + cs->remote_partition = true; cpumask_copy(cs->effective_xcpus, tmp->new_cpus); spin_unlock_irq(&callback_lock); update_isolation_cpumasks(); @@ -1651,7 +1648,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); spin_lock_irq(&callback_lock); - list_del_init(&cs->remote_sibling); + cs->remote_partition = false; partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus); if (cs->prs_err) cs->partition_root_state = -cs->partition_root_state; @@ -3603,7 +3600,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; - INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cpuset_v2()) @@ -3874,7 +3870,6 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); From f23cb0ced8fb28ba65bf4ddaa2fcaf044c6894cc Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 11 Nov 2025 13:24:29 +0000 Subject: [PATCH 14/21] cpuset: remove need_rebuild_sched_domains Previously, update_cpumasks_hier() used need_rebuild_sched_domains to decide whether to invoke rebuild_sched_domains_locked(). Now that rebuild_sched_domains_locked() only sets force_rebuild, the flag is redundant. Hence, remove it. Signed-off-by: Chen Ridong Reviewed-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca3d3f2450ae..976bce6e5673 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2184,7 +2184,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, { struct cpuset *cp; struct cgroup_subsys_state *pos_css; - bool need_rebuild_sched_domains = false; int old_prs, new_prs; rcu_read_lock(); @@ -2348,15 +2347,12 @@ get_css: if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && (!cpuset_v2() || is_partition_valid(cp))) - need_rebuild_sched_domains = true; + cpuset_force_rebuild(); rcu_read_lock(); css_put(&cp->css); } rcu_read_unlock(); - - if (need_rebuild_sched_domains) - cpuset_force_rebuild(); } /** From 1dc830ee4c155bff572a43a66b948e1c46483d6a Mon Sep 17 00:00:00 2001 From: Guopeng Zhang Date: Fri, 14 Nov 2025 18:24:40 +0800 Subject: [PATCH 15/21] selftests/cgroup: conform test to KTAP format output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Conform the layout, informational and status messages to KTAP. 
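Every converted binary ends up with the same driver-loop shape; a
condensed sketch is below (the test table is hypothetical, while the
ksft_*() helpers are the real ones from
tools/testing/selftests/kselftest.h):

  #include "../kselftest.h"

  static int test_example(void)
  {
  	return KSFT_PASS;	/* stand-in for a real test body */
  }

  static struct {
  	int (*fn)(void);
  	const char *name;
  } tests[] = {
  	{ test_example, "example" },
  };

  int main(int argc, char *argv[])
  {
  	int i;

  	ksft_print_header();			/* emits "TAP version 13" */
  	ksft_set_plan(ARRAY_SIZE(tests));	/* emits the "1..N" plan line */

  	for (i = 0; i < ARRAY_SIZE(tests); i++) {
  		switch (tests[i].fn()) {
  		case KSFT_PASS:
  			ksft_test_result_pass("%s\n", tests[i].name);
  			break;
  		case KSFT_SKIP:
  			ksft_test_result_skip("%s\n", tests[i].name);
  			break;
  		default:
  			ksft_test_result_fail("%s\n", tests[i].name);
  			break;
  		}
  	}

  	/* prints the totals and exits with the aggregate result */
  	ksft_finished();
  }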
No functional change is intended other than the layout of output messages. Signed-off-by: Guopeng Zhang Suggested-by: Sebastian Chlad Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_core.c | 7 ++++--- tools/testing/selftests/cgroup/test_cpu.c | 7 ++++--- tools/testing/selftests/cgroup/test_cpuset.c | 7 ++++--- tools/testing/selftests/cgroup/test_freezer.c | 7 ++++--- tools/testing/selftests/cgroup/test_kill.c | 7 ++++--- tools/testing/selftests/cgroup/test_kmem.c | 7 ++++--- tools/testing/selftests/cgroup/test_memcontrol.c | 7 ++++--- tools/testing/selftests/cgroup/test_zswap.c | 7 ++++--- 8 files changed, 32 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index a360e2eb2eef..1d778c8b7764 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -923,8 +923,10 @@ struct corecg_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) { if (setup_named_v1_root(root, sizeof(root), CG_NAMED_NAME)) ksft_exit_skip("cgroup v2 isn't mounted and could not setup named v1 hierarchy\n"); @@ -946,12 +948,11 @@ post_v2_setup: ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } cleanup_named_v1_root(root); - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_cpu.c b/tools/testing/selftests/cgroup/test_cpu.c index 2a60e6c41940..36be3e820df2 100644 --- a/tools/testing/selftests/cgroup/test_cpu.c +++ b/tools/testing/selftests/cgroup/test_cpu.c @@ -796,8 +796,10 @@ struct cpucg_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -814,11 +816,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_cpuset.c b/tools/testing/selftests/cgroup/test_cpuset.c index 4034d14ba69a..8086d2ea394f 100644 --- a/tools/testing/selftests/cgroup/test_cpuset.c +++ b/tools/testing/selftests/cgroup/test_cpuset.c @@ -247,8 +247,10 @@ struct cpuset_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -265,11 +267,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index dfb763819581..465cdad2bfca 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -1488,8 +1488,10 @@ struct cgfreezer_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if 
(cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { @@ -1501,11 +1503,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c index 0e5bb6c7307a..ed590b150a17 100644 --- a/tools/testing/selftests/cgroup/test_kill.c +++ b/tools/testing/selftests/cgroup/test_kill.c @@ -274,8 +274,10 @@ struct cgkill_test { int main(int argc, char *argv[]) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); for (i = 0; i < ARRAY_SIZE(tests); i++) { @@ -287,11 +289,10 @@ int main(int argc, char *argv[]) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index 63b3c9aad399..d4c4a514ee43 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -421,8 +421,10 @@ struct kmem_test { int main(int argc, char **argv) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -446,11 +448,10 @@ int main(int argc, char **argv) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c index a680f773f2d5..b117325c0439 100644 --- a/tools/testing/selftests/cgroup/test_memcontrol.c +++ b/tools/testing/selftests/cgroup/test_memcontrol.c @@ -1650,8 +1650,10 @@ struct memcg_test { int main(int argc, char **argv) { char root[PATH_MAX]; - int i, proc_status, ret = EXIT_SUCCESS; + int i, proc_status; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -1685,11 +1687,10 @@ int main(int argc, char **argv) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index e1f578ca2841..86a8930b47e3 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -597,8 +597,10 @@ static bool zswap_configured(void) int main(int argc, char **argv) { char root[PATH_MAX]; - int i, ret = EXIT_SUCCESS; + int i; + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(tests)); if (cg_find_unified_root(root, sizeof(root), NULL)) ksft_exit_skip("cgroup v2 isn't mounted\n"); @@ -625,11 +627,10 @@ int main(int argc, char **argv) ksft_test_result_skip("%s\n", tests[i].name); break; default: - ret = EXIT_FAILURE; ksft_test_result_fail("%s\n", tests[i].name); break; } } - return ret; + ksft_finished(); } From 
From e27179958ce76b182ea38718113cdff1d2bb7d10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?=
Date: Fri, 14 Nov 2025 19:21:25 +0100
Subject: [PATCH 16/21] docs: cgroup: Explain reclaim protection target
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The protection target is necessary to understand how effective reclaim
protection applies in the hierarchy.

Signed-off-by: Michal Koutný
Signed-off-by: Tejun Heo
---
 Documentation/admin-guide/cgroup-v2.rst | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 0e6c67ac585a..97a9f8a046c5 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -53,7 +53,8 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst

Date: Fri, 14 Nov 2025 19:21:26 +0100
Subject: [PATCH 17/21] docs: cgroup: Note about sibling relative reclaim protection
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Michal Koutný
Signed-off-by: Tejun Heo
---
 Documentation/admin-guide/cgroup-v2.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 97a9f8a046c5..e0a659474fa4 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1952,6 +1952,10 @@ When the reclaim targets ancestors of A, the effective protection of B is
 capped by the protection value configured for A (and any other intermediate
 ancestors between A and the target).
 
+To express indifference about relative sibling protection, it is suggested to
+use memory_recursiveprot. Configuring all descendants of a parent with finite
+protection to "max" works but it may unnecessarily skew the
+memory.events:low field.
 
 Memory Ownership
 ~~~~~~~~~~~~~~~~

From a0131c39270de634c33950a799d8870da2191974 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Koutn=C3=BD?=
Date: Fri, 14 Nov 2025 19:21:27 +0100
Subject: [PATCH 18/21] docs: cgroup: No special handling of unpopulated memcgs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current kernel doesn't treat unpopulated cgroups in any special way
with regard to reclaim protection. Furthermore, this wasn't the case
even when the behavior was documented in bf8d5d52ffe89 ("memcg:
introduce memory.min"). Drop the incorrect documentation.

(An implementation taking the inner-node constraint into account may be
added later.)

Signed-off-by: Michal Koutný
Signed-off-by: Tejun Heo
---
 Documentation/admin-guide/cgroup-v2.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index e0a659474fa4..4c072e85acdf 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1327,9 +1327,6 @@ PAGE_SIZE multiple when read back.
 	Putting more memory than generally available under this
 	protection is discouraged and may lead to constant OOMs.
 
-	If a memory cgroup is not populated with processes,
-	its memory.min is ignored.
-
   memory.low
 	A read-write single value file which exists on non-root
 	cgroups. The default is "0".
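For illustration, the ancestor-capping rule documented above can be restated as a small user-space sketch. This is an editorial aide, not kernel code: it models only the capping of effective protection by configured ancestor values, ignores usage-proportional distribution and memory_recursiveprot, and all names in it are invented.

  #include <stdio.h>

  /* Illustrative only: one node of a memcg-like hierarchy. */
  struct node {
          struct node *parent;
          unsigned long low;              /* configured memory.low */
  };

  /*
   * Effective protection of @n when reclaim targets @target: the
   * configured value, capped by every ancestor between @n and the
   * reclaim target, per the capping rule quoted above.
   */
  static unsigned long effective_low(const struct node *n,
                                     const struct node *target)
  {
          unsigned long e = n->low;
          const struct node *p;

          for (p = n->parent; p && p != target; p = p->parent)
                  if (p->low < e)
                          e = p->low;
          return e;
  }

  int main(void)
  {
          struct node root = { .parent = NULL, .low = 0 };
          struct node a = { .parent = &root, .low = 10 };
          struct node b = { .parent = &a, .low = 50 };

          /* B asks for 50, but A only passes down 10. */
          printf("effective low of B: %lu\n", effective_low(&b, &root));
          return 0;
  }

Running this prints 10: B's larger request cannot exceed what A is configured to protect, which is exactly the capping behavior the documentation describes.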
From 1f382215119a0bc165e766e5bc424b3d3e8dae35 Mon Sep 17 00:00:00 2001
From: Pingfan Liu
Date: Wed, 19 Nov 2025 17:55:24 +0800
Subject: [PATCH 19/21] cgroup/cpuset: Introduce cpuset_cpus_allowed_locked()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

cpuset_cpus_allowed() uses a reader lock that is sleepable under RT,
which means it cannot be called inside a raw_spinlock_t context.

Introduce a new cpuset_cpus_allowed_locked() helper that performs the
same function as cpuset_cpus_allowed() except that the caller must have
acquired the cpuset_mutex, so that no further locking is needed.

Suggested-by: Waiman Long
Signed-off-by: Pingfan Liu
Cc: Waiman Long
Cc: Tejun Heo
Cc: Johannes Weiner
Cc: Michal Koutný
Cc: linux-kernel@vger.kernel.org
To: cgroups@vger.kernel.org
Reviewed-by: Chen Ridong
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 include/linux/cpuset.h |  9 +++++++-
 kernel/cgroup/cpuset.c | 51 +++++++++++++++++++++++++++++-------------
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 2ddb256187b5..a98d3330385c 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -74,6 +74,7 @@ extern void inc_dl_tasks_cs(struct task_struct *task);
 extern void dec_dl_tasks_cs(struct task_struct *task);
 extern void cpuset_lock(void);
 extern void cpuset_unlock(void);
+extern void cpuset_cpus_allowed_locked(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern bool cpuset_cpu_is_isolated(int cpu);
@@ -195,10 +196,16 @@ static inline void dec_dl_tasks_cs(struct task_struct *task) { }
 static inline void cpuset_lock(void) { }
 static inline void cpuset_unlock(void) { }
 
+static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
+					      struct cpumask *mask)
+{
+	cpumask_copy(mask, task_cpu_possible_mask(p));
+}
+
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
-	cpumask_copy(mask, task_cpu_possible_mask(p));
+	cpuset_cpus_allowed_locked(p, mask);
 }
 
 static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 976bce6e5673..ec8bebc66469 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -4160,24 +4160,13 @@ void __init cpuset_init_smp(void)
 	BUG_ON(!cpuset_migrate_mm_wq);
 }
 
-/**
- * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
- * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
- * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
- *
- * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
- * attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_active_mask, even if this means going outside the
- * tasks cpuset, except when the task is in the top cpuset.
- **/
-
-void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+/*
+ * Return cpus_allowed mask from a task's cpuset.
+ */
+static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
 {
-	unsigned long flags;
 	struct cpuset *cs;
 
-	spin_lock_irqsave(&callback_lock, flags);
-
 	cs = task_cs(tsk);
 	if (cs != &top_cpuset)
 		guarantee_active_cpus(tsk, pmask);
@@ -4197,7 +4186,39 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 		if (!cpumask_intersects(pmask, cpu_active_mask))
 			cpumask_copy(pmask, possible_mask);
 	}
+}
 
+/**
+ * cpuset_cpus_allowed_locked - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Similar to cpuset_cpus_allowed() except that the caller must have acquired
+ * cpuset_mutex.
+ */
+void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+{
+	lockdep_assert_held(&cpuset_mutex);
+	__cpuset_cpus_allowed_locked(tsk, pmask);
+}
+
+/**
+ * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
+ * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
+ *
+ * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
+ * attached to the specified @tsk. Guaranteed to return some non-empty
+ * subset of cpu_active_mask, even if this means going outside the
+ * tasks cpuset, except when the task is in the top cpuset.
+ **/
+
+void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&callback_lock, flags);
+	__cpuset_cpus_allowed_locked(tsk, pmask);
 	spin_unlock_irqrestore(&callback_lock, flags);
 }

From 318e18ed22e89397635e15095c014accaf47ed30 Mon Sep 17 00:00:00 2001
From: Pingfan Liu
Date: Wed, 19 Nov 2025 17:55:25 +0800
Subject: [PATCH 20/21] sched/deadline: Walk up cpuset hierarchy to decide root domain when hot-unplug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

*** Bug description ***

When testing kexec-reboot on a 144-CPU machine with
isolcpus=managed_irq,domain,1-71,73-143 on the kernel command line, I
encountered the following bug:

[ 97.114759] psci: CPU142 killed (polled 0 ms)
[ 97.333236] Failed to offline CPU143 - error=-16
[ 97.333246] ------------[ cut here ]------------
[ 97.342682] kernel BUG at kernel/cpu.c:1569!
[ 97.347049] Internal error: Oops - BUG: 00000000f2000800 [#1] SMP
[...]

In essence, the issue originates from the CPU hot-removal process and is
not limited to kexec. It can be reproduced by writing a SCHED_DEADLINE
program that waits indefinitely on a semaphore, spawning multiple
instances to ensure some run on CPU 72, and then offlining CPUs 1–143
one by one. When attempting this, CPU 143 failed to go offline:

bash -c 'taskset -cp 0 $$ && for i in {1..143}; do echo 0 > /sys/devices/system/cpu/cpu$i/online 2>/dev/null; done'

Tracking down this issue, I found that dl_bw_deactivate() returned
-EBUSY, which caused sched_cpu_deactivate() to fail on the last CPU. But
the root domain was not actually over budget; the spurious -EBUSY is
contributed by the following factors:

When a CPU is inactive, cpu_rq()->rd is set to def_root_domain. A
blocked-state deadline task (in this case, "cppc_fie") was not migrated
to CPU0, so its task_rq() information is stale: its rq->rd points to
def_root_domain instead of the root domain shared with CPU0. As a
result, its bandwidth is accounted to the wrong root domain during the
domain rebuild.
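For concreteness, below is a minimal user-space sketch of the kind of reproducer described above. It is an illustration under stated assumptions, not the original test program: the raw sched_setattr() syscall is used because glibc provides no wrapper, the local struct sched_attr mirrors the UAPI layout, and the 1ms/100ms reservation values are arbitrary placeholders.

  #define _GNU_SOURCE
  #include <semaphore.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  #ifndef SCHED_DEADLINE
  #define SCHED_DEADLINE  6
  #endif

  /* Minimal local definition; the UAPI one lives in <linux/sched/types.h>. */
  struct sched_attr {
          uint32_t size;
          uint32_t sched_policy;
          uint64_t sched_flags;
          int32_t  sched_nice;
          uint32_t sched_priority;
          uint64_t sched_runtime;
          uint64_t sched_deadline;
          uint64_t sched_period;
  };

  /* glibc has no wrapper for this syscall. */
  static int sched_setattr(pid_t pid, struct sched_attr *attr, unsigned int flags)
  {
          return syscall(SYS_sched_setattr, pid, attr, flags);
  }

  int main(void)
  {
          struct sched_attr attr = {
                  .size           = sizeof(attr),
                  .sched_policy   = SCHED_DEADLINE,
                  .sched_runtime  = 1000000,      /* 1ms every 100ms, arbitrary */
                  .sched_deadline = 100000000,
                  .sched_period   = 100000000,
          };
          sem_t sem;

          if (sched_setattr(0, &attr, 0)) {
                  perror("sched_setattr");
                  return 1;
          }
          sem_init(&sem, 0, 0);
          sem_wait(&sem); /* block forever while holding a DL reservation */
          return 0;
  }

Spawn enough copies that some land on the CPUs of interest, then run the offline loop quoted above; on older glibc, link with -pthread for the semaphore calls.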
*** Issue ***

The key point is that a root_domain is only tracked through active
rq->rd. To avoid introducing a global data structure that tracks all
root_domains in the system, there should be a way to locate an active
CPU within the corresponding root_domain.

*** Solution ***

To locate an active CPU, two rules of the deadline sub-system are
useful:
1. Any CPU belongs to exactly one root domain at a given time.
2. The DL bandwidth checker ensures that the root domain has active
   CPUs.

Now, let's examine the blocked-state task P. If P is attached to a
cpuset that is a partition root, it is straightforward to find an active
CPU. If P is attached to a cpuset that has changed from 'root' to
'member', the active CPUs are grouped into the parent root domain.
Naturally, the CPUs' capacity and reserved DL bandwidth are taken into
account in the ancestor root domain. (In practice, it may be unsafe to
attach P to an arbitrary root domain, since that domain may lack
sufficient DL bandwidth for P.) Again, it is straightforward to find an
active CPU in the ancestor root domain.

This patch groups CPUs into isolated and housekeeping sets. For the
housekeeping group, it walks up the cpuset hierarchy to find active CPUs
in P's root domain and retrieves the valid rd from cpu_rq(cpu)->rd.

Signed-off-by: Pingfan Liu
Cc: Waiman Long
Cc: Chen Ridong
Cc: Peter Zijlstra
Cc: Juri Lelli
Cc: Pierre Gondois
Cc: Ingo Molnar
Cc: Vincent Guittot
Cc: Dietmar Eggemann
Cc: Steven Rostedt
Cc: Ben Segall
Cc: Mel Gorman
Cc: Valentin Schneider
To: linux-kernel@vger.kernel.org
Signed-off-by: Tejun Heo
---
 kernel/sched/deadline.c | 54 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7b7671060bf9..194a341e8586 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2465,6 +2465,7 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu
 	return NULL;
 }
 
+/* Access rule: must be called on local CPU with preemption disabled */
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
 
 static int find_later_rq(struct task_struct *task)
@@ -2907,11 +2908,43 @@ void __init init_sched_dl_class(void)
 					GFP_KERNEL, cpu_to_node(i));
 }
 
+/*
+ * This function always returns a non-empty bitmap in @cpus. This is because
+ * if a root domain has reserved bandwidth for DL tasks, the DL bandwidth
+ * check will prevent CPU hotplug from deactivating all CPUs in that domain.
+ */
+static void dl_get_task_effective_cpus(struct task_struct *p, struct cpumask *cpus)
+{
+	const struct cpumask *hk_msk;
+
+	hk_msk = housekeeping_cpumask(HK_TYPE_DOMAIN);
+	if (housekeeping_enabled(HK_TYPE_DOMAIN)) {
+		if (!cpumask_intersects(p->cpus_ptr, hk_msk)) {
+			/*
+			 * CPUs isolated by isolcpus="domain" always belong to
+			 * def_root_domain.
+			 */
+			cpumask_andnot(cpus, cpu_active_mask, hk_msk);
+			return;
+		}
+	}
+
+	/*
+	 * If a root domain holds a DL task, it must have active CPUs. So
+	 * active CPUs can always be found by walking the task's cpuset
+	 * hierarchy up to the partition root.
+	 */
+	cpuset_cpus_allowed_locked(p, cpus);
+}
+
+/* The caller should hold cpuset_mutex */
 void dl_add_task_root_domain(struct task_struct *p)
 {
 	struct rq_flags rf;
 	struct rq *rq;
 	struct dl_bw *dl_b;
+	unsigned int cpu;
+	struct cpumask *msk = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
 
 	raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
 	if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
@@ -2919,16 +2952,25 @@ void dl_add_task_root_domain(struct task_struct *p)
 		return;
 	}
 
-	rq = __task_rq_lock(p, &rf);
-
+	/*
+	 * Get an active rq, whose rq->rd traces the correct root
+	 * domain.
+	 * Ideally this would be under cpuset reader lock until rq->rd is
+	 * fetched. However, sleepable locks cannot nest inside pi_lock, so we
+	 * rely on the caller of dl_add_task_root_domain() holding 'cpuset_mutex'
+	 * to guarantee the CPU stays in the cpuset.
+	 */
+	dl_get_task_effective_cpus(p, msk);
+	cpu = cpumask_first_and(cpu_active_mask, msk);
+	BUG_ON(cpu >= nr_cpu_ids);
+	rq = cpu_rq(cpu);
 	dl_b = &rq->rd->dl_bw;
+	/* End of fetching rd */
+
 	raw_spin_lock(&dl_b->lock);
-
 	__dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
-
 	raw_spin_unlock(&dl_b->lock);
-
-	task_rq_unlock(rq, p, &rf);
+	raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
 }
 
 void dl_clear_root_domain(struct root_domain *rd)

From b1bcaed1e39a9e0dfbe324a15d2ca4253deda316 Mon Sep 17 00:00:00 2001
From: Chen Ridong
Date: Fri, 14 Nov 2025 02:08:47 +0000
Subject: [PATCH 21/21] cpuset: Treat cpusets in attaching as populated

Currently, the check for whether a partition is populated does not
account for tasks that are in the middle of attaching to the cpuset.
This is a corner case that can leave a task stuck in a partition with
no effective CPUs. The race condition occurs as follows:

cpu0                              cpu1
                                  // cpuset A with cpu N
migrate task p to A
cpuset_can_attach
  // with effective cpus
  // check ok
// cpuset_mutex is not held
                                  // clear cpuset.cpus.exclusive
                                  // making effective cpus empty
                                  update_exclusive_cpumask
                                  // tasks_nocpu_error check ok
                                  // empty effective cpus, partition valid
cpuset_attach
...
// task p stays in A, with non-effective cpus.

To fix this issue, this patch introduces cpuset_is_populated(), which
also considers tasks attaching to the cpuset. This new helper is used
in validate_change() and partition_is_populated().
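As a reading aid, the model below restates the window in runnable user-space form. It is a simplified illustration based on the description above, not kernel code: the _model names are invented, and the attach window is paraphrased as "raised in ->can_attach(), dropped in ->attach()", which is an assumption about where the counter is maintained.

  #include <assert.h>
  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  /* User-space model; field names mirror the cpuset fields for illustration. */
  struct cpuset_model {
          int attach_in_progress;
          int nr_member_tasks;
  };

  static pthread_mutex_t cpuset_mutex_model = PTHREAD_MUTEX_INITIALIZER;

  /* The window opens in the ->can_attach() step... */
  static void can_attach_model(struct cpuset_model *cs)
  {
          pthread_mutex_lock(&cpuset_mutex_model);
          cs->attach_in_progress++;
          pthread_mutex_unlock(&cpuset_mutex_model);
  }

  /* ...and closes once ->attach() completes. */
  static void attach_model(struct cpuset_model *cs)
  {
          pthread_mutex_lock(&cpuset_mutex_model);
          cs->nr_member_tasks++;
          cs->attach_in_progress--;
          pthread_mutex_unlock(&cpuset_mutex_model);
  }

  /* The fixed check: an in-flight attach counts as population. */
  static bool is_populated_model(struct cpuset_model *cs)
  {
          return cs->nr_member_tasks > 0 || cs->attach_in_progress > 0;
  }

  int main(void)
  {
          struct cpuset_model cs = { 0, 0 };

          can_attach_model(&cs);
          assert(is_populated_model(&cs));        /* window is covered */
          attach_model(&cs);
          assert(is_populated_model(&cs));
          printf("attach window is covered by the populated check\n");
          return 0;
  }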
Fixes: e2d59900d936 ("cgroup/cpuset: Allow no-task partition to have empty cpuset.cpus.effective")
Signed-off-by: Chen Ridong
Reviewed-by: Waiman Long
Signed-off-by: Tejun Heo
---
 kernel/cgroup/cpuset.c | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ec8bebc66469..1e3aadc09d3a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -356,6 +356,15 @@ static inline bool is_in_v2_mode(void)
 	       (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
 }
 
+static inline bool cpuset_is_populated(struct cpuset *cs)
+{
+	lockdep_assert_held(&cpuset_mutex);
+
+	/* Cpusets in the process of attaching should be considered populated */
+	return cgroup_is_populated(cs->css.cgroup) ||
+	       cs->attach_in_progress;
+}
+
 /**
  * partition_is_populated - check if partition has tasks
  * @cs: partition root to be checked
  * @excluded_child: a child cpuset to be excluded in task checking
  * Return: true if there are tasks, false otherwise
  *
@@ -373,19 +382,29 @@ static inline bool is_in_v2_mode(void)
 static inline bool partition_is_populated(struct cpuset *cs,
 					  struct cpuset *excluded_child)
 {
-	struct cgroup_subsys_state *css;
-	struct cpuset *child;
+	struct cpuset *cp;
+	struct cgroup_subsys_state *pos_css;
 
-	if (cs->css.cgroup->nr_populated_csets)
+	/*
+	 * We cannot call cpuset_is_populated(cs) directly, as
+	 * nr_populated_domain_children may include populated
+	 * csets from descendants that are partitions.
+	 */
+	if (cs->css.cgroup->nr_populated_csets ||
+	    cs->attach_in_progress)
 		return true;
 
 	rcu_read_lock();
-	cpuset_for_each_child(child, css, cs) {
-		if (child == excluded_child)
+	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+		if (cp == cs || cp == excluded_child)
 			continue;
-		if (is_partition_valid(child))
+
+		if (is_partition_valid(cp)) {
+			pos_css = css_rightmost_descendant(pos_css);
 			continue;
-		if (cgroup_is_populated(child->css.cgroup)) {
+		}
+
+		if (cpuset_is_populated(cp)) {
 			rcu_read_unlock();
 			return true;
 		}
@@ -670,7 +689,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	 * be changed to have empty cpus_allowed or mems_allowed.
 	 */
 	ret = -ENOSPC;
-	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
+	if (cpuset_is_populated(cur)) {
 		if (!cpumask_empty(cur->cpus_allowed) &&
 		    cpumask_empty(trial->cpus_allowed))
 			goto out;