More MM-CID fixes, mostly fixing hangs/races:
- Fix CID hangs due to a race between concurrent forks
- Fix vfork()/CLONE_VM MMCID bug causing hangs
- Remove pointless preemption guard
- Fix CID task list walk performance regression on large systems
by removing the known-flaky and slow counting logic using
for_each_process_thread() in mm_cid_*fixup_tasks_to_cpus(),
and implementing a simple sched_mm_cid::node list instead
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-----BEGIN PGP SIGNATURE-----
iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmm2JvIRHG1pbmdvQGtl
cm5lbC5vcmcACgkQEnMQ0APhK1hTqg/+K7b4LDOi3nVblmoj6q+mQj2i8DFPbi10
zeAWJJnamYWPvUi+Wxq30JjZJ9v+15Ddcmbhea9m/3u1YO6nAL5TbGeQcJ2LU/7p
Ynu9cznv9PfqO4X7WQc3gJC9xx8PbcM00E3JzGxDX/3NDmDBaTOwwuTp41ymcbhm
cGfnUQWGt81sMummVzqehszfIRMZHnWflYDJ2gC66rcGXMNBlEX125F8jybOm66n
Ez6gO7e9EGn28+hZIufySsxaeeK/3NFVKj1UjGP/FMuBwQFAjHPv61nic33nOKXT
yrw7U8DIaYUqFN4d1lplTG72j2YSUj7snn3Q+ubxpzFmOt7RmouVqwlVGEoey5fh
cEe2VYSQFoZKQioWWyms1LP1hTOa2JkNVhdjBfRZ8IM+Wp47OaDiw1h1+zwwMDbJ
xpDAXEuU+sBZiv2SeBLFQgrGj58gb8pdjN4o47X89mx8TKYWtStrCMsD+MF10LBm
dz780Eiinbw5D8JBsxU/ehETpgrAAVmo1KbFx2Q2grAgkJs7jSqBN2KF8NpmH/ZS
Jk8SpQOn4Vp8iO32TbpsV/GErG9EQgixQxnkTukv2Qd9kguhmjwbi/blN3rLBlBb
XbmR9rRAMfAjlPrk84tn9ecXNWO0NV83IYheAwjip36alSbOs+OcxdhrZ78nxh8C
EsKqGl3PeOk=
=ce5G
-----END PGP SIGNATURE-----
Merge tag 'sched-urgent-2026-03-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
"More MM-CID fixes, mostly fixing hangs/races:
- Fix CID hangs due to a race between concurrent forks
- Fix vfork()/CLONE_VM MMCID bug causing hangs
- Remove pointless preemption guard
- Fix CID task list walk performance regression on large systems
by removing the known-flaky and slow counting logic using
for_each_process_thread() in mm_cid_*fixup_tasks_to_cpus(), and
implementing a simple sched_mm_cid::node list instead"
* tag 'sched-urgent-2026-03-15' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/mmcid: Avoid full tasklist walks
sched/mmcid: Remove pointless preempt guard
sched/mmcid: Handle vfork()/CLONE_VM correctly
sched/mmcid: Prevent CID stalls due to concurrent forks
master
commit
63724e9519
|
|
@@ -133,10 +133,12 @@ struct rseq_data { };
|
|||
* @active: MM CID is active for the task
|
||||
* @cid: The CID associated to the task either permanently or
|
||||
* borrowed from the CPU
|
||||
* @node: Queued in the per MM MMCID list
|
||||
*/
|
||||
struct sched_mm_cid {
|
||||
unsigned int active;
|
||||
unsigned int cid;
|
||||
struct hlist_node node;
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@@ -157,6 +159,7 @@ struct mm_cid_pcpu {
|
|||
* @work: Regular work to handle the affinity mode change case
|
||||
* @lock: Spinlock to protect against affinity setting which can't take @mutex
|
||||
* @mutex: Mutex to serialize forks and exits related to this mm
|
||||
* @user_list: List of the MM CID users of a MM
|
||||
* @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map
|
||||
* is growth only.
|
||||
* @users: The number of tasks sharing this MM. Separate from mm::mm_users
|
||||
|
|
@@ -177,13 +180,14 @@ struct mm_mm_cid {
|
|||
|
||||
raw_spinlock_t lock;
|
||||
struct mutex mutex;
|
||||
struct hlist_head user_list;
|
||||
|
||||
/* Low frequency modified */
|
||||
unsigned int nr_cpus_allowed;
|
||||
unsigned int users;
|
||||
unsigned int pcpu_thrs;
|
||||
unsigned int update_deferred;
|
||||
}____cacheline_aligned_in_smp;
|
||||
} ____cacheline_aligned;
|
||||
#else /* CONFIG_SCHED_MM_CID */
|
||||
struct mm_mm_cid { };
|
||||
struct sched_mm_cid { };
|
||||
|
|
|
|||
|
|
@@ -2354,7 +2354,6 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
|
|||
#ifdef CONFIG_SCHED_MM_CID
|
||||
void sched_mm_cid_before_execve(struct task_struct *t);
|
||||
void sched_mm_cid_after_execve(struct task_struct *t);
|
||||
void sched_mm_cid_fork(struct task_struct *t);
|
||||
void sched_mm_cid_exit(struct task_struct *t);
|
||||
static __always_inline int task_mm_cid(struct task_struct *t)
|
||||
{
|
||||
|
|
@@ -2363,7 +2362,6 @@ static __always_inline int task_mm_cid(struct task_struct *t)
|
|||
#else
|
||||
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
|
||||
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
|
||||
static inline void sched_mm_cid_fork(struct task_struct *t) { }
|
||||
static inline void sched_mm_cid_exit(struct task_struct *t) { }
|
||||
static __always_inline int task_mm_cid(struct task_struct *t)
|
||||
{
|
||||
|
|
|
|||
|
|
@@ -1000,6 +1000,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
|
|||
#ifdef CONFIG_SCHED_MM_CID
|
||||
tsk->mm_cid.cid = MM_CID_UNSET;
|
||||
tsk->mm_cid.active = 0;
|
||||
INIT_HLIST_NODE(&tsk->mm_cid.node);
|
||||
#endif
|
||||
return tsk;
|
||||
|
||||
|
|
@@ -1586,7 +1587,6 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
|
|||
|
||||
tsk->mm = mm;
|
||||
tsk->active_mm = mm;
|
||||
sched_mm_cid_fork(tsk);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@@ -2498,7 +2498,6 @@ bad_fork_cleanup_namespaces:
|
|||
exit_nsproxy_namespaces(p);
|
||||
bad_fork_cleanup_mm:
|
||||
if (p->mm) {
|
||||
sched_mm_cid_exit(p);
|
||||
mm_clear_owner(p->mm, p);
|
||||
mmput(p->mm);
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -4729,8 +4729,11 @@ void sched_cancel_fork(struct task_struct *p)
|
|||
scx_cancel_fork(p);
|
||||
}
|
||||
|
||||
static void sched_mm_cid_fork(struct task_struct *t);
|
||||
|
||||
void sched_post_fork(struct task_struct *p)
|
||||
{
|
||||
sched_mm_cid_fork(p);
|
||||
uclamp_post_fork(p);
|
||||
scx_post_fork(p);
|
||||
}
|
||||
|
|
@@ -10617,13 +10620,10 @@ static inline void mm_cid_transit_to_cpu(struct task_struct *t, struct mm_cid_pc
|
|||
}
|
||||
}
|
||||
|
||||
static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
|
||||
static void mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
|
||||
{
|
||||
/* Remote access to mm::mm_cid::pcpu requires rq_lock */
|
||||
guard(task_rq_lock)(t);
|
||||
/* If the task is not active it is not in the users count */
|
||||
if (!t->mm_cid.active)
|
||||
return false;
|
||||
if (cid_on_task(t->mm_cid.cid)) {
|
||||
/* If running on the CPU, put the CID in transit mode, otherwise drop it */
|
||||
if (task_rq(t)->curr == t)
|
||||
|
|
@@ -10631,69 +10631,43 @@ static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm
|
|||
else
|
||||
mm_unset_cid_on_task(t);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static void mm_cid_do_fixup_tasks_to_cpus(struct mm_struct *mm)
|
||||
{
|
||||
struct task_struct *p, *t;
|
||||
unsigned int users;
|
||||
|
||||
/*
|
||||
* This can obviously race with a concurrent affinity change, which
|
||||
* increases the number of allowed CPUs for this mm, but that does
|
||||
* not affect the mode and only changes the CID constraints. A
|
||||
* possible switch back to per task mode happens either in the
|
||||
* deferred handler function or in the next fork()/exit().
|
||||
*
|
||||
* The caller has already transferred. The newly incoming task is
|
||||
* already accounted for, but not yet visible.
|
||||
*/
|
||||
users = mm->mm_cid.users - 2;
|
||||
if (!users)
|
||||
return;
|
||||
|
||||
guard(rcu)();
|
||||
for_other_threads(current, t) {
|
||||
if (mm_cid_fixup_task_to_cpu(t, mm))
|
||||
users--;
|
||||
}
|
||||
|
||||
if (!users)
|
||||
return;
|
||||
|
||||
/* Happens only for VM_CLONE processes. */
|
||||
for_each_process_thread(p, t) {
|
||||
if (t == current || t->mm != mm)
|
||||
continue;
|
||||
if (mm_cid_fixup_task_to_cpu(t, mm)) {
|
||||
if (--users == 0)
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void mm_cid_fixup_tasks_to_cpus(void)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct task_struct *t;
|
||||
|
||||
lockdep_assert_held(&mm->mm_cid.mutex);
|
||||
|
||||
hlist_for_each_entry(t, &mm->mm_cid.user_list, mm_cid.node) {
|
||||
/* Current has already transferred before invoking the fixup. */
|
||||
if (t != current)
|
||||
mm_cid_fixup_task_to_cpu(t, mm);
|
||||
}
|
||||
|
||||
mm_cid_do_fixup_tasks_to_cpus(mm);
|
||||
mm_cid_complete_transit(mm, MM_CID_ONCPU);
|
||||
}
|
||||
|
||||
static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
|
||||
{
|
||||
lockdep_assert_held(&mm->mm_cid.lock);
|
||||
|
||||
t->mm_cid.active = 1;
|
||||
hlist_add_head(&t->mm_cid.node, &mm->mm_cid.user_list);
|
||||
mm->mm_cid.users++;
|
||||
return mm_update_max_cids(mm);
|
||||
}
|
||||
|
||||
void sched_mm_cid_fork(struct task_struct *t)
|
||||
static void sched_mm_cid_fork(struct task_struct *t)
|
||||
{
|
||||
struct mm_struct *mm = t->mm;
|
||||
bool percpu;
|
||||
|
||||
WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
|
||||
if (!mm)
|
||||
return;
|
||||
|
||||
WARN_ON_ONCE(t->mm_cid.cid != MM_CID_UNSET);
|
||||
|
||||
guard(mutex)(&mm->mm_cid.mutex);
|
||||
scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
|
||||
|
|
@@ -10732,12 +10706,13 @@ void sched_mm_cid_fork(struct task_struct *t)
|
|||
|
||||
static bool sched_mm_cid_remove_user(struct task_struct *t)
|
||||
{
|
||||
lockdep_assert_held(&t->mm->mm_cid.lock);
|
||||
|
||||
t->mm_cid.active = 0;
|
||||
scoped_guard(preempt) {
|
||||
/* Clear the transition bit */
|
||||
t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
|
||||
mm_unset_cid_on_task(t);
|
||||
}
|
||||
/* Clear the transition bit */
|
||||
t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
|
||||
mm_unset_cid_on_task(t);
|
||||
hlist_del_init(&t->mm_cid.node);
|
||||
t->mm->mm_cid.users--;
|
||||
return mm_update_max_cids(t->mm);
|
||||
}
|
||||
|
|
@@ -10880,11 +10855,13 @@ void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
|
|||
mutex_init(&mm->mm_cid.mutex);
|
||||
mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
|
||||
INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
|
||||
INIT_HLIST_HEAD(&mm->mm_cid.user_list);
|
||||
cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
|
||||
bitmap_zero(mm_cidmask(mm), num_possible_cpus());
|
||||
}
|
||||
#else /* CONFIG_SCHED_MM_CID */
|
||||
static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
|
||||
static inline void sched_mm_cid_fork(struct task_struct *t) { }
|
||||
#endif /* !CONFIG_SCHED_MM_CID */
|
||||
|
||||
static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
|
||||
|
|
|
|||
Loading…
Reference in New Issue