From b4aea43cd37afad714b5684fe9fdfcb0e78dba26 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 13 May 2026 09:56:58 +0100 Subject: [PATCH 01/13] mm/hugetlb: avoid false positive lockdep assertion Commit 081056dc00a2 ("mm/hugetlb: unshare page tables during VMA split, not before") changed the locking model around hugetlbfs PMD unsharing on VMA split, but did not update the function which asserts the locks, hugetlb_vma_assert_locked(). This function asserts that either the hugetlb VMA lock is held (if a shared mapping) or that the reservation map lock is held (if private). If you get an unfortunate race between something which results in one of these locks being released and a hugetlb VMA split and you have CONFIG_LOCKDEP enabled, you can therefore see a false positive assertion arise when there is in fact no issue. Since this change introduced a new take_locks parameter to hugetlb_unshare_pmds(), which, when set to false, indicates that locking is sufficient, simply pass this to the unsharing logic and predicate the lock assertions on this. This is safe, as we already asserted the file rmap lock and the VMA write lock prior to this (implying exclusive mmap write lock), so we cannot be raced by either rmap or page fault page table walkers which the asserted locks are intended to protect against (we don't mind GUP-fast). Separate out huge_pmd_unshare() into __huge_pmd_unshare() to add a check_locks parameter, and update hugetlb_unshare_pmds() to pass this parameter to it. This leaves all other callers of huge_pmd_unshare() still correctly asserting the locks. The below reproducer will trigger the assert in a kernel with CONFIG_LOCKDEP enabled by racing process teardown (which will release the hugetlb lock) against a hugetlb split. void execute_one(void) { void *ptr; pid_t pid; /* * Create a hugetlb mapping spanning a PUD entry. * * We force the hugetlb page allocation with populate and * noreserve. * * |---------------------| * | | * |---------------------| * 0 PUD boundary */ ptr = mmap(0, PUD_SIZE, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED | MAP_ANON | MAP_NORESERVE | MAP_HUGETLB | MAP_POPULATE, -1, 0); if (ptr == MAP_FAILED) { perror("mmap"); exit(EXIT_FAILURE); } /* * Fork but with a bogus stack pointer so we try to execute code in * a non-VM_EXEC VMA, causing segfault + teardown via exit_mmap(). * * The clone will cause PMD page table sharing between the * processes first via: * copy_process() -> ... -> huge_pte_alloc() -> huge_pmd_share() * * Then tear down and release the hugetlb 'VMA' lock via: * exit_mmap() -> ... -> vma_close() -> hugetlb_vma_lock_free() */ pid = syscall(__NR_clone, 0, 2 * PMD_SIZE, 0, 0, 0); if (pid < 0) { perror("clone"); exit(EXIT_FAILURE); } if (pid == 0) { /* Pop stack... */ return; } /* * We are the parent process. * * Race the child process's teardown with a PMD unshare. * * We do this by triggering: * * __split_vma() -> hugetlb_split() -> hugetlb_unshare_pmds() * * Which, importantly, doesn't hold the hugetlb VMA lock (nor can * it), meaning we assert in hugetlb_vma_assert_locked(). * * . * |----------.----------| * | . | * |----------.----------| * 0 . PUD boundary */ mmap(0, PUD_SIZE / 2, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); } int main(void) { int i; /* Kick off fork children. */ for (i = 0; i < NUM_FORKS; i++) { pid_t pid = fork(); if (pid < 0) { perror("fork"); exit(EXIT_FAILURE); } /* Fork children do their work and exit. */ if (!pid) { int j; for (j = 0; j < NUM_ITERS; j++) execute_one(); return EXIT_SUCCESS; } } /* If we succeeded, wait on children. */ for (i = 0; i < NUM_FORKS; i++) wait(NULL); return EXIT_SUCCESS; } [ljs@kernel.org: account for the !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING case] Link: https://lore.kernel.org/agWZsPGYid08uU6O@lucifer Link: https://lore.kernel.org/20260513085658.45264-1-ljs@kernel.org Fixes: 081056dc00a2 ("mm/hugetlb: unshare page tables during VMA split, not before") Signed-off-by: Lorenzo Stoakes Acked-by: David Hildenbrand (Arm) Acked-by: Oscar Salvador Cc: Jann Horn Cc: Muchun Song Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 56 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4b80b167cc9c..ece86d9339ce 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -118,6 +118,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); +static int __huge_pmd_unshare(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + bool check_locks); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool take_locks); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); @@ -6891,6 +6894,31 @@ out: return pte; } +static int __huge_pmd_unshare(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + bool check_locks) +{ + unsigned long sz = huge_page_size(hstate_vma(vma)); + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd = pgd_offset(mm, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + + if (sz != PMD_SIZE) + return 0; + if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) + return 0; + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + if (check_locks) + hugetlb_vma_assert_locked(vma); + pud_clear(pud); + + tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); + + mm_dec_nr_pmds(mm); + return 1; +} + /** * huge_pmd_unshare - Unmap a pmd table if it is shared by multiple users * @tlb: the current mmu_gather. @@ -6910,24 +6938,7 @@ out: int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - unsigned long sz = huge_page_size(hstate_vma(vma)); - struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd = pgd_offset(mm, addr); - p4d_t *p4d = p4d_offset(pgd, addr); - pud_t *pud = pud_offset(p4d, addr); - - if (sz != PMD_SIZE) - return 0; - if (!ptdesc_pmd_is_shared(virt_to_ptdesc(ptep))) - return 0; - i_mmap_assert_write_locked(vma->vm_file->f_mapping); - hugetlb_vma_assert_locked(vma); - pud_clear(pud); - - tlb_unshare_pmd_ptdesc(tlb, virt_to_ptdesc(ptep), addr); - - mm_dec_nr_pmds(mm); - return 1; + return __huge_pmd_unshare(tlb, vma, addr, ptep, /*check_locks=*/true); } /* @@ -6961,6 +6972,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; } +static int __huge_pmd_unshare(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + bool check_locks) +{ + return 0; +} + int huge_pmd_unshare(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -7269,7 +7287,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, if (!ptep) continue; ptl = huge_pte_lock(h, mm, ptep); - huge_pmd_unshare(&tlb, vma, address, ptep); + __huge_pmd_unshare(&tlb, vma, address, ptep, take_locks); spin_unlock(ptl); } huge_pmd_unshare_flush(&tlb, vma); From c0cafe24d3f6534294c4b2bc2d47734ff7cbd313 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 21 May 2026 15:37:51 -0700 Subject: [PATCH 02/13] memcg: use round-robin victim selection in refill_stock Harry Yoo reported that get_random_u32_below() is not safe to call in the nmi context and memcg charge draining can happen in nmi context. More specifically get_random_u32_below() is neither reentrant- nor NMI-safe: it acquires a per-cpu local_lock via local_lock_irqsave() on the batched_entropy_u32 state. An NMI that lands on a CPU mid-update of the ChaCha batch state and recurses into the random subsystem would corrupt that state. The memcg_stock local_trylock prevents re-entry on the percpu stock itself, but cannot protect an unrelated subsystem's per-cpu lock. Replace the random pick with a per-cpu round-robin counter stored in memcg_stock_pcp and serialized by the same local_trylock that already guards cached[] and nr_pages[]. No atomics, no random calls, no extra locks needed. Link: https://lore.kernel.org/20260521223751.3794625-1-shakeel.butt@linux.dev Fixes: f735eebe55f8f ("memcg: multi-memcg percpu charge cache") Signed-off-by: Shakeel Butt Reported-by: Harry Yoo Closes: https://lore.kernel.org/4e20f643-6983-4b6e-b12d-c6c4eb20ae0c@kernel.org/ Acked-by: Harry Yoo (Oracle) Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 177732fef010..1a4fd2504bcd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2011,6 +2011,7 @@ struct memcg_stock_pcp { struct work_struct work; unsigned long flags; + uint8_t drain_idx; }; static DEFINE_PER_CPU_ALIGNED(struct memcg_stock_pcp, memcg_stock) = { @@ -2194,7 +2195,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (!success) { i = empty_slot; if (i == -1) { - i = get_random_u32_below(NR_MEMCG_STOCK); + i = stock->drain_idx++; + if (stock->drain_idx == NR_MEMCG_STOCK) + stock->drain_idx = 0; drain_stock(stock, i); } css_get(&memcg->css); From c0ca59beb5252ea2bd4fdaef009d003dedc2030e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 20 May 2026 14:10:25 +0800 Subject: [PATCH 03/13] mm/cma_debug: fix invalid accesses for inactive CMA areas cma_activate_area() can fail after allocating range bitmaps. Its cleanup path frees those bitmaps, but only clears cma->count and cma->available_count. It leaves cma->nranges and each range's count in place, so cma_debugfs_init() can still register debugfs files for an area that never activated successfully. That exposes two problems. Reading the bitmap file can make debugfs walk a freed range bitmap and trigger an invalid memory access. Reading maxchunk can also take cma->lock even though that lock is initialized only on the successful activation path. Fix this by creating debugfs entries only for CMA areas that reached CMA_ACTIVATED. c009da4258f9 introduced the invalid access to bitmap file. 2e32b947606d introduced the invalid access to cma->lock. This change applies to both issues. So I added two Fixes tags. Link: https://lore.kernel.org/20260520061025.3971821-1-songmuchun@bytedance.com Fixes: c009da4258f9 ("mm, cma: support multiple contiguous ranges, if requested") Fixes: 2e32b947606d ("mm: cma: add functions to get region pages counters") Signed-off-by: Muchun Song Acked-by: Mike Rapoport (Microsoft) Acked-by: Oscar Salvador (SUSE) Acked-by: David Hildenbrand (Arm) Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Frank van der Linden Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Michal Nazarewicz Cc: Stefan Strogin Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/cma_debug.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 5ae38f5abbcc..523ba4a0f9f7 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -205,7 +205,8 @@ static int __init cma_debugfs_init(void) cma_debugfs_root = debugfs_create_dir("cma", NULL); for (i = 0; i < cma_area_count; i++) - cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root); + if (test_bit(CMA_ACTIVATED, &cma_areas[i].flags)) + cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root); return 0; } From 40c81856e622a9dc59294a90d169ac07ea25b0b0 Mon Sep 17 00:00:00 2001 From: David Carlier Date: Wed, 20 May 2026 05:49:12 +0100 Subject: [PATCH 04/13] mm/hugetlb: restore reservation on error in hugetlb folio copy paths Two sites in mm/hugetlb.c allocate a hugetlb folio via alloc_hugetlb_folio() (consuming a VMA reservation) and then call copy_user_large_folio(), which became int-returning in commit 1cb9dc4b475c ("mm: hwpoison: support recovery from HugePage copy-on-write faults") and can now fail (e.g. -EHWPOISON on a hwpoisoned source page). On the failure path, folio_put() restores the global hugetlb pool count through free_huge_folio(), but the per-VMA reservation map entry is left marked consumed: - hugetlb_mfill_atomic_pte() resubmission path (UFFDIO_COPY) - copy_hugetlb_page_range() fork-time CoW path when hugetlb_try_dup_anon_rmap() fails (rare: pinned hugetlb anon folio under fork) User-visible effect: on UFFDIO_COPY into a private hugetlb VMA where the resubmission copy fails, the reservation for that address is leaked from the VMA's reserve map. A subsequent fault at the same address takes the no-reservation path, and under hugetlb pool pressure the task is SIGBUSed at an address it had previously reserved. The fork-time CoW path leaks the same way in the child VMA's reserve map, though it requires the much rarer combination of pinned hugetlb anon page + hwpoisoned source. Add the missing restore_reserve_on_error() call before folio_put() on both error paths. Link: https://lore.kernel.org/20260520044912.6751-1-devnexen@gmail.com Fixes: 1cb9dc4b475c ("mm: hwpoison: support recovery from HugePage copy-on-write faults") Signed-off-by: David Carlier Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Mina Almasry Cc: Muchun Song Cc: Oscar Salvador Cc: yuehaibing Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ece86d9339ce..1b1d4f87a3a4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4977,6 +4977,7 @@ again: addr, dst_vma); folio_put(pte_folio); if (ret) { + restore_reserve_on_error(h, dst_vma, addr, new_folio); folio_put(new_folio); break; } @@ -6273,6 +6274,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, folio_put(*foliop); *foliop = NULL; if (ret) { + restore_reserve_on_error(h, dst_vma, dst_addr, folio); folio_put(folio); goto out; } From 3c2d42b8ee345b17a4ba56b0f6492d1ff4c1178e Mon Sep 17 00:00:00 2001 From: Wupeng Ma Date: Fri, 22 May 2026 09:03:05 +0800 Subject: [PATCH 05/13] mm/memory-failure: fix hugetlb_lock AA deadlock in get_huge_page_for_hwpoison Two concurrent madvise(MADV_HWPOISON) calls on the same hugetlb page can trigger a recursive spinlock self-deadlock (AA deadlock) on hugetlb_lock when racing with a concurrent unmap: thread#0 thread#1 -------- -------- madvise(folio, MADV_HWPOISON) -> poisons the folio successfully madvise(folio, MADV_HWPOISON) unmap(folio) try_memory_failure_hugetlb get_huge_page_for_hwpoison spin_lock_irq(&hugetlb_lock) <- held __get_huge_page_for_hwpoison hugetlb_update_hwpoison() -> MF_HUGETLB_FOLIO_PRE_POISONED goto out: folio_put() refcount: 1 -> 0 free_huge_folio() spin_lock_irqsave(&hugetlb_lock) -> AA DEADLOCK! The out: path in __get_huge_page_for_hwpoison() calls folio_put() to drop the GUP reference while the hugetlb_lock is still held by the hugetlb.c wrapper get_huge_page_for_hwpoison(). If concurrent unmap has released the page table mapping reference, folio_put() drops the folio refcount to zero, triggering free_huge_folio() which attempts to re-acquire the non-recursive hugetlb_lock. Fix this by moving hugetlb_lock acquisition from the hugetlb.c wrapper into get_huge_page_for_hwpoison(). Place spin_unlock_irq() before the folio_put() at the out: label so the folio is always released outside the lock. [akpm@linux-foundation.org: fix race, rename label per Miaohe] Link: https://sashiko.dev/#/patchset/20260522010305.4099834-1-mawupeng1@huawei.com Link: https://lore.kernel.org/f39f405e-4b4b-8f79-70fe-a2b5b62114eb@huawei.com Link: https://lore.kernel.org/20260522010305.4099834-1-mawupeng1@huawei.com Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()") Signed-off-by: Wupeng Ma Acked-by: Oscar Salvador (SUSE) Acked-by: Muchun Song Reviewed-by: Kefeng Wang Acked-by: Miaohe Lin Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 8 -------- include/linux/mm.h | 8 -------- mm/hugetlb.c | 11 ----------- mm/memory-failure.c | 19 ++++++++++--------- 4 files changed, 10 insertions(+), 36 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5957bc25efa8..2abaf99321e9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -153,8 +153,6 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); -int get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared); void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); @@ -421,12 +419,6 @@ static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, return 0; } -static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared) -{ - return 0; -} - static inline void folio_putback_hugetlb(struct folio *folio) { } diff --git a/include/linux/mm.h b/include/linux/mm.h index 06bbe9eba636..fc2acedf0b76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4975,8 +4975,6 @@ extern int soft_offline_page(unsigned long pfn, int flags); */ extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); -extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else @@ -4984,12 +4982,6 @@ static inline void memory_failure_queue(unsigned long pfn, int flags) { } -static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared) -{ - return 0; -} - static inline void num_poisoned_pages_inc(unsigned long pfn) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1b1d4f87a3a4..c921287489de 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7161,17 +7161,6 @@ int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison return ret; } -int get_huge_page_for_hwpoison(unsigned long pfn, int flags, - bool *migratable_cleared) -{ - int ret; - - spin_lock_irq(&hugetlb_lock); - ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); - spin_unlock_irq(&hugetlb_lock); - return ret; -} - /** * folio_putback_hugetlb - unisolate a hugetlb folio * @folio: the isolated hugetlb folio diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ee42d4361309..d47aef256a32 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1966,20 +1966,19 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio) folio_free_raw_hwp(folio, true); } -/* - * Called from hugetlb code with hugetlb_lock held. - */ -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, +static int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); - struct folio *folio = page_folio(page); + struct folio *folio; bool count_increased = false; int ret, rc; + spin_lock_irq(&hugetlb_lock); + folio = page_folio(page); if (!folio_test_hugetlb(folio)) { ret = MF_HUGETLB_NON_HUGEPAGE; - goto out; + goto out_unlock; } else if (flags & MF_COUNT_INCREASED) { ret = MF_HUGETLB_IN_USED; count_increased = true; @@ -1995,13 +1994,13 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, } else { ret = MF_HUGETLB_RETRY; if (!(flags & MF_NO_RETRY)) - goto out; + goto out_unlock; } rc = hugetlb_update_hwpoison(folio, page); if (rc >= MF_HUGETLB_FOLIO_PRE_POISONED) { ret = rc; - goto out; + goto out_unlock; } /* @@ -2013,8 +2012,10 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, *migratable_cleared = true; } + spin_unlock_irq(&hugetlb_lock); return ret; -out: +out_unlock: + spin_unlock_irq(&hugetlb_lock); if (count_increased) folio_put(folio); return ret; From 00739e4dd46dde2b39dd9dd19a27e3c8af4ca0d0 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Sat, 23 May 2026 14:01:23 +0800 Subject: [PATCH 06/13] mm/cma: fix reserved page leak on activation failure If cma_activate_area() fails after allocating only part of the range bitmaps, the cleanup path still has to release the reserved pages when CMA_RESERVE_PAGES_ON_ERROR is clear. That is still worth doing even in this __init path. A bitmap_zalloc() failure does not necessarily mean the system cannot make further progress: freeing the reserved CMA pages can return a substantial amount of memory to the buddy allocator and may relieve the temporary memory shortage that caused the allocation failure in the first place. However, the cleanup path currently uses the bitmap-freeing bound for page release as well. That is only correct for ranges whose bitmap allocation already succeeded. The failed range and all later ranges still keep their reserved pages, so a partial bitmap allocation failure can permanently leak them. Fix this by releasing reserved pages for all ranges. Use the saved early_pfn[] value for ranges whose bitmap allocation already succeeded and for the failed range, and use cmr->early_pfn for later ranges whose bitmap allocation was never attempted. Link: https://lore.kernel.org/20260523060123.2207992-1-songmuchun@bytedance.com Fixes: c009da4258f9 ("mm, cma: support multiple contiguous ranges, if requested") Signed-off-by: Muchun Song Reviewed-by: Oscar Salvador (SUSE) Acked-by: Usama Arif Cc: David Hildenbrand Cc: Frank van der Linden Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/cma.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index c7ca567f4c5c..a13ce4999b39 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -188,10 +188,13 @@ cleanup: /* Expose all pages to the buddy, they are useless for CMA. */ if (!test_bit(CMA_RESERVE_PAGES_ON_ERROR, &cma->flags)) { - for (r = 0; r < allocrange; r++) { + for (r = 0; r < cma->nranges; r++) { + unsigned long start_pfn; + cmr = &cma->ranges[r]; + start_pfn = r <= allocrange ? early_pfn[r] : cmr->early_pfn; end_pfn = cmr->base_pfn + cmr->count; - for (pfn = early_pfn[r]; pfn < end_pfn; pfn++) + for (pfn = start_pfn; pfn < end_pfn; pfn++) free_reserved_page(pfn_to_page(pfn)); } } From d6b8b02a27b3dd09ec12144322b3dac46d9bc9ef Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 25 May 2026 09:22:55 -0700 Subject: [PATCH 07/13] mm/damon/ops-common: call folio_test_lru() after folio_get() damon_get_folio() speculatively calls folio_test_lru() before folio_try_get(). The folio can get freed and reallocated to a tail page. In the case, VM_BUG_ON_PGFLAGS() in const_folio_flags() can be triggered. Remove the speculative call. Also mark folio_test_lru() check right after folio_try_get() success as no more unlikely. The race should be rare. Also the problem can happen only if the kernel has enabled CONFIG_DEBUG_VM_PGFLAGS. No real world report of this issue has been made so far. This fix is based on only theoretical analysis. That said, a bug is a bug. A similar issue was also fixed via commit 3203b3ab0fcf ("mm/filemap: don't call folio_test_locked() without a reference in next_uptodate_folio()"). I don't expect this change will make a meaningful impact to DAMON performance in the real world, though I will be happy to be corrected from the real world reports. The issue was discovered [1] by Sashiko. Link: https://lore.kernel.org/20260525162256.8317-1-sj@kernel.org Link: https://lore.kernel.org/20260517234112.89245-1-sj@kernel.org [1] Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: SeongJae Park Cc: Fernand Sieber Cc: Leonard Foerster Cc: Shakeel Butt Cc: # 5.15.x Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 8c6d613425c1..c3e4c871b0bb 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -32,9 +32,9 @@ struct folio *damon_get_folio(unsigned long pfn) return NULL; folio = page_folio(page); - if (!folio_test_lru(folio) || !folio_try_get(folio)) + if (!folio_try_get(folio)) return NULL; - if (unlikely(page_folio(page) != folio || !folio_test_lru(folio))) { + if (unlikely(page_folio(page) != folio) || !folio_test_lru(folio)) { folio_put(folio); folio = NULL; } From c7bde43f6daf70e05a64fbca7efdf6fa93e057dc Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 25 May 2026 10:52:13 +0800 Subject: [PATCH 08/13] mm/hugetlb_vmemmap: fix incorrect vmemmap restore in rollback vmemmap_restore_pte() rebuilds restored vmemmap pages from a tail-page template derived from compound_head(). This is wrong when the current PTE already maps a page whose contents are not tail-page metadata. In the rollback path of vmemmap_remap_free(), the first restored PTE is backed by vmemmap_head and contains head-page metadata. Reconstructing that page from a tail-page template overwrites the head-page state and corrupts the restored vmemmap page. Fix this by copying the full page from the page currently mapped by the PTE. Also pass vmemmap_tail to the rollback walk so only PTEs backed by the shared tail page are restored, while the head PTE remains mapped to vmemmap_head. Add VM_WARN_ON_ONCE() checks for unexpected cases. Link: https://lore.kernel.org/20260525025213.2229628-1-songmuchun@bytedance.com Fixes: c0b495b91a47 ("mm/hugetlb: refactor code around vmemmap_walk") Signed-off-by: Muchun Song Acked-by: Kiryl Shutsemau Acked-by: Oscar Salvador (SUSE) Cc: David Hildenbrand Cc: Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4a077d231d3a..133b46dfb09f 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -207,6 +207,8 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, /* Remapping the head page requires r/w */ if (unlikely(walk->nr_walked == 0 && walk->vmemmap_head)) { + VM_WARN_ON_ONCE(!PageHead((const struct page *)addr)); + list_del(&walk->vmemmap_head->lru); /* @@ -218,6 +220,8 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, entry = mk_pte(walk->vmemmap_head, PAGE_KERNEL); } else { + VM_WARN_ON_ONCE(!PageTail((const struct page *)addr)); + /* * Remap the tail pages as read-only to catch illegal write * operation to the tail pages. @@ -232,33 +236,28 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { - struct page *page; - struct page *from, *to; - - page = list_first_entry(walk->vmemmap_pages, struct page, lru); - list_del(&page->lru); + struct page *src = pte_page(ptep_get(pte)), *dst; /* - * Initialize tail pages in the newly allocated vmemmap page. - * - * There is folio-scope metadata that is encoded in the first few - * tail pages. - * - * Use the value last tail page in the page with the head page - * to initialize the rest of tail pages. + * When rolling back vmemmap_remap_free(), keep the copied head page + * mapping and restore only PTEs currently pointing at the shared tail + * page. */ - from = compound_head((struct page *)addr) + - PAGE_SIZE / sizeof(struct page) - 1; - to = page_to_virt(page); - for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++, to++) - *to = *from; + if (walk->vmemmap_tail && walk->vmemmap_tail != src) + return; + + VM_WARN_ON_ONCE(PageHead((const struct page *)addr)); + + dst = list_first_entry(walk->vmemmap_pages, struct page, lru); + list_del(&dst->lru); + copy_page(page_to_virt(dst), page_to_virt(src)); /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); - set_pte_at(&init_mm, addr, pte, mk_pte(page, PAGE_KERNEL)); + set_pte_at(&init_mm, addr, pte, mk_pte(dst, PAGE_KERNEL)); } /** @@ -324,6 +323,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, */ walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, + .vmemmap_tail = vmemmap_tail, .vmemmap_pages = vmemmap_pages, .flags = 0, }; From 40990c87a26e371594475acdc560c93cfae308a1 Mon Sep 17 00:00:00 2001 From: Yin Tirui Date: Tue, 26 May 2026 18:13:55 +0800 Subject: [PATCH 09/13] mm/huge_memory: update file PUD counter before folio_put() __split_huge_pud_locked() updates the file/shmem RSS counter after dropping the PUD mapping's folio reference. If folio_put() drops the last reference, mm_counter_file() can later read freed folio state via folio_test_swapbacked(). Move the counter update before folio_put(). Link: https://lore.kernel.org/20260526101355.1984244-1-yintirui@huawei.com Fixes: dbe54153296d ("mm/huge_memory: add vmf_insert_folio_pud()") Signed-off-by: Yin Tirui Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand (arm) Reviewed-by: Lance Yang Reviewed-by: Dev Jain Cc: Alistair Popple Cc: Baolin Wang Cc: Barry Song Cc: Chen Jun Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 970e077019b7..1f78b73a0ca4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3015,9 +3015,9 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, if (!folio_test_referenced(folio) && pud_young(old_pud)) folio_set_referenced(folio); folio_remove_rmap_pud(folio, page, vma); - folio_put(folio); add_mm_counter(vma->vm_mm, mm_counter_file(folio), -HPAGE_PUD_NR); + folio_put(folio); } void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, From 8d878059924f12c1bc24556a92ec56add74de3c8 Mon Sep 17 00:00:00 2001 From: Yin Tirui Date: Tue, 26 May 2026 18:13:37 +0800 Subject: [PATCH 10/13] mm/huge_memory: update file PMD counter before folio_put() __split_huge_pmd_locked() updates the file/shmem RSS counter after dropping the PMD mapping's folio reference. If folio_put() drops the last reference, mm_counter_file() can later read freed folio state via folio_test_swapbacked(). Move the counter update before folio_put(). Link: https://lore.kernel.org/20260526101337.1984081-1-yintirui@huawei.com Fixes: fadae2953072 ("thp: use mm_file_counter to determine update which rss counter") Signed-off-by: Yin Tirui Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand (arm) Reviewed-by: Lance Yang Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Barry Song Cc: Chen Jun Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Nico Pache Cc: Ryan Roberts Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1f78b73a0ca4..653f2dc03403 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3133,7 +3133,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!folio_test_referenced(folio) && pmd_young(old_pmd)) folio_set_referenced(folio); folio_remove_rmap_pmd(folio, page, vma); + add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); folio_put(folio); + return; } add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR); return; From 85668fda932a5b8f15f649cf06411525a0e4c8ec Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 27 May 2026 21:47:49 +0300 Subject: [PATCH 11/13] userfaultfd: verify VMA state across UFFDIO_COPY retry Patch series "userfaultfd: verify VMA state across UFFDIO_COPY retry", v2. ... and two more small fixes. This patch (of 3): mfill_copy_folio_retry() drops the VMA lock for copy_from_user() and reacquires it afterwards. The destination VMA can be replaced during that window. The existing check compares vma_uffd_ops() before and after the retry, but if a shmem VMA with MAP_SHARED is replaced with a shmem VMA with MAP_PRIVATE (or vice versa) the replacement goes undetected. The change from MAP_PRIVATE to MAP_SHARED will treat the folio allocated with shmem_alloc_folio() as anonymous and this will cause BUG() when mfill_atomic_install_pte() will try to folio_add_new_anon_rmap(). The change from MAP_SHARED to MAP_PRIVATE allows injection of folios into the page cache of the original VMA. There is no need to change for hugetlb because it never uses mfill_copy_folio_retry(). Introduce helpers for more comprehensive comparison of VMA state: - mfill_retry_state_save() to save the relevant VMA state into a struct mfill_retry_state (original uffd_ops, relevant VMA flags, vm_file and pgoff) before dropping the lock - mfill_retry_state_changed() to compare the saved state with the state of the VMA acquired after retaking the locks - mfill_retry_state_put() to release vm_file pinning. Use DEFINE_FREE() cleanup to wrap mfill_retry_state_put() to avoid complicating error handling paths in mfill_copy_folio_retry(). Link: https://lore.kernel.org/20260527184751.4147364-1-rppt@kernel.org Link: https://lore.kernel.org/20260527184751.4147364-2-rppt@kernel.org Fixes: 292411fda25b ("mm/userfaultfd: detect VMA type change after copy retry in mfill_copy_folio_retry()") Fixes: 6ab703034f14 ("userfaultfd: mfill_atomic(): remove retry logic") Co-developed-by: Michael Bommarito Signed-off-by: Michael Bommarito Signed-off-by: Mike Rapoport (Microsoft) Suggested-by: Peter Xu Co-developed-by: David Carlier Signed-off-by: David Carlier Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 85 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 12 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 180bad42fc79..e5d2fb3ce2c1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include #include "internal.h" @@ -443,16 +445,80 @@ static int mfill_copy_folio_locked(struct folio *folio, unsigned long src_addr) return ret; } -static int mfill_copy_folio_retry(struct mfill_state *state, +#define MFILL_RETRY_STATE_VMA_FLAGS \ + append_vma_flags(__VMA_UFFD_FLAGS, VMA_SHARED_BIT) + +/* + * VMA state saved before dropping the locks in mfill_copy_folio_retry(). + * Used to detect VMA replacement or incompatible changes after reacquiring the + * locks. + */ +struct mfill_retry_state { + const struct vm_uffd_ops *ops; + struct file *file; + vma_flags_t flags; + pgoff_t pgoff; +}; + +static void mfill_retry_state_save(struct mfill_retry_state *s, + struct vm_area_struct *vma) +{ + s->flags = vma_flags_and_mask(&vma->flags, MFILL_RETRY_STATE_VMA_FLAGS); + s->ops = vma_uffd_ops(vma); + s->pgoff = vma->vm_pgoff; + + if (vma->vm_file) + s->file = get_file(vma->vm_file); +} + +static bool mfill_retry_state_changed(struct mfill_retry_state *state, + struct vm_area_struct *vma) +{ + vma_flags_t flags = vma_flags_and_mask(&vma->flags, + MFILL_RETRY_STATE_VMA_FLAGS); + + /* Have any UFFD flags (missing, WP, minor) changed? */ + if (!vma_flags_same_pair(&state->flags, &flags)) + return true; + + /* VMA type or effective uffd_ops changed while the lock was dropped */ + if (state->ops != vma_uffd_ops(vma)) + return true; + + /* VMA was anonymous before; changed only if it no longer is */ + if (!state->file) + return !vma_is_anonymous(vma); + + /* VMA was file backed, but file, inode or offset has changed */ + if (!vma->vm_file || vma->vm_file->f_inode != state->file->f_inode || + state->file != vma->vm_file || vma->vm_pgoff != state->pgoff) + return true; + + return false; +} + +static void mfill_retry_state_put(struct mfill_retry_state *s) +{ + if (s->file) + fput(s->file); +} + +DEFINE_FREE(retry_put, struct mfill_retry_state *, + if (_T) mfill_retry_state_put(_T)); + +static int mfill_copy_folio_retry(struct mfill_state *mfill_state, struct folio *folio) { - const struct vm_uffd_ops *orig_ops = vma_uffd_ops(state->vma); - unsigned long src_addr = state->src_addr; + struct mfill_retry_state retry_state = { 0 }; + struct mfill_retry_state *for_free __free(retry_put) = &retry_state; + unsigned long src_addr = mfill_state->src_addr; void *kaddr; int err; + mfill_retry_state_save(&retry_state, mfill_state->vma); + /* retry copying with mm_lock dropped */ - mfill_put_vma(state); + mfill_put_vma(mfill_state); kaddr = kmap_local_folio(folio, 0); err = copy_from_user(kaddr, (const void __user *) src_addr, PAGE_SIZE); @@ -463,19 +529,14 @@ static int mfill_copy_folio_retry(struct mfill_state *state, flush_dcache_folio(folio); /* reget VMA and PMD, they could change underneath us */ - err = mfill_get_vma(state); + err = mfill_get_vma(mfill_state); if (err) return err; - /* - * The VMA type may have changed while the lock was dropped - * (e.g. replaced with a hugetlb mapping), making the caller's - * ops pointer stale. - */ - if (vma_uffd_ops(state->vma) != orig_ops) + if (mfill_retry_state_changed(&retry_state, mfill_state->vma)) return -EAGAIN; - err = mfill_establish_pmd(state); + err = mfill_establish_pmd(mfill_state); if (err) return err; From df3ee3b3bbc327f570c5451666bbaf6cf8b4436a Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 27 May 2026 21:47:50 +0300 Subject: [PATCH 12/13] userfaultfd: refuse to __mfill_atomic_pte() for unsupported VMAs __mfill_atomic_pte() unconditionally dereferences ops because there is an assumption that VMAs that can undergo mfill_* operations are vetted on registration and must have valid vm_uffd_ops. Add a guard against potential bugs and make sure __mfill_atomic_pte() bails out if ops is NULL. Link: https://lore.kernel.org/20260527184751.4147364-3-rppt@kernel.org Fixes: ad9ac3081332 ("userfaultfd: introduce vm_uffd_ops->alloc_folio()") Signed-off-by: Mike Rapoport (Microsoft) Suggested-by: Lorenzo Stoakes Reviewed-by: Lorenzo Stoakes Reviewed-by: David CARLIER Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Michael Bommarito Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e5d2fb3ce2c1..2872c71bbf36 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -552,6 +552,11 @@ static int __mfill_atomic_pte(struct mfill_state *state, struct folio *folio; int ret; + if (!ops) { + VM_WARN_ONCE(1, "UFFDIO_COPY for unsupported VMA"); + return -EOPNOTSUPP; + } + folio = ops->alloc_folio(state->vma, state->dst_addr); if (!folio) return -ENOMEM; From 9d7bea186ba5a002456471edf36cc9b69f809397 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 27 May 2026 21:47:51 +0300 Subject: [PATCH 13/13] userfaultfd: remove redundant check in vm_uffd_ops() Lorenzo says: static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) { if (vma_is_anonymous(vma)) return &anon_uffd_ops; return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL; } This is doing a redundant check _and_ making life confusing, as if !vma->vm_ops is a condition that can be reached there, it can't, as vma_is_anonymous() is literally a !vma->vm_ops check :) Remove the redundant check. Link: https://lore.kernel.org/20260527184751.4147364-4-rppt@kernel.org Fixes: 0f48947c4232 ("userfaultfd: introduce vm_uffd_ops") Signed-off-by: Mike Rapoport (Microsoft) Suggested-by: Lorenzo Stoakes Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Peter Xu Cc: David Carlier Cc: Michael Bommarito Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 2872c71bbf36..80cc8be5725f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -68,7 +68,7 @@ static const struct vm_uffd_ops *vma_uffd_ops(struct vm_area_struct *vma) { if (vma_is_anonymous(vma)) return &anon_uffd_ops; - return vma->vm_ops ? vma->vm_ops->uffd_ops : NULL; + return vma->vm_ops->uffd_ops; } static __always_inline