From 4c5e7f0fcd592801c9cc18f29f80fbee84eb8669 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Thu, 19 Mar 2026 09:25:41 +0800 Subject: [PATCH 01/10] mm/huge_memory: fix folio isn't locked in softleaf_to_folio() On arm64 server, we found folio that get from migration entry isn't locked in softleaf_to_folio(). This issue triggers when mTHP splitting and zap_nonpresent_ptes() races, and the root cause is lack of memory barrier in softleaf_to_folio(). The race is as follows: CPU0 CPU1 deferred_split_scan() zap_nonpresent_ptes() lock folio split_folio() unmap_folio() change ptes to migration entries __split_folio_to_order() softleaf_to_folio() set flags(including PG_locked) for tail pages folio = pfn_folio(softleaf_to_pfn(entry)) smp_wmb() VM_WARN_ON_ONCE(!folio_test_locked(folio)) prep_compound_page() for tail pages In __split_folio_to_order(), smp_wmb() guarantees page flags of tail pages are visible before the tail page becomes non-compound. smp_wmb() should be paired with smp_rmb() in softleaf_to_folio(), which is missed. As a result, if zap_nonpresent_ptes() accesses migration entry that stores tail pfn, softleaf_to_folio() may see the updated compound_head of tail page before page->flags. This issue will trigger VM_WARN_ON_ONCE() in pfn_swap_entry_folio() because of the race between folio split and zap_nonpresent_ptes() leading to a folio incorrectly undergoing modification without a folio lock being held. This is a BUG_ON() before commit 93976a20345b ("mm: eliminate further swapops predicates"), which in merged in v6.19-rc1. To fix it, add missing smp_rmb() if the softleaf entry is migration entry in softleaf_to_folio() and softleaf_to_page(). [tujinjiang@huawei.com: update function name and comments] Link: https://lkml.kernel.org/r/20260321075214.3305564-1-tujinjiang@huawei.com Link: https://lkml.kernel.org/r/20260319012541.4158561-1-tujinjiang@huawei.com Fixes: e9b61f19858a ("thp: reintroduce split_huge_page()") Signed-off-by: Jinjiang Tu Acked-by: David Hildenbrand (Arm) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Barry Song Cc: Kefeng Wang Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Nanyong Sun Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- include/linux/leafops.h | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/include/linux/leafops.h b/include/linux/leafops.h index a9ff94b744f2..05673d3529e7 100644 --- a/include/linux/leafops.h +++ b/include/linux/leafops.h @@ -363,6 +363,23 @@ static inline unsigned long softleaf_to_pfn(softleaf_t entry) return swp_offset(entry) & SWP_PFN_MASK; } +static inline void softleaf_migration_sync(softleaf_t entry, + struct folio *folio) +{ + /* + * Ensure we do not race with split, which might alter tail pages into new + * folios and thus result in observing an unlocked folio. + * This matches the write barrier in __split_folio_to_order(). + */ + smp_rmb(); + + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + VM_WARN_ON_ONCE(!folio_test_locked(folio)); +} + /** * softleaf_to_page() - Obtains struct page for PFN encoded within leaf entry. * @entry: Leaf entry, softleaf_has_pfn(@entry) must return true. @@ -374,11 +391,8 @@ static inline struct page *softleaf_to_page(softleaf_t entry) struct page *page = pfn_to_page(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding page is locked - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && !PageLocked(page)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, page_folio(page)); return page; } @@ -394,12 +408,8 @@ static inline struct folio *softleaf_to_folio(softleaf_t entry) struct folio *folio = pfn_folio(softleaf_to_pfn(entry)); VM_WARN_ON_ONCE(!softleaf_has_pfn(entry)); - /* - * Any use of migration entries may only occur while the - * corresponding folio is locked. - */ - VM_WARN_ON_ONCE(softleaf_is_migration(entry) && - !folio_test_locked(folio)); + if (softleaf_is_migration(entry)) + softleaf_migration_sync(entry, folio); return folio; } From 26d3dca201f3662e51d25022cfce0f642a150a90 Mon Sep 17 00:00:00 2001 From: "Harry Yoo (Oracle)" Date: Fri, 20 Mar 2026 21:59:25 +0900 Subject: [PATCH 02/10] MAINTAINERS, mailmap: update email address for Harry Yoo Update my email address to harry@kernel.org. Link: https://lkml.kernel.org/r/20260320125925.2259998-1-harry@kernel.org Signed-off-by: Harry Yoo (Oracle) Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 7d14504daf24..2d04aeba68b4 100644 --- a/.mailmap +++ b/.mailmap @@ -316,6 +316,7 @@ Hans Verkuil Hans Verkuil Hao Ge Harry Yoo <42.hyeyoo@gmail.com> +Harry Yoo Heiko Carstens Heiko Carstens Heiko Stuebner diff --git a/MAINTAINERS b/MAINTAINERS index 7d10988cbc62..915f074c3577 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16877,7 +16877,7 @@ M: Lorenzo Stoakes R: Rik van Riel R: Liam R. Howlett R: Vlastimil Babka -R: Harry Yoo +R: Harry Yoo R: Jann Horn L: linux-mm@kvack.org S: Maintained @@ -24343,7 +24343,7 @@ F: drivers/nvmem/layouts/sl28vpd.c SLAB ALLOCATOR M: Vlastimil Babka -M: Harry Yoo +M: Harry Yoo M: Andrew Morton R: Hao Li R: Christoph Lameter From 9e0d0ddfbc0e3491da7e2db73faa08d8d4f322b2 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Fri, 20 Mar 2026 06:05:59 +0100 Subject: [PATCH 03/10] mm/swap: fix swap cache memcg accounting The swap readahead path was recently refactored and while doing this, the order between the charging of the folio in the memcg and the addition of the folio in the swap cache was inverted. Since the accounting of the folio is done while adding the folio to the swap cache and the folio is not charged in the memcg yet, the accounting is then done at the node level, which is wrong. Fix this by charging the folio in the memcg before adding it to the swap cache. Link: https://lkml.kernel.org/r/20260320050601.1833108-1-alex@ghiti.fr Fixes: 2732acda82c9 ("mm, swap: use swap cache as the swap in synchronize layer") Signed-off-by: Alexandre Ghiti Acked-by: Kairui Song Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Acked-by: Chris Li Cc: Alexandre Ghiti Cc: Baoquan He Cc: Barry Song Cc: Kemeng Shi Cc: Signed-off-by: Andrew Morton --- mm/swap_state.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 6d0eef7470be..48aff2c917c0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -494,6 +494,10 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, __folio_set_locked(folio); __folio_set_swapbacked(folio); + + if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) + goto failed; + for (;;) { ret = swap_cache_add_folio(folio, entry, &shadow); if (!ret) @@ -514,11 +518,6 @@ static struct folio *__swap_cache_prepare_and_add(swp_entry_t entry, goto failed; } - if (!charged && mem_cgroup_swapin_charge_folio(folio, NULL, gfp, entry)) { - swap_cache_del_folio(folio); - goto failed; - } - memcg1_swapin(entry, folio_nr_pages(folio)); if (shadow) workingset_refault(folio, shadow); From 7fe000eb32904758a85e62f6ea9483f89d5dabfc Mon Sep 17 00:00:00 2001 From: Josh Law Date: Sat, 21 Mar 2026 10:54:24 -0700 Subject: [PATCH 04/10] mm/damon/sysfs: fix param_ctx leak on damon_sysfs_new_test_ctx() failure Patch series "mm/damon/sysfs: fix memory leak and NULL dereference issues", v4. DAMON_SYSFS can leak memory under allocation failure, and do NULL pointer dereference when a privileged user make wrong sequences of control. Fix those. This patch (of 3): When damon_sysfs_new_test_ctx() fails in damon_sysfs_commit_input(), param_ctx is leaked because the early return skips the cleanup at the out label. Destroy param_ctx before returning. Link: https://lkml.kernel.org/r/20260321175427.86000-1-sj@kernel.org Link: https://lkml.kernel.org/r/20260321175427.86000-2-sj@kernel.org Fixes: f0c5118ebb0e ("mm/damon/sysfs: catch commit test ctx alloc failure") Signed-off-by: Josh Law Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: [6.18+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 576d1ddd736b..b573b9d60784 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1524,8 +1524,10 @@ static int damon_sysfs_commit_input(void *data) if (IS_ERR(param_ctx)) return PTR_ERR(param_ctx); test_ctx = damon_sysfs_new_test_ctx(kdamond->damon_ctx); - if (!test_ctx) + if (!test_ctx) { + damon_destroy_ctx(param_ctx); return -ENOMEM; + } err = damon_commit_ctx(test_ctx, param_ctx); if (err) goto out; From 1bfe9fb5ed2667fb075682408b776b5273162615 Mon Sep 17 00:00:00 2001 From: Josh Law Date: Sat, 21 Mar 2026 10:54:25 -0700 Subject: [PATCH 05/10] mm/damon/sysfs: check contexts->nr before accessing contexts_arr[0] Multiple sysfs command paths dereference contexts_arr[0] without first verifying that kdamond->contexts->nr == 1. A user can set nr_contexts to 0 via sysfs while DAMON is running, causing NULL pointer dereferences. In more detail, the issue can be triggered by privileged users like below. First, start DAMON and make contexts directory empty (kdamond->contexts->nr == 0). # damo start # cd /sys/kernel/mm/damon/admin/kdamonds/0 # echo 0 > contexts/nr_contexts Then, each of below commands will cause the NULL pointer dereference. # echo update_schemes_stats > state # echo update_schemes_tried_regions > state # echo update_schemes_tried_bytes > state # echo update_schemes_effective_quotas > state # echo update_tuned_intervals > state Guard all commands (except OFF) at the entry point of damon_sysfs_handle_cmd(). Link: https://lkml.kernel.org/r/20260321175427.86000-3-sj@kernel.org Fixes: 0ac32b8affb5 ("mm/damon/sysfs: support DAMOS stats") Signed-off-by: Josh Law Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: [5.18+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index b573b9d60784..ddc30586c0e6 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1749,6 +1749,9 @@ static int damon_sysfs_update_schemes_tried_regions( static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, struct damon_sysfs_kdamond *kdamond) { + if (cmd != DAMON_SYSFS_CMD_OFF && kdamond->contexts->nr != 1) + return -EINVAL; + switch (cmd) { case DAMON_SYSFS_CMD_ON: return damon_sysfs_turn_damon_on(kdamond); From 6557004a8b59c7701e695f02be03c7e20ed1cc15 Mon Sep 17 00:00:00 2001 From: Josh Law Date: Sat, 21 Mar 2026 10:54:26 -0700 Subject: [PATCH 06/10] mm/damon/sysfs: check contexts->nr in repeat_call_fn damon_sysfs_repeat_call_fn() calls damon_sysfs_upd_tuned_intervals(), damon_sysfs_upd_schemes_stats(), and damon_sysfs_upd_schemes_effective_quotas() without checking contexts->nr. If nr_contexts is set to 0 via sysfs while DAMON is running, these functions dereference contexts_arr[0] and cause a NULL pointer dereference. Add the missing check. For example, the issue can be reproduced using DAMON sysfs interface and DAMON user-space tool (damo) [1] like below. $ sudo damo start --refresh_interval 1s $ echo 0 | sudo tee \ /sys/kernel/mm/damon/admin/kdamonds/0/contexts/nr_contexts Link: https://patch.msgid.link/20260320163559.178101-3-objecting@objecting.org Link: https://lkml.kernel.org/r/20260321175427.86000-4-sj@kernel.org Link: https://github.com/damonitor/damo [1] Fixes: d809a7c64ba8 ("mm/damon/sysfs: implement refresh_ms file internal work") Signed-off-by: Josh Law Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Cc: [6.17+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ddc30586c0e6..6a44a2f3d8fc 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1620,9 +1620,12 @@ static int damon_sysfs_repeat_call_fn(void *data) if (!mutex_trylock(&damon_sysfs_lock)) return 0; + if (sysfs_kdamond->contexts->nr != 1) + goto out; damon_sysfs_upd_tuned_intervals(sysfs_kdamond); damon_sysfs_upd_schemes_stats(sysfs_kdamond); damon_sysfs_upd_schemes_effective_quotas(sysfs_kdamond); +out: mutex_unlock(&damon_sysfs_lock); return 0; } From ffef67b93aa352b34e6aeba3d52c19a63885409a Mon Sep 17 00:00:00 2001 From: "David Hildenbrand (Arm)" Date: Mon, 23 Mar 2026 21:20:18 +0100 Subject: [PATCH 07/10] mm/memory: fix PMD/PUD checks in follow_pfnmap_start() follow_pfnmap_start() suffers from two problems: (1) We are not re-fetching the pmd/pud after taking the PTL Therefore, we are not properly stabilizing what the lock actually protects. If there is concurrent zapping, we would indicate to the caller that we found an entry, however, that entry might already have been invalidated, or contain a different PFN after taking the lock. Properly use pmdp_get() / pudp_get() after taking the lock. (2) pmd_leaf() / pud_leaf() are not well defined on non-present entries pmd_leaf()/pud_leaf() could wrongly trigger on non-present entries. There is no real guarantee that pmd_leaf()/pud_leaf() returns something reasonable on non-present entries. Most architectures indeed either perform a present check or make it work by smart use of flags. However, for example loongarch checks the _PAGE_HUGE flag in pmd_leaf(), and always sets the _PAGE_HUGE flag in __swp_entry_to_pmd(). Whereby pmd_trans_huge() explicitly checks pmd_present(), pmd_leaf() does not do that. Let's check pmd_present()/pud_present() before assuming "the is a present PMD leaf" when spotting pmd_leaf()/pud_leaf(), like other page table handling code that traverses user page tables does. Given that non-present PMD entries are likely rare in VM_IO|VM_PFNMAP, (1) is likely more relevant than (2). It is questionable how often (1) would actually trigger, but let's CC stable to be sure. This was found by code inspection. Link: https://lkml.kernel.org/r/20260323-follow_pfnmap_fix-v1-1-5b0ec10872b3@kernel.org Fixes: 6da8e9634bb7 ("mm: new follow_pfnmap API") Signed-off-by: David Hildenbrand (Arm) Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes (Oracle) Cc: Liam Howlett Cc: Michal Hocko Cc: Peter Xu Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/memory.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2f815a34d924..c65e82c86fed 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6815,11 +6815,16 @@ retry: pudp = pud_offset(p4dp, address); pud = pudp_get(pudp); - if (pud_none(pud)) + if (!pud_present(pud)) goto out; if (pud_leaf(pud)) { lock = pud_lock(mm, pudp); - if (!unlikely(pud_leaf(pud))) { + pud = pudp_get(pudp); + + if (unlikely(!pud_present(pud))) { + spin_unlock(lock); + goto out; + } else if (unlikely(!pud_leaf(pud))) { spin_unlock(lock); goto retry; } @@ -6831,9 +6836,16 @@ retry: pmdp = pmd_offset(pudp, address); pmd = pmdp_get_lockless(pmdp); + if (!pmd_present(pmd)) + goto out; if (pmd_leaf(pmd)) { lock = pmd_lock(mm, pmdp); - if (!unlikely(pmd_leaf(pmd))) { + pmd = pmdp_get(pmdp); + + if (unlikely(!pmd_present(pmd))) { + spin_unlock(lock); + goto out; + } else if (unlikely(!pmd_leaf(pmd))) { spin_unlock(lock); goto retry; } From 3b89863c3fa482912911cd65a12a3aeef662c250 Mon Sep 17 00:00:00 2001 From: Max Boone Date: Wed, 25 Mar 2026 10:59:16 +0100 Subject: [PATCH 08/10] mm/pagewalk: fix race between concurrent split and refault The splitting of a PUD entry in walk_pud_range() can race with a concurrent thread refaulting the PUD leaf entry causing it to try walking a PMD range that has disappeared. An example and reproduction of this is to try reading numa_maps of a process while VFIO-PCI is setting up DMA (specifically the vfio_pin_pages_remote call) on a large BAR for that process. This will trigger a kernel BUG: vfio-pci 0000:03:00.0: enabling device (0000 -> 0002) BUG: unable to handle page fault for address: ffffa23980000000 PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP NOPTI ... RIP: 0010:walk_pgd_range+0x3b5/0x7a0 Code: 8d 43 ff 48 89 44 24 28 4d 89 ce 4d 8d a7 00 00 20 00 48 8b 4c 24 28 49 81 e4 00 00 e0 ff 49 8d 44 24 ff 48 39 c8 4c 0f 43 e3 <49> f7 06 9f ff ff ff 75 3b 48 8b 44 24 20 48 8b 40 28 48 85 c0 74 RSP: 0018:ffffac23e1ecf808 EFLAGS: 00010287 RAX: 00007f44c01fffff RBX: 00007f4500000000 RCX: 00007f44ffffffff RDX: 0000000000000000 RSI: 000ffffffffff000 RDI: ffffffff93378fe0 RBP: ffffac23e1ecf918 R08: 0000000000000004 R09: ffffa23980000000 R10: 0000000000000020 R11: 0000000000000004 R12: 00007f44c0200000 R13: 00007f44c0000000 R14: ffffa23980000000 R15: 00007f44c0000000 FS: 00007fe884739580(0000) GS:ffff9b7d7a9c0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffa23980000000 CR3: 000000c0650e2005 CR4: 0000000000770ef0 PKRU: 55555554 Call Trace: __walk_page_range+0x195/0x1b0 walk_page_vma+0x62/0xc0 show_numa_map+0x12b/0x3b0 seq_read_iter+0x297/0x440 seq_read+0x11d/0x140 vfs_read+0xc2/0x340 ksys_read+0x5f/0xe0 do_syscall_64+0x68/0x130 ? get_page_from_freelist+0x5c2/0x17e0 ? mas_store_prealloc+0x17e/0x360 ? vma_set_page_prot+0x4c/0xa0 ? __alloc_pages_noprof+0x14e/0x2d0 ? __mod_memcg_lruvec_state+0x8d/0x140 ? __lruvec_stat_mod_folio+0x76/0xb0 ? __folio_mod_stat+0x26/0x80 ? do_anonymous_page+0x705/0x900 ? __handle_mm_fault+0xa8d/0x1000 ? __count_memcg_events+0x53/0xf0 ? handle_mm_fault+0xa5/0x360 ? do_user_addr_fault+0x342/0x640 ? arch_exit_to_user_mode_prepare.constprop.0+0x16/0xa0 ? irqentry_exit_to_user_mode+0x24/0x100 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fe88464f47e Code: c0 e9 b6 fe ff ff 50 48 8d 3d be 07 0b 00 e8 69 01 02 00 66 0f 1f 84 00 00 00 00 00 64 8b 04 25 18 00 00 00 85 c0 75 14 0f 05 <48> 3d 00 f0 ff ff 77 5a c3 66 0f 1f 84 00 00 00 00 00 48 83 ec 28 RSP: 002b:00007ffe6cd9a9b8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007fe88464f47e RDX: 0000000000020000 RSI: 00007fe884543000 RDI: 0000000000000003 RBP: 00007fe884543000 R08: 00007fe884542010 R09: 0000000000000000 R10: fffffffffffffbc5 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000003 R14: 0000000000020000 R15: 0000000000020000 Fix this by validating the PUD entry in walk_pmd_range() using a stable snapshot (pudp_get()). If the PUD is not present or is a leaf, retry the walk via ACTION_AGAIN instead of descending further. This mirrors the retry logic in walk_pte_range(), which lets walk_pmd_range() retry if the PTE is not being got by pte_offset_map_lock(). Link: https://lkml.kernel.org/r/20260325-pagewalk-check-pmd-refault-v2-1-707bff33bc60@akamai.com Fixes: f9e54c3a2f5b ("vfio/pci: implement huge_fault support") Co-developed-by: David Hildenbrand (Arm) Signed-off-by: David Hildenbrand (Arm) Signed-off-by: Max Boone Acked-by: David Hildenbrand (Arm) Cc: Liam Howlett Cc: Lorenzo Stoakes (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/pagewalk.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index a94c401ab2cf..4e7bcd975c54 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -97,6 +97,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, struct mm_walk *walk) { + pud_t pudval = pudp_get(pud); pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; @@ -105,6 +106,24 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, int err = 0; int depth = real_depth(3); + /* + * For PTE handling, pte_offset_map_lock() takes care of checking + * whether there actually is a page table. But it also has to be + * very careful about concurrent page table reclaim. + * + * Similarly, we have to be careful here - a PUD entry that points + * to a PMD table cannot go away, so we can just walk it. But if + * it's something else, we need to ensure we didn't race something, + * so need to retry. + * + * A pertinent example of this is a PUD refault after PUD split - + * we will need to split again or risk accessing invalid memory. + */ + if (!pud_present(pudval) || pud_leaf(pudval)) { + walk->action = ACTION_AGAIN; + return 0; + } + pmd = pmd_offset(pud, addr); do { again: @@ -218,12 +237,12 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, else if (pud_leaf(*pud) || !pud_present(*pud)) continue; /* Nothing to do. */ - if (pud_none(*pud)) - goto again; - err = walk_pmd_range(pud, addr, next, walk); if (err) break; + + if (walk->action == ACTION_AGAIN) + goto again; } while (pud++, addr = next, addr != end); return err; From 2598ab9d63f41160c7081998857fef409182933d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 23 Mar 2026 21:55:16 +0100 Subject: [PATCH 09/10] bug: avoid format attribute warning for clang as well Like gcc, clang-22 now also warns about a function that it incorrectly identifies as a printf-style format: lib/bug.c:190:22: error: diagnostic behavior may be improved by adding the 'format(printf, 1, 0)' attribute to the declaration of '__warn_printf' [-Werror,-Wmissing-format-attribute] 179 | static void __warn_printf(const char *fmt, struct pt_regs *regs) | __attribute__((format(printf, 1, 0))) 180 | { 181 | if (!fmt) 182 | return; 183 | 184 | #ifdef HAVE_ARCH_BUG_FORMAT_ARGS 185 | if (regs) { 186 | struct arch_va_list _args; 187 | va_list *args = __warn_args(&_args, regs); 188 | 189 | if (args) { 190 | vprintk(fmt, *args); | ^ Revert the change that added a gcc-specific workaround, and instead add the generic annotation that avoid the warning. Link: https://lkml.kernel.org/r/20260323205534.1284284-1-arnd@kernel.org Fixes: d36067d6ea00 ("bug: Hush suggest-attribute=format for __warn_printf()") Suggested-by: Andy Shevchenko Suggested-by: Brendan Jackman Link: https://lore.kernel.org/all/20251208141618.2805983-1-andriy.shevchenko@linux.intel.com/T/#u Signed-off-by: Arnd Bergmann Reviewed-by: Brendan Jackman Reviewed-by: Andy Shevchenko Cc: Bill Wendling Cc: Ingo Molnar Cc: Justin Stitt Cc: Nathan Chancellor Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- lib/bug.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lib/bug.c b/lib/bug.c index 623c467a8b76..aab9e6a40c5f 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -173,10 +173,8 @@ struct bug_entry *find_bug(unsigned long bugaddr) return module_find_bug(bugaddr); } -__diag_push(); -__diag_ignore(GCC, all, "-Wsuggest-attribute=format", - "Not a valid __printf() conversion candidate."); -static void __warn_printf(const char *fmt, struct pt_regs *regs) +static __printf(1, 0) +void __warn_printf(const char *fmt, struct pt_regs *regs) { if (!fmt) return; @@ -195,7 +193,6 @@ static void __warn_printf(const char *fmt, struct pt_regs *regs) printk("%s", fmt); } -__diag_pop(); static enum bug_trap_type __report_bug(struct bug_entry *bug, unsigned long bugaddr, struct pt_regs *regs) { From 2697dd8ae721db4f6a53d4f4cbd438212a80f8dc Mon Sep 17 00:00:00 2001 From: "Lorenzo Stoakes (Oracle)" Date: Fri, 27 Mar 2026 17:31:04 +0000 Subject: [PATCH 10/10] mm/mseal: update VMA end correctly on merge Previously we stored the end of the current VMA in curr_end, and then upon iterating to the next VMA updated curr_start to curr_end to advance to the next VMA. However, this doesn't take into account the fact that a VMA might be updated due to a merge by vma_modify_flags(), which can result in curr_end being stale and thus, upon setting curr_start to curr_end, ending up with an incorrect curr_start on the next iteration. Resolve the issue by setting curr_end to vma->vm_end unconditionally to ensure this value remains updated should this occur. While we're here, eliminate this entire class of bug by simply setting const curr_[start/end] to be clamped to the input range and VMAs, which also happens to simplify the logic. Link: https://lkml.kernel.org/r/20260327173104.322405-1-ljs@kernel.org Fixes: 6c2da14ae1e0 ("mm/mseal: rework mseal apply logic") Signed-off-by: Lorenzo Stoakes (Oracle) Reported-by: Antonius Closes: https://lore.kernel.org/linux-mm/CAK8a0jwWGj9-SgFk0yKFh7i8jMkwKm5b0ao9=kmXWjO54veX2g@mail.gmail.com/ Suggested-by: David Hildenbrand (ARM) Acked-by: Vlastimil Babka (SUSE) Reviewed-by: Pedro Falcato Acked-by: David Hildenbrand (Arm) Cc: Jann Horn Cc: Jeff Xu Cc: Liam Howlett Cc: Signed-off-by: Andrew Morton --- mm/mseal.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/mseal.c b/mm/mseal.c index 316b5e1dec78..ac58643181f7 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -56,7 +56,6 @@ static int mseal_apply(struct mm_struct *mm, unsigned long start, unsigned long end) { struct vm_area_struct *vma, *prev; - unsigned long curr_start = start; VMA_ITERATOR(vmi, mm, start); /* We know there are no gaps so this will be non-NULL. */ @@ -66,6 +65,7 @@ static int mseal_apply(struct mm_struct *mm, prev = vma; for_each_vma_range(vmi, vma, end) { + const unsigned long curr_start = MAX(vma->vm_start, start); const unsigned long curr_end = MIN(vma->vm_end, end); if (!(vma->vm_flags & VM_SEALED)) { @@ -79,7 +79,6 @@ static int mseal_apply(struct mm_struct *mm, } prev = vma; - curr_start = curr_end; } return 0;