From 56b060d0a1d3b1fd0429daeac366f00c030fca59 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Tue, 5 Aug 2025 13:50:47 -0700 Subject: [PATCH 001/372] mempolicy: clarify what zone reclaim means The zone_reclaim_mode API controls the reclaim behavior when a node runs out of memory. Contrary to its user-facing name, it is internally referred to as "node_reclaim_mode". This can be confusing. But because we cannot change the name of the API since it has been in place since at least 2.6, let's try to be more explicit about what the behavior of this API is. Change the description to clarify what zone reclaim entails, and be explicit about the RECLAIM_ZONE bit, whose purpose has led to some confusion in the past already [1] [2]. While at it, also soften the warning about changing these bits. [joshua.hahnjy@gmail.com: remove the reference to the vm.zone_reclaim_mode sysctl as an ABI] Link: https://lkml.kernel.org/r/20250806134404.2000234-1-joshua.hahnjy@gmail.com Link: https://lkml.kernel.org/r/20250805205048.1518453-1-joshua.hahnjy@gmail.com Link: https://lore.kernel.org/linux-mm/1579005573-58923-1-git-send-email-alex.shi@linux.alibaba.com/ [1] Link: https://lore.kernel.org/linux-mm/20200626003459.D8E015CA@viggo.jf.intel.com/ [2] Signed-off-by: Joshua Hahn Acked-by: SeongJae Park Acked-by: David Hildenbrand Reviewed-by: Huang Ying Acked-by: Zi Yan Acked-by: Byungchul Park Cc: Alistair Popple Cc: Byungchul Park Cc: Gregory Price Cc: Mathew Brost Cc: Rakie Kim Signed-off-by: Andrew Morton --- include/uapi/linux/mempolicy.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 1f9bb10d1a47..8fbbe613611a 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -66,10 +66,16 @@ enum { #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ /* - * These bit locations are exposed in the vm.zone_reclaim_mode sysctl - * ABI. New bits are OK, but existing bits can never change. + * Enabling zone reclaim means the page allocator will attempt to fulfill + * the allocation request on the current node by triggering reclaim and + * trying to shrink the current node. + * Fallback allocations on the next candidates in the zonelist are considered + * when reclaim fails to free up enough memory in the current node/zone. + * + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl. + * New bits are OK, but existing bits should not be changed. */ -#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_ZONE (1<<0) /* Enable zone reclaim */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ From e5eb32468859145806bd72d4d3ce3cb617528f37 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 28 Jul 2025 17:25:07 +0200 Subject: [PATCH 002/372] kasan: add test for SLAB_TYPESAFE_BY_RCU quarantine skipping Verify that KASAN does not quarantine objects in SLAB_TYPESAFE_BY_RCU slabs if CONFIG_SLUB_RCU_DEBUG is off. 
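As background, SLAB_TYPESAFE_BY_RCU does not delay reuse of an object's memory, only reuse of the underlying slab: a freed object may be reallocated immediately (possibly at the same address), so lockless readers must stabilize a reference and then revalidate the object's identity. A rough sketch of that reader pattern, with hypothetical lookup()/put_obj() helpers and a hypothetical key field:

	rcu_read_lock();
	obj = lookup(key);	/* may race with free + immediate realloc */
	if (obj && refcount_inc_not_zero(&obj->ref)) {
		/* Memory is still a valid object of this type, but it
		 * may now represent a different entity: recheck identity. */
		if (obj->key != key) {
			put_obj(obj);
			obj = NULL;
		}
	}
	rcu_read_unlock();

The test below pins down exactly this "freed and immediately recycled at the same address" behavior, so bugs in users that skip the recheck stay reproducible under KASAN.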
[jannh@google.com: v2] Link: https://lkml.kernel.org/r/20250729-kasan-tsbrcu-noquarantine-test-v2-1-d16bd99309c9@google.com [jannh@google.com: make comment more verbose] Link: https://lkml.kernel.org/r/20250814-kasan-tsbrcu-noquarantine-test-v3-1-9e9110009b4e@google.com Link: https://lkml.kernel.org/r/20250728-kasan-tsbrcu-noquarantine-test-v1-1-fa24d9ab7f41@google.com Signed-off-by: Jann Horn Suggested-by: Andrey Konovalov Acked-by: Vlastimil Babka Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitriy Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_c.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index f4b17984b627..4cf2b5f8d6c1 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1073,6 +1073,45 @@ static void kmem_cache_rcu_uaf(struct kunit *test) kmem_cache_destroy(cache); } +/* + * Check that SLAB_TYPESAFE_BY_RCU objects are immediately reused when + * CONFIG_SLUB_RCU_DEBUG is off, and stay at the same address. + * Without this, KASAN builds would be unable to trigger bugs caused by + * SLAB_TYPESAFE_BY_RCU users handling recycled objects improperly. + */ +static void kmem_cache_rcu_reuse(struct kunit *test) +{ + char *p, *p2; + struct kmem_cache *cache; + + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_SLUB_RCU_DEBUG); + + cache = kmem_cache_create("test_cache", 16, 0, SLAB_TYPESAFE_BY_RCU, + NULL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, cache); + + migrate_disable(); + p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) { + kunit_err(test, "Allocation failed: %s\n", __func__); + goto out; + } + + kmem_cache_free(cache, p); + p2 = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p2) { + kunit_err(test, "Allocation failed: %s\n", __func__); + goto out; + } + KUNIT_EXPECT_PTR_EQ(test, p, p2); + + kmem_cache_free(cache, p2); + +out: + migrate_enable(); + kmem_cache_destroy(cache); +} + static void kmem_cache_double_destroy(struct kunit *test) { struct kmem_cache *cache; @@ -2106,6 +2145,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmem_cache_double_free), KUNIT_CASE(kmem_cache_invalid_free), KUNIT_CASE(kmem_cache_rcu_uaf), + KUNIT_CASE(kmem_cache_rcu_reuse), KUNIT_CASE(kmem_cache_double_destroy), KUNIT_CASE(kmem_cache_accounted), KUNIT_CASE(kmem_cache_bulk), From 915a4022b591d5476f72a4d2ca688212a0f983a4 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Thu, 17 Jul 2025 16:28:45 +0800 Subject: [PATCH 003/372] mm/mglru: update MG-LRU proactive reclaim statistics only to memcg Users can use /sys/kernel/debug/lru_gen to trigger proactive memory reclaim of a specified memcg. Currently, statistics such as pgrefill, pgscan and pgsteal are added to the system-wide memory statistics in /proc/vmstat. This confuses system memory pressure monitoring tools, making it difficult to determine whether pgscan and pgsteal are caused by system-level pressure or by proactive memory reclaim of some specific memory cgroup. Therefore, make this interface behave similarly to memory.reclaim. Update proactive memory reclaim statistics only to its memory cgroup.
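This mirrors the accounting convention that memory.reclaim-style proactive reclaim already follows; schematically (a simplified sketch of the convention, not the exact vmscan.c code):

	/* global /proc/vmstat counters: skipped for proactive reclaim */
	if (!sc->proactive)
		__count_vm_events(PGSCAN_DIRECT, nr_scanned);
	/* per-cgroup counters in memory.stat: always updated */
	__count_memcg_events(memcg, PGSCAN_DIRECT, nr_scanned);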
Link: https://lkml.kernel.org/r/20250717082845.34673-1-jiahao.kernel@gmail.com Signed-off-by: Hao Jia Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Greg Thelen Cc: Johannes Weiner Cc: Kinsey Ho Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Yuanchu Xie Cc: Yu Zhao Cc: Roman Gushchin Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index a48aec8bfd92..b9a1cfeb2ddf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5561,6 +5561,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (memcg_id != mem_cgroup_id(memcg)) goto done; + sc->target_mem_cgroup = memcg; lruvec = get_lruvec(memcg, nid); if (swappiness < MIN_SWAPPINESS) @@ -5597,6 +5598,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, .may_swap = true, .reclaim_idx = MAX_NR_ZONES - 1, .gfp_mask = GFP_KERNEL, + .proactive = true, }; buf = kvmalloc(len + 1, GFP_KERNEL); From 337135e6124b6d37d7ef1cd5a6c0b9681938c5ee Mon Sep 17 00:00:00 2001 From: Ruan Shiyang Date: Tue, 29 Jul 2025 11:51:01 +0800 Subject: [PATCH 004/372] mm: memory-tiering: fix PGPROMOTE_CANDIDATE counting Goto-san reported confusing pgpromote statistics where the pgpromote_success count significantly exceeded pgpromote_candidate. On a system with three nodes (nodes 0-1: DRAM 4GB, node 2: NVDIMM 4GB): # Enable demotion only echo 1 > /sys/kernel/mm/numa/demotion_enabled numactl -m 0-1 memhog -r200 3500M >/dev/null & pid=$! sleep 2 numactl memhog -r100 2500M >/dev/null & sleep 10 kill -9 $pid # terminate the 1st memhog # Enable promotion echo 2 > /proc/sys/kernel/numa_balancing After a few seconds, we observed `pgpromote_candidate < pgpromote_success` $ grep -e pgpromote /proc/vmstat pgpromote_success 2579 pgpromote_candidate 0 In this scenario, after terminating the first memhog, the conditions for pgdat_free_space_enough() are quickly met, which triggers promotion. However, these migrated pages are only counted in PGPROMOTE_SUCCESS, not in PGPROMOTE_CANDIDATE. To resolve these confusing statistics, introduce PGPROMOTE_CANDIDATE_NRL to count the missed promotion pages. These pages are deliberately not counted into PGPROMOTE_CANDIDATE, to avoid changing the existing algorithm or the performance of the promotion rate limit. Link: https://lkml.kernel.org/r/20250901090122.124262-1-ruansy.fnst@fujitsu.com Link: https://lkml.kernel.org/r/20250729035101.1601407-1-ruansy.fnst@fujitsu.com Fixes: c6833e10008f ("memory tiering: rate limit NUMA migration throughput") Co-developed-by: Li Zhijian Signed-off-by: Li Zhijian Signed-off-by: Ruan Shiyang Reported-by: Yasunori Gotou (Fujitsu) Suggested-by: Huang Ying Acked-by: Vlastimil Babka Reviewed-by: Huang Ying Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Juri Lelli Cc: Vincent Guittot Cc: Dietmar Eggemann Cc: Steven Rostedt Cc: Ben Segall Cc: Mel Gorman Cc: Valentin Schneider Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 16 +++++++++++++++- kernel/sched/fair.c | 5 +++-- mm/vmstat.c | 1 + 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0c5da9141983..9d3ea9085556 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -234,7 +234,21 @@ enum node_stat_item { #endif #ifdef CONFIG_NUMA_BALANCING PGPROMOTE_SUCCESS, /* promote successfully */ - PGPROMOTE_CANDIDATE, /* candidate pages to promote */ + /** + * Candidate pages for promotion based on hint fault latency.
This + * counter is used to control the promotion rate and adjust the hot + * threshold. + */ + PGPROMOTE_CANDIDATE, + /** + * Not rate-limited (NRL) candidate pages that can be promoted + * without considering the hot threshold, because there are enough + * free pages in the fast-tier node. These promotions bypass the + * regular hotness checks and do NOT influence the promotion + * rate-limiter or threshold-adjustment logic. + * This is for statistics/monitoring purposes. + */ + PGPROMOTE_CANDIDATE_NRL, #endif /* PGDEMOTE_*: pages demoted */ PGDEMOTE_KSWAPD, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b173a059315c..82c8d804c54c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1923,11 +1923,13 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, struct pglist_data *pgdat; unsigned long rate_limit; unsigned int latency, th, def_th; + long nr = folio_nr_pages(folio); pgdat = NODE_DATA(dst_nid); if (pgdat_free_space_enough(pgdat)) { /* workload changed, reset hot threshold */ pgdat->nbp_threshold = 0; + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr); return true; } @@ -1941,8 +1943,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, if (latency >= th) return false; - return !numa_promotion_rate_limit(pgdat, rate_limit, - folio_nr_pages(folio)); + return !numa_promotion_rate_limit(pgdat, rate_limit, nr); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); diff --git a/mm/vmstat.c b/mm/vmstat.c index 71cd1ceba191..e74f0b2a1021 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1280,6 +1280,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING [I(PGPROMOTE_SUCCESS)] = "pgpromote_success", [I(PGPROMOTE_CANDIDATE)] = "pgpromote_candidate", + [I(PGPROMOTE_CANDIDATE_NRL)] = "pgpromote_candidate_nrl", #endif [I(PGDEMOTE_KSWAPD)] = "pgdemote_kswapd", [I(PGDEMOTE_DIRECT)] = "pgdemote_direct", From 849d5cff4d48be9146c2bf3c492fc7f434b5ffaa Mon Sep 17 00:00:00 2001 From: Zhang Qilong Date: Thu, 31 Jul 2025 20:23:05 +0800 Subject: [PATCH 005/372] /dev/zero: try to align PMD_SIZE for private mapping Attempt to map private mappings aligned to the huge page size, which can achieve performance gains. The average execution time of mprot_tw4m in libMicro on arm64: - Test case: mprot_tw4m - Before the patch: 22 us - After the patch: 17 us If THP config is not set, we fall back to system page size mappings.
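The effect is visible from userspace; a quick hedged check (the 8MB mapping size and the 2MB PMD size are illustrative values for 4K-page x86-64/arm64 configurations):

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/mman.h>

	int main(void)
	{
		int fd = open("/dev/zero", O_RDWR);
		size_t len = 8UL << 20;	/* spans several PMDs */
		char *p;

		if (fd < 0)
			return 1;
		p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
		if (p == MAP_FAILED)
			return 1;
		/* With THP configured, expect a PMD-aligned start address. */
		printf("addr %p, 2MB aligned: %d\n", p,
		       !((unsigned long)p & ((2UL << 20) - 1)));
		return 0;
	}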
Link: https://lkml.kernel.org/r/20250731122305.2669090-1-zhangqilong3@huawei.com Signed-off-by: Zhang Qilong Reviewed-by: Lorenzo Stoakes Tested-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- drivers/char/mem.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 48839958b0b1..34b815901b20 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -512,11 +512,18 @@ static int mmap_zero(struct file *file, struct vm_area_struct *vma) return 0; } +#ifndef CONFIG_MMU +static unsigned long get_unmapped_area_zero(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + return -ENOSYS; +} +#else static unsigned long get_unmapped_area_zero(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { -#ifdef CONFIG_MMU if (flags & MAP_SHARED) { /* * mmap_zero() will call shmem_zero_setup() to create a file, @@ -527,12 +534,18 @@ static unsigned long get_unmapped_area_zero(struct file *file, return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags); } - /* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */ - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + /* + * Otherwise flags & MAP_PRIVATE: with no shmem object beneath it, + * attempt to map aligned to huge page size if possible, otherwise we + * fall back to system page size mappings. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + return thp_get_unmapped_area(file, addr, len, pgoff, flags); #else - return -ENOSYS; + return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); #endif } +#endif /* CONFIG_MMU */ static ssize_t write_full(struct file *file, const char __user *buf, size_t count, loff_t *ppos) From 79e1c24285c40cdfa9eb00fe8131d1ba14b84ef1 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Fri, 18 Jul 2025 10:41:32 +0800 Subject: [PATCH 006/372] mm: replace (20 - PAGE_SHIFT) with common macros for pages<->MB conversion Replace repeated (20 - PAGE_SHIFT) calculations with standard macros: - MB_TO_PAGES(mb) converts MB to page count - PAGES_TO_MB(pages) converts pages to MB No functional change. [akpm@linux-foundation.org: remove arc's private PAGES_TO_MB, remove its unused PAGES_TO_KB] [akpm@linux-foundation.org: don't include mm.h due to include file ordering mess] Link: https://lkml.kernel.org/r/20250718024134.1304745-1-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Dev Jain Acked-by: David Hildenbrand Acked-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Ben Segall Cc: Boqun Feng Cc: Davidlohr Bueso Cc: Dietmar Eggemann Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Josh Triplett Cc: Juri Lelli Cc: Kairui Song Cc: Kemeng Shi Cc: Lai jiangshan Cc: Liam Howlett Cc: Mariano Pache Cc: Mathieu Desnoyers Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Neeraj Upadhyay Cc: Nhat Pham Cc: "Paul E . 
McKenney" Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: "Uladzislau Rezki (Sony)" Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/arc/include/asm/arcregs.h | 3 --- include/linux/mm.h | 9 +++++++++ kernel/rcu/rcuscale.c | 2 +- kernel/sched/fair.c | 5 ++--- mm/backing-dev.c | 2 +- mm/huge_memory.c | 2 +- mm/swap.c | 2 +- 7 files changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index a31bbf5c8bbc..d84908a177bd 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -151,9 +151,6 @@ /* Helpers */ #define TO_KB(bytes) ((bytes) >> 10) #define TO_MB(bytes) (TO_KB(bytes) >> 10) -#define PAGES_TO_KB(n_pages) ((n_pages) << (PAGE_SHIFT - 10)) -#define PAGES_TO_MB(n_pages) (PAGES_TO_KB(n_pages) >> 10) - /* *************************************************************** diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ae97a0b8ec7..b626d1bacef5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -69,6 +69,15 @@ static inline void totalram_pages_add(long count) extern void * high_memory; +/* + * Convert between pages and MB + * 20 is the shift for 1MB (2^20 = 1MB) + * PAGE_SHIFT is the shift for page size (e.g., 12 for 4KB pages) + * So (20 - PAGE_SHIFT) converts between pages and MB + */ +#define PAGES_TO_MB(pages) ((pages) >> (20 - PAGE_SHIFT)) +#define MB_TO_PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index b521d0455992..7484d8ad5767 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -796,7 +796,7 @@ kfree_scale_thread(void *arg) pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", (unsigned long long)(end_time - start_time), kfree_loops, rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), - (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + PAGES_TO_MB(mem_begin - mem_during)); if (shutdown) { smp_mb(); /* Assign before wake. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82c8d804c54c..e256793b9a08 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1495,7 +1495,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p) * by the PTE scanner and NUMA hinting faults should be trapped based * on resident pages */ - nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT); + nr_scan_pages = MB_TO_PAGES(sysctl_numa_balancing_scan_size); rss = get_mm_rss(p->mm); if (!rss) rss = nr_scan_pages; @@ -1934,8 +1934,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, } def_th = sysctl_numa_balancing_hot_threshold; - rate_limit = sysctl_numa_balancing_promote_rate_limit << \ - (20 - PAGE_SHIFT); + rate_limit = MB_TO_PAGES(sysctl_numa_balancing_promote_rate_limit); numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); th = pgdat->nbp_threshold ? 
: def_th; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 783904d8c5ef..e4d578e6121c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -510,7 +510,7 @@ static void wb_update_bandwidth_workfn(struct work_struct *work) /* * Initial write bandwidth: 100 MB/s */ -#define INIT_BW (100 << (20 - PAGE_SHIFT)) +#define INIT_BW MB_TO_PAGES(100) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9c38a95e9f09..2b4ea5a2ce7d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -911,7 +911,7 @@ static int __init hugepage_init(void) * where the extra memory used could hurt more than TLB overhead * is likely to save. The admin can still enable it through /sys. */ - if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) { + if (totalram_pages() < MB_TO_PAGES(512)) { transparent_hugepage_flags = 0; return 0; } diff --git a/mm/swap.c b/mm/swap.c index 3632dd061beb..cb164f9ef9e3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1096,7 +1096,7 @@ static const struct ctl_table swap_sysctl_table[] = { */ void __init swap_setup(void) { - unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); + unsigned long megs = PAGES_TO_MB(totalram_pages()); /* Use a smaller cluster for small-memory machines */ if (megs < 16) From 7cbce1eaeb783af9e88fc5cebe9b18d46d9030bd Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 5 Aug 2025 19:19:29 +0900 Subject: [PATCH 007/372] zram: protect recomp_algorithm_show() with ->init_lock sysfs handlers should be called under ->init_lock and are not supposed to unlock it until return, otherwise e.g. a concurrent reset() can occur. There is one handler that breaks that rule: recomp_algorithm_show(). Move ->init_lock handling outside of __comp_algorithm_show() (also drop it and call zcomp_available_show() directly) so that the entire recomp_algorithm_show() loop is protected by the lock, as opposed to protecting individual iterations. The patch does not need to go to -stable, as it does not fix any runtime errors (at least I can't think of any). It makes recomp_algorithm_show() "atomic" w.r.t. zram reset() (just like the rest of zram sysfs show() handlers), that's a pretty minor change. 
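Reduced to its essence, the change widens the read-side critical section from one-per-iteration to one around the whole loop (a generic sketch with hypothetical first/last/emit_one names, not the zram code itself):

	/* before: a writer, e.g. reset(), may run between iterations */
	for (prio = first; prio < last; prio++) {
		down_read(&init_lock);
		emit_one(prio);
		up_read(&init_lock);
	}

	/* after: the emitted list is one consistent snapshot */
	down_read(&init_lock);
	for (prio = first; prio < last; prio++)
		emit_one(prio);
	up_read(&init_lock);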
Link: https://lkml.kernel.org/r/20250805101946.1774112-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reported-by: Seyediman Seyedarab Suggested-by: Seyediman Seyedarab Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8acad3cc6e6e..9ac271b82780 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1225,18 +1225,6 @@ static void comp_algorithm_set(struct zram *zram, u32 prio, const char *alg) zram->comp_algs[prio] = alg; } -static ssize_t __comp_algorithm_show(struct zram *zram, u32 prio, - char *buf, ssize_t at) -{ - ssize_t sz; - - down_read(&zram->init_lock); - sz = zcomp_available_show(zram->comp_algs[prio], buf, at); - up_read(&zram->init_lock); - - return sz; -} - static int __comp_algorithm_store(struct zram *zram, u32 prio, const char *buf) { char *compressor; @@ -1387,8 +1375,12 @@ static ssize_t comp_algorithm_show(struct device *dev, char *buf) { struct zram *zram = dev_to_zram(dev); + ssize_t sz; - return __comp_algorithm_show(zram, ZRAM_PRIMARY_COMP, buf, 0); + down_read(&zram->init_lock); + sz = zcomp_available_show(zram->comp_algs[ZRAM_PRIMARY_COMP], buf, 0); + up_read(&zram->init_lock); + return sz; } static ssize_t comp_algorithm_store(struct device *dev, @@ -1412,14 +1404,15 @@ static ssize_t recomp_algorithm_show(struct device *dev, ssize_t sz = 0; u32 prio; + down_read(&zram->init_lock); for (prio = ZRAM_SECONDARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { if (!zram->comp_algs[prio]) continue; sz += sysfs_emit_at(buf, sz, "#%d: ", prio); - sz += __comp_algorithm_show(zram, prio, buf, sz); + sz += zcomp_available_show(zram->comp_algs[prio], buf, sz); } - + up_read(&zram->init_lock); return sz; } From 35edbaa04a460531862d3f00c1c2bb0d5e45cd5d Mon Sep 17 00:00:00 2001 From: Sudarsan Mahendran Date: Mon, 4 Aug 2025 18:36:29 -0700 Subject: [PATCH 008/372] selftests/mm: pass filename as input param to VM_PFNMAP tests Enable these tests to be run on other pfnmap'ed memory like NVIDIA's EGM. Add '--' as a separator to pass in file path. This allows passing of cmd line arguments to kselftest_harness. Use '/dev/mem' as default filename. Existing test passes: pfnmap TAP version 13 1..6 # Starting 6 tests from 1 test cases. # PASSED: 6 / 6 tests passed. # Totals: pass:6 fail:0 xfail:0 xpass:0 skip:0 error:0 Pass params to kselftest_harness: pfnmap -r pfnmap:mremap_fixed TAP version 13 1..1 # Starting 1 tests from 1 test cases. # RUN pfnmap.mremap_fixed ... # OK pfnmap.mremap_fixed ok 1 pfnmap.mremap_fixed # PASSED: 1 / 1 tests passed. # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 Pass non-existent file name as input: pfnmap -- /dev/blah TAP version 13 1..6 # Starting 6 tests from 1 test cases. # RUN pfnmap.madvise_disallowed ... # SKIP Cannot open '/dev/blah' Pass non pfnmap'ed file as input: pfnmap -r pfnmap.madvise_disallowed -- randfile.txt TAP version 13 1..1 # Starting 1 tests from 1 test cases. # RUN pfnmap.madvise_disallowed ... # SKIP Invalid file: 'randfile.txt'. 
Not pfnmap'ed Link: https://lkml.kernel.org/r/20250805013629.47629-1-sudarsanm@google.com Signed-off-by: Sudarsan Mahendran Acked-by: David Hildenbrand Cc: Axel Rasmussen Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pfnmap.c | 48 ++++++++++++++++++++-------- tools/testing/selftests/mm/vm_util.c | 14 ++++++-- tools/testing/selftests/mm/vm_util.h | 1 + 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/tools/testing/selftests/mm/pfnmap.c b/tools/testing/selftests/mm/pfnmap.c index 866ac023baf5..88659f0a90ea 100644 --- a/tools/testing/selftests/mm/pfnmap.c +++ b/tools/testing/selftests/mm/pfnmap.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Basic VM_PFNMAP tests relying on mmap() of '/dev/mem' + * Basic VM_PFNMAP tests relying on mmap() of input file provided. + * Use '/dev/mem' as default. * * Copyright 2025, Red Hat, Inc. * @@ -25,6 +26,7 @@ #include "vm_util.h" static sigjmp_buf sigjmp_buf_env; +static char *file = "/dev/mem"; static void signal_handler(int sig) { @@ -51,7 +53,7 @@ static int test_read_access(char *addr, size_t size, size_t pagesize) return ret; } -static int find_ram_target(off_t *phys_addr, +static int find_ram_target(off_t *offset, unsigned long long pagesize) { unsigned long long start, end; @@ -91,7 +93,7 @@ static int find_ram_target(off_t *phys_addr, /* We need two pages. */ if (end > start + 2 * pagesize) { fclose(file); - *phys_addr = start; + *offset = start; return 0; } } @@ -100,7 +102,7 @@ static int find_ram_target(off_t *phys_addr, FIXTURE(pfnmap) { - off_t phys_addr; + off_t offset; size_t pagesize; int dev_mem_fd; char *addr1; @@ -113,23 +115,31 @@ FIXTURE_SETUP(pfnmap) { self->pagesize = getpagesize(); - /* We'll require two physical pages throughout our tests ... */ - if (find_ram_target(&self->phys_addr, self->pagesize)) - SKIP(return, "Cannot find ram target in '/proc/iomem'\n"); + if (strncmp(file, "/dev/mem", strlen("/dev/mem")) == 0) { + /* We'll require two physical pages throughout our tests ... */ + if (find_ram_target(&self->offset, self->pagesize)) + SKIP(return, + "Cannot find ram target in '/proc/iomem'\n"); + } else { + self->offset = 0; + } - self->dev_mem_fd = open("/dev/mem", O_RDONLY); + self->dev_mem_fd = open(file, O_RDONLY); if (self->dev_mem_fd < 0) - SKIP(return, "Cannot open '/dev/mem'\n"); + SKIP(return, "Cannot open '%s'\n", file); self->size1 = self->pagesize * 2; self->addr1 = mmap(NULL, self->size1, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->phys_addr); + self->dev_mem_fd, self->offset); if (self->addr1 == MAP_FAILED) - SKIP(return, "Cannot mmap '/dev/mem'\n"); + SKIP(return, "Cannot mmap '%s'\n", file); + + if (!check_vmflag_pfnmap(self->addr1)) + SKIP(return, "Invalid file: '%s'. Not pfnmap'ed\n", file); /* ... and want to be able to read from them. 
*/ if (test_read_access(self->addr1, self->size1, self->pagesize)) - SKIP(return, "Cannot read-access mmap'ed '/dev/mem'\n"); + SKIP(return, "Cannot read-access mmap'ed '%s'\n", file); self->size2 = 0; self->addr2 = MAP_FAILED; @@ -182,7 +192,7 @@ TEST_F(pfnmap, munmap_split) */ self->size2 = self->pagesize; self->addr2 = mmap(NULL, self->pagesize, PROT_READ, MAP_SHARED, - self->dev_mem_fd, self->phys_addr); + self->dev_mem_fd, self->offset); ASSERT_NE(self->addr2, MAP_FAILED); } @@ -246,4 +256,14 @@ TEST_F(pfnmap, fork) ASSERT_EQ(ret, 0); } -TEST_HARNESS_MAIN +int main(int argc, char **argv) +{ + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--") == 0) { + if (i + 1 < argc && strlen(argv[i + 1]) > 0) + file = argv[i + 1]; + return test_harness_run(i, argv); + } + } + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 9dafa7669ef9..6a239aa413e2 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -402,7 +402,7 @@ unsigned long get_free_hugepages(void) return fhp; } -bool check_vmflag_io(void *addr) +static bool check_vmflag(void *addr, const char *flag) { char buffer[MAX_LINE_LENGTH]; const char *flags; @@ -419,13 +419,23 @@ bool check_vmflag_io(void *addr) if (!flaglen) return false; - if (flaglen == strlen("io") && !memcmp(flags, "io", flaglen)) + if (flaglen == strlen(flag) && !memcmp(flags, flag, flaglen)) return true; flags += flaglen; } } +bool check_vmflag_io(void *addr) +{ + return check_vmflag(addr, "io"); +} + +bool check_vmflag_pfnmap(void *addr) +{ + return check_vmflag(addr, "pf"); +} + /* * Open an fd at /proc/$pid/maps and configure procmap_out ready for * PROCMAP_QUERY query. Returns 0 on success, or an error code otherwise. diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index b55d1809debc..1843ad48d32b 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -93,6 +93,7 @@ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, bool miss, bool wp, bool minor, uint64_t *ioctls); unsigned long get_free_hugepages(void); bool check_vmflag_io(void *addr); +bool check_vmflag_pfnmap(void *addr); int open_procmap(pid_t pid, struct procmap_fd *procmap_out); int query_procmap(struct procmap_fd *procmap); bool find_vma_procmap(struct procmap_fd *procmap, void *address); From cc483b328881bbccb55265a86731384d5176fe85 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 4 Aug 2025 16:33:48 -0700 Subject: [PATCH 009/372] mm: limit the scope of vma_start_read() Limit the scope of vma_start_read() as it is used only as a helper for higher-level locking functions implemented inside mmap_lock.c and we are about to introduce more complex RCU rules for this function. The change is pure code refactoring and has no functional changes. 
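For context, vma_start_read() only exists to back higher-level helpers such as lock_vma_under_rcu(), whose callers follow roughly this shape (a condensed sketch of the fault-path usage, not the exact code):

	struct vm_area_struct *vma;

	vma = lock_vma_under_rcu(mm, address);	/* RCU handled internally */
	if (!vma)
		goto take_mmap_lock;	/* fall back to mmap_read_lock() */

	/* ... handle the fault under the per-VMA lock ... */
	vma_end_read(vma);

Keeping vma_start_read() private to mmap_lock.c means only these wrappers need to honor its (soon more complex) RCU rules.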
Link: https://lkml.kernel.org/r/20250804233349.1278678-1-surenb@google.com Suggested-by: Vlastimil Babka Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 85 --------------------------------------- mm/mmap_lock.c | 85 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 85 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 11a078de9150..2c9fffa58714 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -147,91 +147,6 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) } } -/* - * Try to read-lock a vma. The function is allowed to occasionally yield false - * locked result to avoid performance overhead, in which case we fall back to - * using mmap_lock. The function should never yield false unlocked result. - * False locked result is possible if mm_lock_seq overflows or if vma gets - * reused and attached to a different mm before we lock it. - * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got - * detached. - * - * WARNING! The vma passed to this function cannot be used if the function - * fails to lock it because in certain cases RCU lock is dropped and then - * reacquired. Once RCU lock is dropped the vma can be concurently freed. - */ -static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - int oldcnt; - - /* - * Check before locking. A race might cause false locked result. - * We can use READ_ONCE() for the mm_lock_seq here, and don't need - * ACQUIRE semantics, because this is just a lockless check whose result - * we don't rely on for anything - the mm_lock_seq read against which we - * need ordering is below. - */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) - return NULL; - - /* - * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() - * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. - * Acquire fence is required here to avoid reordering against later - * vm_lock_seq check and checks inside lock_vma_under_rcu(). - */ - if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, - VMA_REF_LIMIT))) { - /* return EAGAIN if vma got detached from under us */ - return oldcnt ? NULL : ERR_PTR(-EAGAIN); - } - - rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); - - /* - * If vma got attached to another mm from under us, that mm is not - * stable and can be freed in the narrow window after vma->vm_refcnt - * is dropped and before rcuwait_wake_up(mm) is called. Grab it before - * releasing vma->vm_refcnt. - */ - if (unlikely(vma->vm_mm != mm)) { - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ - struct mm_struct *other_mm = vma->vm_mm; - - /* - * __mmdrop() is a heavy operation and we don't need RCU - * protection here. Release RCU lock during these operations. - * We reinstate the RCU read lock as the caller expects it to - * be held when this function returns even on error. - */ - rcu_read_unlock(); - mmgrab(other_mm); - vma_refcount_put(vma); - mmdrop(other_mm); - rcu_read_lock(); - return NULL; - } - - /* - * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. - * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq - * modification invalidates all existing locks. 
- * - * We must use ACQUIRE semantics for the mm_lock_seq so that if we are - * racing with vma_end_write_all(), we only start reading from the VMA - * after it has been unlocked. - * This pairs with RELEASE semantics in vma_end_write_all(). - */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { - vma_refcount_put(vma); - return NULL; - } - - return vma; -} - /* * Use only while holding mmap read lock which guarantees that locking will not * fail (nobody can concurrently write-lock the vma). vma_start_read() should diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index b006cec8e6fe..10826f347a9f 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -127,6 +127,91 @@ void vma_mark_detached(struct vm_area_struct *vma) } } +/* + * Try to read-lock a vma. The function is allowed to occasionally yield false + * locked result to avoid performance overhead, in which case we fall back to + * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. + * + * WARNING! The vma passed to this function cannot be used if the function + * fails to lock it because in certain cases RCU lock is dropped and then + * reacquired. Once RCU lock is dropped the vma can be concurently freed. + */ +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) +{ + int oldcnt; + + /* + * Check before locking. A race might cause false locked result. + * We can use READ_ONCE() for the mm_lock_seq here, and don't need + * ACQUIRE semantics, because this is just a lockless check whose result + * we don't rely on for anything - the mm_lock_seq read against which we + * need ordering is below. + */ + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) + return NULL; + + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); + + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. + */ + if (unlikely(vma->vm_mm != mm)) { + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *other_mm = vma->vm_mm; + + /* + * __mmdrop() is a heavy operation and we don't need RCU + * protection here. Release RCU lock during these operations. + * We reinstate the RCU read lock as the caller expects it to + * be held when this function returns even on error. + */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + rcu_read_lock(); + return NULL; + } + + /* + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. 
+ * False unlocked result is impossible because we modify and check + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq + * modification invalidates all existing locks. + * + * We must use ACQUIRE semantics for the mm_lock_seq so that if we are + * racing with vma_end_write_all(), we only start reading from the VMA + * after it has been unlocked. + * This pairs with RELEASE semantics in vma_end_write_all(). + */ + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { + vma_refcount_put(vma); + return NULL; + } + + return vma; +} + /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the From 0b16f8bed19c6af82233cb57d01cfc944cce8fb7 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 4 Aug 2025 16:33:49 -0700 Subject: [PATCH 010/372] mm: change vma_start_read() to drop RCU lock on failure vma_start_read() can drop and reacquire RCU lock in certain failure cases. It's not apparent that the RCU session started by the caller of this function might be interrupted when vma_start_read() fails to lock the vma. This might become a source of subtle bugs and to prevent that we change the locking rules for vma_start_read() to drop RCU read lock upon failure. This way it's more obvious that RCU-protected objects are unsafe after vma locking fails. Link: https://lkml.kernel.org/r/20250804233349.1278678-2-surenb@google.com Suggested-by: Vlastimil Babka Signed-off-by: Suren Baghdasaryan Tested-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Jann Horn Cc: Liam Howlett Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap_lock.c | 84 +++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 10826f347a9f..0a0db5849b8e 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -136,15 +136,16 @@ void vma_mark_detached(struct vm_area_struct *vma) * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. * - * WARNING! The vma passed to this function cannot be used if the function - * fails to lock it because in certain cases RCU lock is dropped and then - * reacquired. Once RCU lock is dropped the vma can be concurently freed. + * IMPORTANT: RCU lock must be held upon entering the function, but upon error + * IT IS RELEASED. The caller must handle this correctly. */ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, struct vm_area_struct *vma) { + struct mm_struct *other_mm; int oldcnt; + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held"); /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need @@ -152,8 +153,10 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. 
*/ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) - return NULL; + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) { + vma = NULL; + goto err; + } /* * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() @@ -164,34 +167,14 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, VMA_REF_LIMIT))) { /* return EAGAIN if vma got detached from under us */ - return oldcnt ? NULL : ERR_PTR(-EAGAIN); + vma = oldcnt ? NULL : ERR_PTR(-EAGAIN); + goto err; } rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); - /* - * If vma got attached to another mm from under us, that mm is not - * stable and can be freed in the narrow window after vma->vm_refcnt - * is dropped and before rcuwait_wake_up(mm) is called. Grab it before - * releasing vma->vm_refcnt. - */ - if (unlikely(vma->vm_mm != mm)) { - /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ - struct mm_struct *other_mm = vma->vm_mm; - - /* - * __mmdrop() is a heavy operation and we don't need RCU - * protection here. Release RCU lock during these operations. - * We reinstate the RCU read lock as the caller expects it to - * be held when this function returns even on error. - */ - rcu_read_unlock(); - mmgrab(other_mm); - vma_refcount_put(vma); - mmdrop(other_mm); - rcu_read_lock(); - return NULL; - } + if (unlikely(vma->vm_mm != mm)) + goto err_unstable; /* * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. @@ -206,10 +189,31 @@ static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); - return NULL; + vma = NULL; + goto err; } return vma; +err: + rcu_read_unlock(); + + return vma; +err_unstable: + /* + * If vma got attached to another mm from under us, that mm is not + * stable and can be freed in the narrow window after vma->vm_refcnt + * is dropped and before rcuwait_wake_up(mm) is called. Grab it before + * releasing vma->vm_refcnt. + */ + other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */ + + /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */ + rcu_read_unlock(); + mmgrab(other_mm); + vma_refcount_put(vma); + mmdrop(other_mm); + + return NULL; } /* @@ -223,11 +227,13 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, MA_STATE(mas, &mm->mm_mt, address, address); struct vm_area_struct *vma; - rcu_read_lock(); retry: + rcu_read_lock(); vma = mas_walk(&mas); - if (!vma) + if (!vma) { + rcu_read_unlock(); goto inval; + } vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { @@ -247,18 +253,17 @@ retry: * From here on, we can access the VMA without worrying about which * fields are accessible for RCU readers. */ + rcu_read_unlock(); /* Check if the vma we locked is the right one. 
*/ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) - goto inval_end_read; + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { + vma_end_read(vma); + goto inval; + } - rcu_read_unlock(); return vma; -inval_end_read: - vma_end_read(vma); inval: - rcu_read_unlock(); count_vm_vma_lock_event(VMA_LOCK_ABORT); return NULL; } @@ -313,6 +318,7 @@ retry: */ if (PTR_ERR(vma) == -EAGAIN) { /* reset to search from the last address */ + rcu_read_lock(); vma_iter_set(vmi, from_addr); goto retry; } @@ -342,9 +348,9 @@ retry: return vma; fallback_unlock: + rcu_read_unlock(); vma_end_read(vma); fallback: - rcu_read_unlock(); vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr); rcu_read_lock(); /* Reinitialize the iterator after re-entering rcu read section */ From b25786b4a9819419a317994dabb6ecb409e8114b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 7 Aug 2025 00:17:46 +0800 Subject: [PATCH 011/372] mm, swap: only scan one cluster in fragment list Patch series "mm, swap: improve cluster scan strategy", v2. This series improves the large allocation performance and reduces the failure rate. Some design of the cluster allocator was later found to be improvable after thorough testing. The allocator spent too much effort scanning the fragment list, which is not helpful in most setups, but causes serious contention of the list lock (si->lock). Besides, the allocator prefers free clusters when searching for a new cluster due to historical reasons, which causes fragmentation issues. So make the allocator only scan one cluster for high order allocation, and prefer nonfull clusters. This both improves the performance and reduces fragmentation. For example, build kernel test with make -j96 and 10G ZRAM with 64kB mTHP enabled shows better performance and a lower failure rate: Before: sys time: 11609.69s 64kB/swpout: 1787051 64kB/swpout_fallback: 20917 After: sys time: 5587.53s 64kB/swpout: 1811598 64kB/swpout_fallback: 0 System time is cut in half, and the failure rate drops to zero. Larger allocations in a hybrid workload also showed a major improvement: 512kB swap failure rate: Before: swpout:11663 swpout_fallback:1767 After: swpout:14480 swpout_fallback:6 2M swap failure rate: Before: swpout:24 swpout_fallback:1442 After: swpout:1329 swpout_fallback:7 This patch (of 3): Fragment clusters were mostly failing high order allocation already. The reason we scan through it now is that a swap slot may get freed without releasing the swap cache, so a swap map entry will end up in HAS_CACHE only status, and the cluster won't be moved back to the non-full or free cluster list. This may cause a higher allocation failure rate. Usually only !SWP_SYNCHRONOUS_IO devices may have a large number of slots stuck in HAS_CACHE only status. Because when a !SWP_SYNCHRONOUS_IO device's usage is low (!vm_swap_full()), it will try to lazy free the swap cache. But this exhaustive fragment list scan is a bit overkill. Fragmentation is only an issue for the allocator when the device is getting full, and by that time, swap will be releasing the swap cache aggressively already. Only scanning one fragment cluster at a time is good enough to reclaim already pinned slots, and move the cluster back to nonfull. And besides, only high order allocation requires iterating over the list; order 0 allocation will succeed on the first attempt. And high order allocation failure isn't a serious problem.
So the benefit of iterating the fragment clusters is trivial, but it will slow down large allocations by a lot when the fragment cluster list is long. So it's better to drop this fragment cluster iteration design. Test on a 48c96t system, build linux kernel using 10G ZRAM, make -j48, defconfig with 768M cgroup memory limit, on top of tmpfs, 4K folio only: Before: sys time: 4432.56s After: sys time: 4430.18s Change to make -j96, 2G memory limit, 64kB mTHP enabled, and 10G ZRAM: Before: sys time: 11609.69s 64kB/swpout: 1787051 64kB/swpout_fallback: 20917 After: sys time: 5572.85s 64kB/swpout: 1797612 64kB/swpout_fallback: 19254 Change to 8G ZRAM: Before: sys time: 21524.35s 64kB/swpout: 1687142 64kB/swpout_fallback: 128496 After: sys time: 6278.45s 64kB/swpout: 1679127 64kB/swpout_fallback: 130942 Change to use 10G brd device with SWP_SYNCHRONOUS_IO flag removed: Before: sys time: 7393.50s 64kB/swpout:1788246 swpout_fallback: 0 After: sys time: 7399.88s 64kB/swpout:1784257 swpout_fallback: 0 Change to use 8G brd device with SWP_SYNCHRONOUS_IO flag removed: Before: sys time: 26292.26s 64kB/swpout:1645236 swpout_fallback: 138945 After: sys time: 9463.16s 64kB/swpout:1581376 swpout_fallback: 259979 The performance is a lot better for large folios, and the large order allocation failure rate is only very slightly higher or unchanged even for !SWP_SYNCHRONOUS_IO devices under high pressure. Link: https://lkml.kernel.org/r/20250806161748.76651-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250806161748.76651-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Nhat Pham Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kairui Song Cc: Kemeng Shi Signed-off-by: Andrew Morton --- mm/swapfile.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index b4f3cc712580..1f1110e37f68 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -926,32 +926,25 @@ new_cluster: swap_reclaim_full_clusters(si, false); if (order < PMD_ORDER) { - unsigned int frags = 0, frags_existing; - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; - /* Clusters failed to allocate are moved to frag_clusters */ - frags++; } - frags_existing = atomic_long_read(&si->frag_cluster_nr[order]); - while (frags < frags_existing && - (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) { - atomic_long_dec(&si->frag_cluster_nr[order]); - /* - * Rotate the frag list to iterate, they were all - * failing high order allocation or moved here due to - * per-CPU usage, but they could contain newly released - * reclaimable (eg. lazy-freed swap cache) slots. - */ + /* + * Scanning only one fragment cluster is good enough. Order 0 + * allocation will surely succeed, and large allocation + * failure is not critical. Scanning one cluster still + * keeps the list rotated and reclaimed (for HAS_CACHE). + */ + ci = isolate_lock_cluster(si, &si->frag_clusters[order]); + if (ci) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), order, usage); if (found) goto done; - frags++; } } From 913fff314547c1922002e655bb25199ee38e8825 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 7 Aug 2025 00:17:47 +0800 Subject: [PATCH 012/372] mm, swap: remove fragment clusters counter It was used for calculating the iteration number when the swap allocator wants to scan the whole fragment list.
Now the allocator only scans one fragment cluster at a time, so no one uses this counter anymore. Remove it as a cleanup; the performance change is marginal: Build linux kernel using 10G ZRAM, make -j96, defconfig with 2G cgroup memory limit, on top of tmpfs, 64kB mTHP enabled: Before: sys time: 6278.45s After: sys time: 6176.34s Change to 8G ZRAM: Before: sys time: 5572.85s After: sys time: 5531.49s Link: https://lkml.kernel.org/r/20250806161748.76651-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Nhat Pham Acked-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/swapfile.c | 7 ------- 2 files changed, 8 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fe6ed2cc3fd..a060d102e0d1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -310,7 +310,6 @@ struct swap_info_struct { /* list of cluster that contains at least one free slot */ struct list_head frag_clusters[SWAP_NR_ORDERS]; /* list of cluster that are fragmented or contented */ - atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS]; unsigned int pages; /* total of usable pages of swap */ atomic_long_t inuse_pages; /* number of those currently in use */ struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f1110e37f68..5fdb3cb2b8b7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -470,11 +470,6 @@ static void move_cluster(struct swap_info_struct *si, else list_move_tail(&ci->list, list); spin_unlock(&si->lock); - - if (ci->flags == CLUSTER_FLAG_FRAG) - atomic_long_dec(&si->frag_cluster_nr[ci->order]); - else if (new_flags == CLUSTER_FLAG_FRAG) - atomic_long_inc(&si->frag_cluster_nr[ci->order]); ci->flags = new_flags; } @@ -965,7 +960,6 @@ new_cluster: * allocation, but reclaim may drop si->lock and race with another user. */ while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { - atomic_long_dec(&si->frag_cluster_nr[o]); found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), 0, usage); if (found) @@ -3217,7 +3211,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < SWAP_NR_ORDERS; i++) { INIT_LIST_HEAD(&si->nonfull_clusters[i]); INIT_LIST_HEAD(&si->frag_clusters[i]); - atomic_long_set(&si->frag_cluster_nr[i], 0); } /* From 9a42aed48421390155037460a3f2fd510fa6df76 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Thu, 7 Aug 2025 00:17:48 +0800 Subject: [PATCH 013/372] mm, swap: prefer nonfull over free clusters We prefer a free cluster over a nonfull cluster whenever a CPU local cluster is drained to respect the SSD discard behavior [1]. It's not a best practice for non-discarding devices. And this is causing a higher fragmentation rate. So for a non-discarding device, prefer nonfull over free clusters. This reduces the fragmentation issue by a lot. Testing with make -j96, defconfig, using 64k mTHP, 8G ZRAM: Before: sys time: 6176.34s 64kB/swpout: 1659757 64kB/swpout_fallback: 139503 After: sys time: 6194.11s 64kB/swpout: 1689470 64kB/swpout_fallback: 56147 Testing with make -j96, defconfig, using 64k mTHP, 10G ZRAM: Before: sys time: 5531.49s 64kB/swpout: 1791142 64kB/swpout_fallback: 17676 After: sys time: 5587.53s 64kB/swpout: 1811598 64kB/swpout_fallback: 0 Performance is basically unchanged, and the large allocation failure rate is lower. Enabling all mTHP sizes showed a more significant result.
Using the same test setup with 10G ZRAM and enabling all mTHP sizes: 128kB swap failure rate: Before: swpout:451599 swpout_fallback:54525 After: swpout:502710 swpout_fallback:870 256kB swap failure rate: Before: swpout:63652 swpout_fallback:2708 After: swpout:65913 swpout_fallback:20 512kB swap failure rate: Before: swpout:11663 swpout_fallback:1767 After: swpout:14480 swpout_fallback:6 2M swap failure rate: Before: swpout:24 swpout_fallback:1442 After: swpout:1329 swpout_fallback:7 The success rate of large allocations is much higher. Link: https://lore.kernel.org/linux-mm/87v8242vng.fsf@yhuang6-desk2.ccr.corp.intel.com/ [1] Link: https://lkml.kernel.org/r/20250806161748.76651-4-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Nhat Pham Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton --- mm/swapfile.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 5fdb3cb2b8b7..4a0cf4fb348d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -908,18 +908,20 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o } new_cluster: - ci = isolate_lock_cluster(si, &si->free_clusters); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; + /* + * If the device needs discard, prefer new cluster over nonfull + * to spread out the writes. + */ + if (si->flags & SWP_PAGE_DISCARD) { + ci = isolate_lock_cluster(si, &si->free_clusters); + if (ci) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; + } } - /* Try reclaim from full clusters if free clusters list is drained */ - if (vm_swap_full()) - swap_reclaim_full_clusters(si, false); - if (order < PMD_ORDER) { while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), @@ -927,7 +929,23 @@ new_cluster: if (found) goto done; } + } + if (!(si->flags & SWP_PAGE_DISCARD)) { + ci = isolate_lock_cluster(si, &si->free_clusters); + if (ci) { + found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), + order, usage); + if (found) + goto done; + } + } + + /* Try reclaim full clusters if free and nonfull lists are drained */ + if (vm_swap_full()) + swap_reclaim_full_clusters(si, false); + + if (order < PMD_ORDER) { /* * Scanning only one fragment cluster is good enough. Order 0 * allocation will surely succeed, and large allocation From 6de1ef1ca39a07fe1c17f3f73a74106543eae127 Mon Sep 17 00:00:00 2001 From: Pranav Tyagi Date: Wed, 30 Jul 2025 19:53:01 +0530 Subject: [PATCH 014/372] selftests/mm: use __auto_type in swap() macro Replace typeof() with __auto_type in the swap() macro in uffd-stress.c. __auto_type was introduced in GCC 4.9 and reduces the compile time for all compilers. No functional changes intended.
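For illustration, both spellings of the macro behave the same here; per the GCC documentation, the operand of __auto_type is expanded only once in the macro body, which keeps the expansion smaller (a toy example, not from the selftest):

	#define swap_typeof(a, b) \
		do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
	#define swap_auto(a, b) \
		do { __auto_type __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

	int main(void)
	{
		int x = 1, y = 2;

		swap_auto(x, y);	/* __tmp is deduced as int */
		return !(x == 2 && y == 1);
	}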
Link: https://lkml.kernel.org/r/20250730142301.6754-1-pranav.tyagi03@gmail.com Signed-off-by: Pranav Tyagi Reviewed-by: Joshua Hahn Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 40af7f67c407..c0f64df5085c 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -51,7 +51,7 @@ static char *zeropage; pthread_attr_t attr; #define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + do { __auto_type __tmp = (a); (a) = (b); (b) = __tmp; } while (0) const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" From 61dc4358d37ae0be3220a0fa32cf7f0ccd4f7636 Mon Sep 17 00:00:00 2001 From: "Adrian Huang (Lenovo)" Date: Wed, 6 Aug 2025 22:59:06 +0800 Subject: [PATCH 015/372] mm: correct misleading comment on mmap_lock field in mm_struct The comment previously described the offset of mmap_lock as 0x120 (hex), which is misleading. The correct offset is 56 bytes (decimal) from the last cache line boundary. Using '0x120' could confuse readers trying to understand why the count and owner fields reside in separate cachelines. This change also removes an unnecessary space for improved formatting. Link: https://lkml.kernel.org/r/20250806145906.24647-1-adrianhuang0701@gmail.com Signed-off-by: Adrian Huang (Lenovo) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 08bc2442db93..3ed763e7ec6f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1026,10 +1026,10 @@ struct mm_struct { * counters */ /* - * With some kernel config, the current mmap_lock's offset - * inside 'mm_struct' is at 0x120, which is very optimal, as + * Typically the current mmap_lock's offset is 56 bytes from + * the last cacheline boundary, which is very optimal, as * its two hot fields 'count' and 'owner' sit in 2 different - * cachelines, and when mmap_lock is highly contended, both + * cachelines, and when mmap_lock is highly contended, both * of the 2 fields will be accessed frequently, current layout * will help to reduce cache bouncing. * From 4c5d3365882dbbc0784688784904f440d7a4c0f1 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Wed, 6 Aug 2025 14:41:08 +0200 Subject: [PATCH 016/372] mm/vmalloc: allow to set node and align in vrealloc Patch series "support large align and nid in Rust allocators", v15. The series provides the ability for Rust allocators to set NUMA node and large alignment. This patch (of 4): Reimplement vrealloc() to be able to set node and alignment should a user need to do so. Rename the function to vrealloc_node_align() to better match what it actually does now and introduce macros for vrealloc() and friends for backward compatibility. With that change we also provide the ability for the Rust part of the kernel to set node and alignment in its allocations. 
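A usage sketch of the resulting API (the buffer, size and nid here are hypothetical; the @align and __GFP_THISNODE semantics follow the kernel-doc added below):

	/* grow a node-local buffer while keeping a large alignment */
	buf = vrealloc_node_align(buf, new_size, PMD_SIZE,
				  GFP_KERNEL | __GFP_THISNODE, nid);
	if (!buf)
		return -ENOMEM;

Per the new kernel-doc, requesting a bigger alignment than the existing allocation has will fail, and a node mismatch under __GFP_THISNODE forces an actual reallocation.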
Link: https://lkml.kernel.org/r/20250806124034.1724515-1-vitaly.wool@konsulko.se Link: https://lkml.kernel.org/r/20250806124108.1724561-1-vitaly.wool@konsulko.se Signed-off-by: Vitaly Wool Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Vlastimil Babka Cc: Alice Ryhl Cc: Danilo Krummrich Cc: Herbert Xu Cc: Jann Horn Cc: Kent Overstreet Cc: Liam Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 12 +++++++++--- mm/nommu.c | 3 ++- mm/vmalloc.c | 29 ++++++++++++++++++++++++----- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 2759dac6be44..eb54b7b3202f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -197,9 +197,15 @@ extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1 extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); #define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) -void * __must_check vrealloc_noprof(const void *p, size_t size, gfp_t flags) - __realloc_size(2); -#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) +void *__must_check vrealloc_node_align_noprof(const void *p, size_t size, + unsigned long align, gfp_t flags, int nid) __realloc_size(2); +#define vrealloc_node_noprof(_p, _s, _f, _nid) \ + vrealloc_node_align_noprof(_p, _s, 1, _f, _nid) +#define vrealloc_noprof(_p, _s, _f) \ + vrealloc_node_align_noprof(_p, _s, 1, _f, NUMA_NO_NODE) +#define vrealloc_node_align(...) alloc_hooks(vrealloc_node_align_noprof(__VA_ARGS__)) +#define vrealloc_node(...) alloc_hooks(vrealloc_node_noprof(__VA_ARGS__)) +#define vrealloc(...) alloc_hooks(vrealloc_noprof(__VA_ARGS__)) extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/mm/nommu.c b/mm/nommu.c index 8b819fafd57b..6fff462f90d3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -119,7 +119,8 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) } EXPORT_SYMBOL(__vmalloc_noprof); -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int node) { return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6dbcdceecae1..e299b51bd922 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -4089,19 +4089,29 @@ void *vzalloc_node_noprof(unsigned long size, int node) EXPORT_SYMBOL(vzalloc_node_noprof); /** - * vrealloc - reallocate virtually contiguous memory; contents remain unchanged + * vrealloc_node_align_noprof - reallocate virtually contiguous memory; contents + * remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: requested alignment * @flags: the flags for the page level allocator + * @nid: node number of the target node * - * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and - * @p is not a %NULL pointer, the object pointed to is freed. + * If @p is %NULL, vrealloc_XXX() behaves exactly like vmalloc_XXX(). If @size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. + * + * If the caller wants the new memory to be on specific node *only*, + * __GFP_THISNODE flag should be set, otherwise the function will try to avoid + * reallocation and possibly disregard the specified @nid. 
* * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that * __GFP_ZERO is not fully honored by this API. * + * Requesting an alignment that is bigger than the alignment of the existing + * allocation will fail. + * * In any case, the contents of the object pointed to are preserved up to the * lesser of the new and old sizes. * @@ -4111,7 +4121,8 @@ EXPORT_SYMBOL(vzalloc_node_noprof); * Return: pointer to the allocated memory; %NULL if @size is zero or in case of * failure */ -void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *vrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { struct vm_struct *vm = NULL; size_t alloced_size = 0; @@ -4135,6 +4146,12 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) if (WARN(alloced_size < old_size, "vrealloc() has mismatched area vs requested sizes (%p)\n", p)) return NULL; + if (WARN(!IS_ALIGNED((unsigned long)p, align), + "will not reallocate with a bigger alignment (0x%lx)\n", align)) + return NULL; + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(vmalloc_to_page(p))) + goto need_realloc; } /* @@ -4165,8 +4182,10 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags) return (void *)p; } +need_realloc: /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */ - n = __vmalloc_noprof(size, flags); + n = __vmalloc_node_noprof(size, align, flags, nid, __builtin_return_address(0)); + if (!n) return NULL; From 2cd8231796b5e7133b1c3d66ad7d2a3c42c97258 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Wed, 6 Aug 2025 14:41:47 +0200 Subject: [PATCH 017/372] mm/slub: allow to set node and align in k[v]realloc Reimplement k[v]realloc_node() to be able to set node and alignment should a user need to do so. In order to do that while retaining the maximal backward compatibility, add k[v]realloc_node_align() functions and redefine the rest of API using these new ones. While doing that, we also keep the number of _noprof variants to a minimum, which implies some changes to the existing users of older _noprof functions, that basically being bcachefs. With that change we also provide the ability for the Rust part of the kernel to set node and alignment in its K[v]xxx [re]allocations. Link: https://lkml.kernel.org/r/20250806124147.1724658-1-vitaly.wool@konsulko.se Signed-off-by: Vitaly Wool Reviewed-by: Vlastimil Babka Cc: Alice Ryhl Cc: Danilo Krummrich Cc: Herbert Xu Cc: Jann Horn Cc: Kent Overstreet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton --- fs/bcachefs/darray.c | 2 +- fs/bcachefs/util.h | 2 +- include/linux/bpfptr.h | 2 +- include/linux/slab.h | 39 +++++++++++++++++----------- lib/rhashtable.c | 4 +-- mm/slub.c | 59 ++++++++++++++++++++++++++++++++---------- 6 files changed, 74 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c index e86d36d23e9e..928e83a1ce42 100644 --- a/fs/bcachefs/darray.c +++ b/fs/bcachefs/darray.c @@ -21,7 +21,7 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_ return -ENOMEM; void *data = likely(bytes < INT_MAX) - ? kvmalloc_noprof(bytes, gfp) + ? 
kvmalloc_node_align_noprof(bytes, 1, gfp, NUMA_NO_NODE) : vmalloc_noprof(bytes); if (!data) return -ENOMEM; diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 6488f098d140..7112fd40ee21 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -61,7 +61,7 @@ static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) { void *p = unlikely(n >= INT_MAX) ? vmalloc_noprof(n) - : kvmalloc_noprof(n, flags & ~__GFP_ZERO); + : kvmalloc_node_align_noprof(n, 1, flags & ~__GFP_ZERO, NUMA_NO_NODE); if (p && (flags & __GFP_ZERO)) memset(p, 0, n); return p; diff --git a/include/linux/bpfptr.h b/include/linux/bpfptr.h index 1af241525a17..f6e0795db484 100644 --- a/include/linux/bpfptr.h +++ b/include/linux/bpfptr.h @@ -67,7 +67,7 @@ static inline int copy_to_bpfptr_offset(bpfptr_t dst, size_t offset, static inline void *kvmemdup_bpfptr_noprof(bpfptr_t src, size_t len) { - void *p = kvmalloc_noprof(len, GFP_USER | __GFP_NOWARN); + void *p = kvmalloc_node_align_noprof(len, 1, GFP_USER | __GFP_NOWARN, NUMA_NO_NODE); if (!p) return ERR_PTR(-ENOMEM); diff --git a/include/linux/slab.h b/include/linux/slab.h index d5a8ab98035c..6dc300bac2a1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -465,9 +465,13 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc_noprof(const void *objp, size_t new_size, - gfp_t flags) __realloc_size(2); -#define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__)) +void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size, + unsigned long align, + gfp_t flags, int nid) __realloc_size(2); +#define krealloc_noprof(_o, _s, _f) krealloc_node_align_noprof(_o, _s, 1, _f, NUMA_NO_NODE) +#define krealloc_node_align(...) alloc_hooks(krealloc_node_align_noprof(__VA_ARGS__)) +#define krealloc_node(_o, _s, _f, _n) krealloc_node_align(_o, _s, 1, _f, _n) +#define krealloc(...) krealloc_node(__VA_ARGS__, NUMA_NO_NODE) void kfree(const void *objp); void kfree_sensitive(const void *objp); @@ -1041,18 +1045,20 @@ static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) #define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) #define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1); -#define kvmalloc_node_noprof(size, flags, node) \ - __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node) -#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) - -#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) -#define kvmalloc_noprof(_size, _flags) kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) __alloc_size(1); +#define kvmalloc_node_align_noprof(_size, _align, _flags, _node) \ + __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, NULL), _align, _flags, _node) +#define kvmalloc_node_align(...) \ + alloc_hooks(kvmalloc_node_align_noprof(__VA_ARGS__)) +#define kvmalloc_node(_s, _f, _n) kvmalloc_node_align(_s, 1, _f, _n) +#define kvmalloc(...) 
kvmalloc_node(__VA_ARGS__, NUMA_NO_NODE) #define kvzalloc(_size, _flags) kvmalloc(_size, (_flags)|__GFP_ZERO) #define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node) + #define kmem_buckets_valloc(_b, _size, _flags) \ - alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE)) + alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), 1, _flags, NUMA_NO_NODE)) static inline __alloc_size(1, 2) void * kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) @@ -1062,7 +1068,7 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; - return kvmalloc_node_noprof(bytes, flags, node); + return kvmalloc_node_align_noprof(bytes, 1, flags, node); } #define kvmalloc_array_noprof(...) kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE) @@ -1073,9 +1079,12 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) #define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) - __realloc_size(2); -#define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) __realloc_size(2); +#define kvrealloc_node_align(...) \ + alloc_hooks(kvrealloc_node_align_noprof(__VA_ARGS__)) +#define kvrealloc_node(_p, _s, _f, _n) kvrealloc_node_align(_p, _s, 1, _f, _n) +#define kvrealloc(...) kvrealloc_node(__VA_ARGS__, NUMA_NO_NODE) extern void kvfree(const void *addr); DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T)) diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 3e555d012ed6..fde0f0e556f8 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -184,8 +184,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, static struct lock_class_key __key; tbl = alloc_hooks_tag(ht->alloc_tag, - kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), - gfp|__GFP_ZERO, NUMA_NO_NODE)); + kvmalloc_node_align_noprof(struct_size(tbl, buckets, nbuckets), + 1, gfp|__GFP_ZERO, NUMA_NO_NODE)); size = nbuckets; diff --git a/mm/slub.c b/mm/slub.c index 30003763d224..8dbeabc6a0f0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4881,7 +4881,7 @@ void kfree(const void *object) EXPORT_SYMBOL(kfree); static __always_inline __realloc_size(2) void * -__do_krealloc(const void *p, size_t new_size, gfp_t flags) +__do_krealloc(const void *p, size_t new_size, unsigned long align, gfp_t flags, int nid) { void *ret; size_t ks = 0; @@ -4895,6 +4895,16 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (!kasan_check_byte(p)) return NULL; + /* + * If reallocation is not necessary (e. g. the new size is less + * than the current allocated size), the current allocation will be + * preserved unless __GFP_THISNODE is set. In the latter case a new + * allocation on the requested node will be attempted. 
+ */ + if (unlikely(flags & __GFP_THISNODE) && nid != NUMA_NO_NODE && + nid != page_to_nid(virt_to_page(p))) + goto alloc_new; + if (is_kfence_address(p)) { ks = orig_size = kfence_ksize(p); } else { @@ -4917,6 +4927,10 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) if (new_size > ks) goto alloc_new; + /* If the old object doesn't satisfy the new alignment, allocate a new one */ + if (!IS_ALIGNED((unsigned long)p, align)) + goto alloc_new; + /* Zero out spare memory. */ if (want_init_on_alloc(flags)) { kasan_disable_current(); @@ -4939,7 +4953,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; alloc_new: - ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); + ret = kmalloc_node_track_caller_noprof(new_size, flags, nid, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. */ kasan_disable_current(); @@ -4951,14 +4965,19 @@ alloc_new: } /** - * krealloc - reallocate memory. The contents will remain unchanged. + * krealloc_node_align - reallocate memory. The contents will remain unchanged. * @p: object to reallocate memory for. * @new_size: how many bytes of memory are required. + * @align: desired alignment. * @flags: the type of memory to allocate. + * @nid: NUMA node or NUMA_NO_NODE * * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that @@ -4983,7 +5002,8 @@ alloc_new: * * Return: pointer to the allocated memory or %NULL in case of error */ -void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) +void *krealloc_node_align_noprof(const void *p, size_t new_size, unsigned long align, + gfp_t flags, int nid) { void *ret; @@ -4992,13 +5012,13 @@ void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) return ZERO_SIZE_PTR; } - ret = __do_krealloc(p, new_size, flags); + ret = __do_krealloc(p, new_size, align, flags, nid); if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret)) kfree(p); return ret; } -EXPORT_SYMBOL(krealloc_noprof); +EXPORT_SYMBOL(krealloc_node_align_noprof); static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) { @@ -5029,9 +5049,13 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @b: which set of kmalloc buckets to allocate from. + * @align: desired alignment. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. * @node: numa node to allocate from * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. 
* @@ -5041,7 +5065,8 @@ static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), unsigned long align, + gfp_t flags, int node) { void *ret; @@ -5071,7 +5096,7 @@ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) * about the resulting pointer, and cannot play * protection games. */ - return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } @@ -5115,14 +5140,19 @@ void kvfree_sensitive(const void *addr, size_t len) EXPORT_SYMBOL(kvfree_sensitive); /** - * kvrealloc - reallocate memory; contents remain unchanged + * kvrealloc_node_align - reallocate memory; contents remain unchanged * @p: object to reallocate memory for * @size: the size to reallocate + * @align: desired alignment * @flags: the flags for the page level allocator + * @nid: NUMA node id * * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 * and @p is not a %NULL pointer, the object pointed to is freed. * + * Only alignments up to those guaranteed by kmalloc() will be honored. Please see + * Documentation/core-api/memory-allocation.rst for more details. + * * If __GFP_ZERO logic is requested, callers must ensure that, starting with the * initial memory allocation, every subsequent call to this API for the same * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that @@ -5136,17 +5166,18 @@ EXPORT_SYMBOL(kvfree_sensitive); * * Return: pointer to the allocated memory or %NULL in case of error */ -void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) +void *kvrealloc_node_align_noprof(const void *p, size_t size, unsigned long align, + gfp_t flags, int nid) { void *n; if (is_vmalloc_addr(p)) - return vrealloc_noprof(p, size, flags); + return vrealloc_node_align_noprof(p, size, align, flags, nid); - n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + n = krealloc_node_align_noprof(p, size, align, kmalloc_gfp_adjust(flags, size), nid); if (!n) { /* We failed to krealloc(), fall back to kvmalloc(). */ - n = kvmalloc_noprof(size, flags); + n = kvmalloc_node_align_noprof(size, align, flags, nid); if (!n) return NULL; @@ -5162,7 +5193,7 @@ void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) return n; } -EXPORT_SYMBOL(kvrealloc_noprof); +EXPORT_SYMBOL(kvrealloc_node_align_noprof); struct detached_freelist { struct slab *slab; From 7760b6421b6c1b49550885ecdfa9cf720ead6eed Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Wed, 6 Aug 2025 14:55:22 +0200 Subject: [PATCH 018/372] rust: add support for NUMA ids in allocations Add a new type to support specifying NUMA identifiers in Rust allocators and extend the allocators to have NUMA id as a parameter. Thus, modify ReallocFunc to use the new extended realloc primitives from the C side of the kernel (i.e. k[v]realloc_node_align/vrealloc_node_align) and add the new function alloc_node to the Allocator trait while keeping the existing one (alloc) for backward compatibility. This will allow to specify node to use for allocation of e. g. {KV}Box, as well as for future NUMA aware users of the API. 
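On the C side this maps onto the _node() entry points added by the
previous patch; roughly (a sketch, node id illustrative):

	/* NumaNode::NO_NODE corresponds to NUMA_NO_NODE: no preference. */
	p = krealloc_node(p, size, GFP_KERNEL, NUMA_NO_NODE);

	/*
	 * A concrete node id is only a hint unless __GFP_THISNODE is
	 * set, in which case a new allocation on that node is attempted.
	 */
	p = krealloc_node(p, size, GFP_KERNEL | __GFP_THISNODE, 1);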
[ojeda@kernel.org: fix missing import needed for `rusttest`] Link: https://lkml.kernel.org/r/20250816210214.2729269-1-ojeda@kernel.org Link: https://lkml.kernel.org/r/20250806125522.1726992-1-vitaly.wool@konsulko.se Signed-off-by: Vitaly Wool Signed-off-by: Miguel Ojeda Acked-by: Danilo Krummrich Acked-by: Alice Ryhl Cc: Herbert Xu Cc: Jann Horn Cc: Kent Overstreet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Miguel Ojeda Signed-off-by: Andrew Morton --- rust/helpers/slab.c | 8 ++--- rust/helpers/vmalloc.c | 4 +-- rust/kernel/alloc.rs | 54 ++++++++++++++++++++++++++--- rust/kernel/alloc/allocator.rs | 35 ++++++++++++------- rust/kernel/alloc/allocator_test.rs | 3 +- rust/kernel/alloc/kbox.rs | 4 +-- rust/kernel/alloc/kvec.rs | 11 ++++-- 7 files changed, 90 insertions(+), 29 deletions(-) diff --git a/rust/helpers/slab.c b/rust/helpers/slab.c index a842bfbddcba..8472370a4338 100644 --- a/rust/helpers/slab.c +++ b/rust/helpers/slab.c @@ -3,13 +3,13 @@ #include void * __must_check __realloc_size(2) -rust_helper_krealloc(const void *objp, size_t new_size, gfp_t flags) +rust_helper_krealloc_node(const void *objp, size_t new_size, gfp_t flags, int node) { - return krealloc(objp, new_size, flags); + return krealloc_node(objp, new_size, flags, node); } void * __must_check __realloc_size(2) -rust_helper_kvrealloc(const void *p, size_t size, gfp_t flags) +rust_helper_kvrealloc_node(const void *p, size_t size, gfp_t flags, int node) { - return kvrealloc(p, size, flags); + return kvrealloc_node(p, size, flags, node); } diff --git a/rust/helpers/vmalloc.c b/rust/helpers/vmalloc.c index 80d34501bbc0..62d30db9a1a6 100644 --- a/rust/helpers/vmalloc.c +++ b/rust/helpers/vmalloc.c @@ -3,7 +3,7 @@ #include void * __must_check __realloc_size(2) -rust_helper_vrealloc(const void *p, size_t size, gfp_t flags) +rust_helper_vrealloc_node(const void *p, size_t size, gfp_t flags, int node) { - return vrealloc(p, size, flags); + return vrealloc_node(p, size, flags, node); } diff --git a/rust/kernel/alloc.rs b/rust/kernel/alloc.rs index a2c49e5494d3..b39c279236f5 100644 --- a/rust/kernel/alloc.rs +++ b/rust/kernel/alloc.rs @@ -28,6 +28,8 @@ pub use self::kvec::Vec; /// Indicates an allocation error. #[derive(Copy, Clone, PartialEq, Eq, Debug)] pub struct AllocError; + +use crate::error::{code::EINVAL, Result}; use core::{alloc::Layout, ptr::NonNull}; /// Flags to be used when allocating memory. @@ -115,6 +117,31 @@ pub mod flags { pub const __GFP_NOWARN: Flags = Flags(bindings::__GFP_NOWARN); } +/// Non Uniform Memory Access (NUMA) node identifier. +#[derive(Clone, Copy, PartialEq)] +pub struct NumaNode(i32); + +impl NumaNode { + /// Create a new NUMA node identifier (non-negative integer). + /// + /// Returns [`EINVAL`] if a negative id or an id exceeding [`bindings::MAX_NUMNODES`] is + /// specified. + pub fn new(node: i32) -> Result { + // MAX_NUMNODES never exceeds 2**10 because NODES_SHIFT is 0..10. + if node < 0 || node >= bindings::MAX_NUMNODES as i32 { + return Err(EINVAL); + } + Ok(Self(node)) + } +} + +/// Specify necessary constant to pass the information to Allocator that the caller doesn't care +/// about the NUMA node to allocate memory from. +impl NumaNode { + /// No node preference. + pub const NO_NODE: NumaNode = NumaNode(bindings::NUMA_NO_NODE); +} + /// The kernel's [`Allocator`] trait. 
/// /// An implementation of [`Allocator`] can allocate, re-allocate and free memory buffers described @@ -137,7 +164,7 @@ pub mod flags { /// - Implementers must ensure that all trait functions abide by the guarantees documented in the /// `# Guarantees` sections. pub unsafe trait Allocator { - /// Allocate memory based on `layout` and `flags`. + /// Allocate memory based on `layout`, `flags` and `nid`. /// /// On success, returns a buffer represented as `NonNull<[u8]>` that satisfies the layout /// constraints (i.e. minimum size and alignment as specified by `layout`). @@ -153,13 +180,21 @@ pub unsafe trait Allocator { /// /// Additionally, `Flags` are honored as documented in /// . - fn alloc(layout: Layout, flags: Flags) -> Result, AllocError> { + fn alloc(layout: Layout, flags: Flags, nid: NumaNode) -> Result, AllocError> { // SAFETY: Passing `None` to `realloc` is valid by its safety requirements and asks for a // new memory allocation. - unsafe { Self::realloc(None, layout, Layout::new::<()>(), flags) } + unsafe { Self::realloc(None, layout, Layout::new::<()>(), flags, nid) } } - /// Re-allocate an existing memory allocation to satisfy the requested `layout`. + /// Re-allocate an existing memory allocation to satisfy the requested `layout` and + /// a specific NUMA node request to allocate the memory for. + /// + /// Systems employing a Non Uniform Memory Access (NUMA) architecture contain collections of + /// hardware resources including processors, memory, and I/O buses, that comprise what is + /// commonly known as a NUMA node. + /// + /// `nid` stands for NUMA id, i. e. NUMA node identifier, which is a non-negative integer + /// if a node needs to be specified, or [`NumaNode::NO_NODE`] if the caller doesn't care. /// /// If the requested size is zero, `realloc` behaves equivalent to `free`. /// @@ -196,6 +231,7 @@ pub unsafe trait Allocator { layout: Layout, old_layout: Layout, flags: Flags, + nid: NumaNode, ) -> Result, AllocError>; /// Free an existing memory allocation. @@ -211,7 +247,15 @@ pub unsafe trait Allocator { // SAFETY: The caller guarantees that `ptr` points at a valid allocation created by this // allocator. We are passing a `Layout` with the smallest possible alignment, so it is // smaller than or equal to the alignment previously used with this allocation. - let _ = unsafe { Self::realloc(Some(ptr), Layout::new::<()>(), layout, Flags(0)) }; + let _ = unsafe { + Self::realloc( + Some(ptr), + Layout::new::<()>(), + layout, + Flags(0), + NumaNode::NO_NODE, + ) + }; } } diff --git a/rust/kernel/alloc/allocator.rs b/rust/kernel/alloc/allocator.rs index 2692cf90c948..14510a9e4502 100644 --- a/rust/kernel/alloc/allocator.rs +++ b/rust/kernel/alloc/allocator.rs @@ -13,7 +13,7 @@ use core::alloc::Layout; use core::ptr; use core::ptr::NonNull; -use crate::alloc::{AllocError, Allocator}; +use crate::alloc::{AllocError, Allocator, NumaNode}; use crate::bindings; use crate::pr_warn; @@ -45,20 +45,25 @@ pub struct KVmalloc; /// # Invariants /// -/// One of the following: `krealloc`, `vrealloc`, `kvrealloc`. +/// One of the following: `krealloc_node`, `vrealloc_node`, `kvrealloc_node`. struct ReallocFunc( - unsafe extern "C" fn(*const crate::ffi::c_void, usize, u32) -> *mut crate::ffi::c_void, + unsafe extern "C" fn( + *const crate::ffi::c_void, + usize, + u32, + crate::ffi::c_int, + ) -> *mut crate::ffi::c_void, ); impl ReallocFunc { - // INVARIANT: `krealloc` satisfies the type invariants. 
- const KREALLOC: Self = Self(bindings::krealloc); + // INVARIANT: `krealloc_node` satisfies the type invariants. + const KREALLOC: Self = Self(bindings::krealloc_node); - // INVARIANT: `vrealloc` satisfies the type invariants. - const VREALLOC: Self = Self(bindings::vrealloc); + // INVARIANT: `vrealloc_node` satisfies the type invariants. + const VREALLOC: Self = Self(bindings::vrealloc_node); - // INVARIANT: `kvrealloc` satisfies the type invariants. - const KVREALLOC: Self = Self(bindings::kvrealloc); + // INVARIANT: `kvrealloc_node` satisfies the type invariants. + const KVREALLOC: Self = Self(bindings::kvrealloc_node); /// # Safety /// @@ -76,6 +81,7 @@ impl ReallocFunc { layout: Layout, old_layout: Layout, flags: Flags, + nid: NumaNode, ) -> Result, AllocError> { let size = layout.size(); let ptr = match ptr { @@ -99,7 +105,7 @@ impl ReallocFunc { // - Those functions provide the guarantees of this function. let raw_ptr = unsafe { // If `size == 0` and `ptr != NULL` the memory behind the pointer is freed. - self.0(ptr.cast(), size, flags.0).cast() + self.0(ptr.cast(), size, flags.0, nid.0).cast() }; let ptr = if size == 0 { @@ -134,11 +140,12 @@ unsafe impl Allocator for Kmalloc { layout: Layout, old_layout: Layout, flags: Flags, + nid: NumaNode, ) -> Result, AllocError> { let layout = Kmalloc::aligned_layout(layout); // SAFETY: `ReallocFunc::call` has the same safety requirements as `Allocator::realloc`. - unsafe { ReallocFunc::KREALLOC.call(ptr, layout, old_layout, flags) } + unsafe { ReallocFunc::KREALLOC.call(ptr, layout, old_layout, flags, nid) } } } @@ -153,6 +160,7 @@ unsafe impl Allocator for Vmalloc { layout: Layout, old_layout: Layout, flags: Flags, + nid: NumaNode, ) -> Result, AllocError> { // TODO: Support alignments larger than PAGE_SIZE. if layout.align() > bindings::PAGE_SIZE { @@ -162,7 +170,7 @@ unsafe impl Allocator for Vmalloc { // SAFETY: If not `None`, `ptr` is guaranteed to point to valid memory, which was previously // allocated with this `Allocator`. - unsafe { ReallocFunc::VREALLOC.call(ptr, layout, old_layout, flags) } + unsafe { ReallocFunc::VREALLOC.call(ptr, layout, old_layout, flags, nid) } } } @@ -177,6 +185,7 @@ unsafe impl Allocator for KVmalloc { layout: Layout, old_layout: Layout, flags: Flags, + nid: NumaNode, ) -> Result, AllocError> { // `KVmalloc` may use the `Kmalloc` backend, hence we have to enforce a `Kmalloc` // compatible layout. @@ -190,6 +199,6 @@ unsafe impl Allocator for KVmalloc { // SAFETY: If not `None`, `ptr` is guaranteed to point to valid memory, which was previously // allocated with this `Allocator`. 
- unsafe { ReallocFunc::KVREALLOC.call(ptr, layout, old_layout, flags) } + unsafe { ReallocFunc::KVREALLOC.call(ptr, layout, old_layout, flags, nid) } } } diff --git a/rust/kernel/alloc/allocator_test.rs b/rust/kernel/alloc/allocator_test.rs index 90dd987d40e4..2e61cdbd2303 100644 --- a/rust/kernel/alloc/allocator_test.rs +++ b/rust/kernel/alloc/allocator_test.rs @@ -9,7 +9,7 @@ #![allow(missing_docs)] -use super::{flags::*, AllocError, Allocator, Flags}; +use super::{flags::*, AllocError, Allocator, Flags, NumaNode}; use core::alloc::Layout; use core::cmp; use core::ptr; @@ -51,6 +51,7 @@ unsafe impl Allocator for Cmalloc { layout: Layout, old_layout: Layout, flags: Flags, + _nid: NumaNode, ) -> Result, AllocError> { let src = match ptr { Some(src) => { diff --git a/rust/kernel/alloc/kbox.rs b/rust/kernel/alloc/kbox.rs index 856d05aa60f1..1fef9beb57c8 100644 --- a/rust/kernel/alloc/kbox.rs +++ b/rust/kernel/alloc/kbox.rs @@ -4,7 +4,7 @@ #[allow(unused_imports)] // Used in doc comments. use super::allocator::{KVmalloc, Kmalloc, Vmalloc}; -use super::{AllocError, Allocator, Flags}; +use super::{AllocError, Allocator, Flags, NumaNode}; use core::alloc::Layout; use core::borrow::{Borrow, BorrowMut}; use core::fmt; @@ -273,7 +273,7 @@ where /// ``` pub fn new_uninit(flags: Flags) -> Result, A>, AllocError> { let layout = Layout::new::>(); - let ptr = A::alloc(layout, flags)?; + let ptr = A::alloc(layout, flags, NumaNode::NO_NODE)?; // INVARIANT: `ptr` is either a dangling pointer or points to memory allocated with `A`, // which is sufficient in size and alignment for storing a `T`. diff --git a/rust/kernel/alloc/kvec.rs b/rust/kernel/alloc/kvec.rs index 3c72e0bdddb8..92d0ed3f302e 100644 --- a/rust/kernel/alloc/kvec.rs +++ b/rust/kernel/alloc/kvec.rs @@ -5,7 +5,7 @@ use super::{ allocator::{KVmalloc, Kmalloc, Vmalloc}, layout::ArrayLayout, - AllocError, Allocator, Box, Flags, + AllocError, Allocator, Box, Flags, NumaNode, }; use core::{ borrow::{Borrow, BorrowMut}, @@ -634,6 +634,7 @@ where layout.into(), self.layout.into(), flags, + NumaNode::NO_NODE, )? }; @@ -1111,7 +1112,13 @@ where // the type invariant to be smaller than `cap`. Depending on `realloc` this operation // may shrink the buffer or leave it as it is. ptr = match unsafe { - A::realloc(Some(buf.cast()), layout.into(), old_layout.into(), flags) + A::realloc( + Some(buf.cast()), + layout.into(), + old_layout.into(), + flags, + NumaNode::NO_NODE, + ) } { // If we fail to shrink, which likely can't even happen, continue with the existing // buffer. From 1738796994a439b0ea796847e3ceb8688dacd93d Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Wed, 6 Aug 2025 14:55:52 +0200 Subject: [PATCH 019/372] rust: support large alignments in allocations Add support for large (> PAGE_SIZE) alignments in Rust allocators. All the preparations on the C side are already done, we just need to add bindings for _node_align() functions and start using those. 
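With the TODO guards gone, the Rust Vmalloc/KVmalloc backends can
forward alignments above PAGE_SIZE straight to the C primitives, e.g.
(a sketch, 2 MiB chosen only as an example of a large alignment):

	#include <linux/sizes.h>

	/* Previously rejected with a pr_warn() on the Rust side. */
	p = vrealloc_node_align(NULL, SZ_2M, SZ_2M, GFP_KERNEL, NUMA_NO_NODE);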
Link: https://lkml.kernel.org/r/20250806125552.1727073-1-vitaly.wool@konsulko.se Signed-off-by: Vitaly Wool Acked-by: Danilo Krummrich Acked-by: Alice Ryhl Cc: Herbert Xu Cc: Jann Horn Cc: Kent Overstreet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/helpers/slab.c | 10 ++++++---- rust/helpers/vmalloc.c | 5 +++-- rust/kernel/alloc/allocator.rs | 30 +++++++++--------------------- 3 files changed, 18 insertions(+), 27 deletions(-) diff --git a/rust/helpers/slab.c b/rust/helpers/slab.c index 8472370a4338..7fac958907b0 100644 --- a/rust/helpers/slab.c +++ b/rust/helpers/slab.c @@ -3,13 +3,15 @@ #include void * __must_check __realloc_size(2) -rust_helper_krealloc_node(const void *objp, size_t new_size, gfp_t flags, int node) +rust_helper_krealloc_node_align(const void *objp, size_t new_size, unsigned long align, + gfp_t flags, int node) { - return krealloc_node(objp, new_size, flags, node); + return krealloc_node_align(objp, new_size, align, flags, node); } void * __must_check __realloc_size(2) -rust_helper_kvrealloc_node(const void *p, size_t size, gfp_t flags, int node) +rust_helper_kvrealloc_node_align(const void *p, size_t size, unsigned long align, + gfp_t flags, int node) { - return kvrealloc_node(p, size, flags, node); + return kvrealloc_node_align(p, size, align, flags, node); } diff --git a/rust/helpers/vmalloc.c b/rust/helpers/vmalloc.c index 62d30db9a1a6..7d7f7336b3d2 100644 --- a/rust/helpers/vmalloc.c +++ b/rust/helpers/vmalloc.c @@ -3,7 +3,8 @@ #include void * __must_check __realloc_size(2) -rust_helper_vrealloc_node(const void *p, size_t size, gfp_t flags, int node) +rust_helper_vrealloc_node_align(const void *p, size_t size, unsigned long align, + gfp_t flags, int node) { - return vrealloc_node(p, size, flags, node); + return vrealloc_node_align(p, size, align, flags, node); } diff --git a/rust/kernel/alloc/allocator.rs b/rust/kernel/alloc/allocator.rs index 14510a9e4502..f4ae0cf0a594 100644 --- a/rust/kernel/alloc/allocator.rs +++ b/rust/kernel/alloc/allocator.rs @@ -15,7 +15,6 @@ use core::ptr::NonNull; use crate::alloc::{AllocError, Allocator, NumaNode}; use crate::bindings; -use crate::pr_warn; /// The contiguous kernel allocator. /// @@ -45,25 +44,26 @@ pub struct KVmalloc; /// # Invariants /// -/// One of the following: `krealloc_node`, `vrealloc_node`, `kvrealloc_node`. +/// One of the following: `krealloc_node_align`, `vrealloc_node_align`, `kvrealloc_node_align`. struct ReallocFunc( unsafe extern "C" fn( *const crate::ffi::c_void, usize, + crate::ffi::c_ulong, u32, crate::ffi::c_int, ) -> *mut crate::ffi::c_void, ); impl ReallocFunc { - // INVARIANT: `krealloc_node` satisfies the type invariants. - const KREALLOC: Self = Self(bindings::krealloc_node); + // INVARIANT: `krealloc_node_align` satisfies the type invariants. + const KREALLOC: Self = Self(bindings::krealloc_node_align); - // INVARIANT: `vrealloc_node` satisfies the type invariants. - const VREALLOC: Self = Self(bindings::vrealloc_node); + // INVARIANT: `vrealloc_node_align` satisfies the type invariants. + const VREALLOC: Self = Self(bindings::vrealloc_node_align); - // INVARIANT: `kvrealloc_node` satisfies the type invariants. - const KVREALLOC: Self = Self(bindings::kvrealloc_node); + // INVARIANT: `kvrealloc_node_align` satisfies the type invariants. + const KVREALLOC: Self = Self(bindings::kvrealloc_node_align); /// # Safety /// @@ -105,7 +105,7 @@ impl ReallocFunc { // - Those functions provide the guarantees of this function. 
let raw_ptr = unsafe { // If `size == 0` and `ptr != NULL` the memory behind the pointer is freed. - self.0(ptr.cast(), size, flags.0, nid.0).cast() + self.0(ptr.cast(), size, layout.align(), flags.0, nid.0).cast() }; let ptr = if size == 0 { @@ -162,12 +162,6 @@ unsafe impl Allocator for Vmalloc { flags: Flags, nid: NumaNode, ) -> Result, AllocError> { - // TODO: Support alignments larger than PAGE_SIZE. - if layout.align() > bindings::PAGE_SIZE { - pr_warn!("Vmalloc does not support alignments larger than PAGE_SIZE yet.\n"); - return Err(AllocError); - } - // SAFETY: If not `None`, `ptr` is guaranteed to point to valid memory, which was previously // allocated with this `Allocator`. unsafe { ReallocFunc::VREALLOC.call(ptr, layout, old_layout, flags, nid) } @@ -191,12 +185,6 @@ unsafe impl Allocator for KVmalloc { // compatible layout. let layout = Kmalloc::aligned_layout(layout); - // TODO: Support alignments larger than PAGE_SIZE. - if layout.align() > bindings::PAGE_SIZE { - pr_warn!("KVmalloc does not support alignments larger than PAGE_SIZE yet.\n"); - return Err(AllocError); - } - // SAFETY: If not `None`, `ptr` is guaranteed to point to valid memory, which was previously // allocated with this `Allocator`. unsafe { ReallocFunc::KVREALLOC.call(ptr, layout, old_layout, flags, nid) } From 1097a3d456ae11a09a8c624f80505150e1f112ae Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 4 Aug 2025 14:51:17 +0000 Subject: [PATCH 020/372] mm/nommu: convert kobjsize() to folios Simple folio conversion to remove a user of PageSlab() and PageCompound(). Link: https://lkml.kernel.org/r/20250804145117.3857308-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: SeongJae Park Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Sidhartha Kumar Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/nommu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 6fff462f90d3..c3a23b082adb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -64,7 +64,7 @@ const struct vm_operations_struct generic_file_vm_ops = { */ unsigned int kobjsize(const void *objp) { - struct page *page; + struct folio *folio; /* * If the object we have should not have ksize performed on it, @@ -73,22 +73,22 @@ unsigned int kobjsize(const void *objp) if (!objp || !virt_addr_valid(objp)) return 0; - page = virt_to_head_page(objp); + folio = virt_to_folio(objp); /* * If the allocator sets PageSlab, we know the pointer came from * kmalloc(). */ - if (PageSlab(page)) + if (folio_test_slab(folio)) return ksize(objp); /* - * If it's not a compound page, see if we have a matching VMA + * If it's not a large folio, see if we have a matching VMA * region. This test is intentionally done in reverse order, * so if there's no VMA, we still fall through and hand back - * PAGE_SIZE for 0-order pages. + * PAGE_SIZE for 0-order folios. */ - if (!PageCompound(page)) { + if (!folio_test_large(folio)) { struct vm_area_struct *vma; vma = find_vma(current->mm, (unsigned long)objp); @@ -100,7 +100,7 @@ unsigned int kobjsize(const void *objp) * The ksize() function is only guaranteed to work for pointers * returned by kmalloc(). So handle arbitrary pointers here. 
*/ - return page_size(page); + return folio_size(folio); } void vfree(const void *addr) From 9863124e0bfb94a0c1e84186d2f56f400b2e0112 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Mon, 4 Aug 2025 21:00:17 +0800 Subject: [PATCH 021/372] xarray: remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. Link: https://lkml.kernel.org/r/20250804130018.484321-1-rongqianfeng@vivo.com Signed-off-by: Qianfeng Rong Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- lib/xarray.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/xarray.c b/lib/xarray.c index ae3d80f4b4ee..9a8b4916540c 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -370,7 +370,7 @@ static void *xas_alloc(struct xa_state *xas, unsigned int shift) if (node) { xas->xa_alloc = NULL; } else { - gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN; + gfp_t gfp = GFP_NOWAIT; if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT) gfp |= __GFP_ACCOUNT; From 878d9e8cebbb111a678e1f83d7daee917922f700 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 4 Aug 2025 06:41:06 +0000 Subject: [PATCH 022/372] mm/rmap: do __folio_mod_stat() in __folio_add_rmap() It is required to modify folio statistic after rmap changes, so it looks reasonable to do it in __folio_add_rmap(), which is the current behavior of __folio_remove_rmap() and folio_add_new_anon_rmap(). Call __folio_mod_stat() in __folio_add_rmap(), so that rmap adjustment family shares the same pattern. Link: https://lkml.kernel.org/r/20250804064106.21269-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Rik van Riel Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Harry Yoo Signed-off-by: Andrew Morton --- mm/rmap.c | 67 +++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 568198e9efc2..84a8d8b02ef7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1241,13 +1241,35 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } -static __always_inline unsigned int __folio_add_rmap(struct folio *folio, +static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) +{ + int idx; + + if (nr) { + idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; + __lruvec_stat_mod_folio(folio, idx, nr); + } + if (nr_pmdmapped) { + if (folio_test_anon(folio)) { + idx = NR_ANON_THPS; + __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); + } else { + /* NR_*_PMDMAPPED are not maintained per-memcg */ + idx = folio_test_swapbacked(folio) ? 
+ NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; + __mod_node_page_state(folio_pgdat(folio), idx, + nr_pmdmapped); + } + } +} + +static __always_inline void __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level, int *nr_pmdmapped) + enum rmap_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; - int first = 0, nr = 0; + int first = 0, nr = 0, nr_pmdmapped = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); @@ -1283,7 +1305,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, first = atomic_inc_and_test(&folio->_entire_mapcount); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { if (level == RMAP_LEVEL_PMD && first) - *nr_pmdmapped = folio_large_nr_pages(folio); + nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_inc_return_large_mapcount(folio, vma); if (nr == 1) /* Was completely unmapped. */ @@ -1302,7 +1324,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, * folios separately. */ if (level == RMAP_LEVEL_PMD) - *nr_pmdmapped = nr_pages; + nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) @@ -1315,7 +1337,7 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio, folio_inc_large_mapcount(folio, vma); break; } - return nr; + __folio_mod_stat(folio, nr, nr_pmdmapped); } /** @@ -1403,43 +1425,19 @@ static void __page_check_anon_rmap(const struct folio *folio, page); } -static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) -{ - int idx; - - if (nr) { - idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; - __lruvec_stat_mod_folio(folio, idx, nr); - } - if (nr_pmdmapped) { - if (folio_test_anon(folio)) { - idx = NR_ANON_THPS; - __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); - } else { - /* NR_*_PMDMAPPED are not maintained per-memcg */ - idx = folio_test_swapbacked(folio) ? 
- NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; - __mod_node_page_state(folio_pgdat(folio), idx, - nr_pmdmapped); - } - } -} - static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum rmap_level level) { - int i, nr, nr_pmdmapped = 0; + int i; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); - nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped); + __folio_add_rmap(folio, page, nr_pages, vma, level); if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); - __folio_mod_stat(folio, nr, nr_pmdmapped); - if (flags & RMAP_EXCLUSIVE) { switch (level) { case RMAP_LEVEL_PTE: @@ -1613,12 +1611,9 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum rmap_level level) { - int nr, nr_pmdmapped = 0; - VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); - nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped); - __folio_mod_stat(folio, nr, nr_pmdmapped); + __folio_add_rmap(folio, page, nr_pages, vma, level); /* See comments in folio_add_anon_rmap_*() */ if (!folio_test_large(folio)) From dc32c8d4875fdd27b43d5c295853860f4d647055 Mon Sep 17 00:00:00 2001 From: Bijan Tabatabai Date: Wed, 6 Aug 2025 18:42:54 -0500 Subject: [PATCH 023/372] mm/damon/core: skip needless update of damon_attrs in damon_commit_ctx() Currently, damon_commit_ctx() always calls damon_set_attrs() even if the attributes have not been changed. This can be problematic when the DAMON state is committed relatively frequently because damon_set_attrs() resets ctx->next_{aggregation,ops_update}_sis, causing aggregation and ops update operations to be needlessly delayed. This patch avoids this by only calling damon_set_attrs() in damon_commit_ctx when the attributes have been changed. [akpm@linux-foundation.org: Link: https://lkml.kernel.org/r/20250807001924.76275-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250806234254.10572-1-bijan311@gmail.com Signed-off-by: Bijan Tabatabai Reviewed-by: SeongJae Park Cc: Bijan Tabatabai Signed-off-by: Andrew Morton --- mm/damon/core.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 106ee8b0f2d5..52ecc3a4426f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -570,6 +570,23 @@ void damon_destroy_ctx(struct damon_ctx *ctx) kfree(ctx); } +static bool damon_attrs_equals(const struct damon_attrs *attrs1, + const struct damon_attrs *attrs2) +{ + const struct damon_intervals_goal *ig1 = &attrs1->intervals_goal; + const struct damon_intervals_goal *ig2 = &attrs2->intervals_goal; + + return attrs1->sample_interval == attrs2->sample_interval && + attrs1->aggr_interval == attrs2->aggr_interval && + attrs1->ops_update_interval == attrs2->ops_update_interval && + attrs1->min_nr_regions == attrs2->min_nr_regions && + attrs1->max_nr_regions == attrs2->max_nr_regions && + ig1->access_bp == ig2->access_bp && + ig1->aggrs == ig2->aggrs && + ig1->min_sample_us == ig2->min_sample_us && + ig1->max_sample_us == ig2->max_sample_us; +} + static unsigned int damon_age_for_new_attrs(unsigned int age, struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) { @@ -1222,9 +1239,11 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) * 2. ops update should be done after pid handling is done (target * committing require putting pids). 
*/ - err = damon_set_attrs(dst, &src->attrs); - if (err) - return err; + if (!damon_attrs_equals(&dst->attrs, &src->attrs)) { + err = damon_set_attrs(dst, &src->attrs); + if (err) + return err; + } dst->ops = src->ops; return 0; From 408b299a62ec207fa5f213d7044245752eb50dab Mon Sep 17 00:00:00 2001 From: Yueyang Pan Date: Sat, 2 Aug 2025 11:52:45 +0000 Subject: [PATCH 024/372] mm/damon/paddr: move filters existence check function to ops-common Patch series "mm/damon/vaddr: support stat-purpose DAMOS filters", v4. Extend DAMOS_STAT handling of the DAMON operations sets for virtual address spaces for ops-level DAMOS filters. Functionality Test ================== I wrote a small test program which allocates 10GB of DRAM, use madvise(MADV_HUGEPAGE) to convert the base pages to 2MB huge pages Then my program does the following things in order: 1. Write sequentially to the whole 10GB region 2. Read the first 5GB region sequentially for 10 times 3. Sleep 5s 4. Read the second 5GB region sequentially for 10 times With a proper damon setting, we are expected to see df-passed to be 10GB and hot region move around with the read $ # Start DAMON $ sudo ./damo/damo start "./my_test/test" --monitoring_intervals 100ms\ 1s 2s $ # Show DAMON-generated access pattern snapshot $ sudo ./damo/damo report access --snapshot_damos_filter allow \ hugepage_size 2MiB 2MiB heatmap: # min/max temperatures: -600,000,000, 100,001,000, column size: 137.352 MiB intervals: sample 100 ms aggr 1 s (max access hz 10) # damos filters (df): reject none hugepage_size [2.000 MiB, 2.000 MiB] df-pass: # min/max temperatures: -400,000,000, 100,001,000, column size: 128.031 MiB 0 addr 85.373 TiB size 745.555 MiB access 0 hz age 6 s df-passed 0 B 1 addr 127.608 TiB size 877.664 MiB access 3.000 hz age 0 ns df-passed 878.000 MiB 2 addr 127.609 TiB size 219.418 MiB access 2.000 hz age 0 ns df-passed 220.000 MiB 3 addr 127.609 TiB size 316.613 MiB access 1.000 hz age 1 s df-passed 316.000 MiB 4 addr 127.609 TiB size 474.922 MiB access 1.000 hz age 1 s df-passed 476.000 MiB 5 addr 127.610 TiB size 407.188 MiB access 1.000 hz age 0 ns df-passed 406.000 MiB 6 addr 127.610 TiB size 610.781 MiB access 1.000 hz age 0 ns df-passed 612.000 MiB 7 addr 127.611 TiB size 697.309 MiB access 0 hz age 0 ns df-passed 696.000 MiB 8 addr 127.611 TiB size 77.480 MiB access 1.000 hz age 0 ns df-passed 78.000 MiB 9 addr 127.611 TiB size 573.102 MiB access 1.000 hz age 0 ns df-passed 574.000 MiB 10 addr 127.612 TiB size 245.617 MiB access 2.000 hz age 0 ns df-passed 246.000 MiB 11 addr 127.612 TiB size 295.102 MiB access 1.000 hz age 1 s df-passed 294.000 MiB 12 addr 127.612 TiB size 295.105 MiB access 1.000 hz age 1 s df-passed 296.000 MiB 13 addr 127.613 TiB size 67.172 MiB access 1.000 hz age 1 s df-passed 66.000 MiB 14 addr 127.613 TiB size 604.570 MiB access 0 hz age 1 s df-passed 606.000 MiB 15 addr 127.613 TiB size 389.578 MiB access 0 hz age 4 s df-passed 388.000 MiB 16 addr 127.614 TiB size 259.719 MiB access 0 hz age 4 s df-passed 260.000 MiB 17 addr 127.614 TiB size 817.941 MiB access 0 hz age 4 s df-passed 818.000 MiB 18 addr 127.615 TiB size 204.488 MiB access 0 hz age 4 s df-passed 204.000 MiB 19 addr 127.615 TiB size 730.902 MiB access 0 hz age 4 s df-passed 732.000 MiB 20 addr 127.616 TiB size 182.727 MiB access 0 hz age 4 s df-passed 182.000 MiB 21 addr 127.616 TiB size 926.824 MiB access 0 hz age 2 s df-passed 928.000 MiB 22 addr 127.617 TiB size 102.984 MiB access 0 hz age 2 s df-passed 102.000 MiB 23 addr 127.617 TiB size 86.527 MiB access 
0 hz age 2 s df-passed 86.000 MiB 24 addr 127.617 TiB size 778.777 MiB access 0 hz age 2 s df-passed 776.000 MiB 25 addr 127.999 TiB size 132.000 KiB access 0 hz age 6 s df-passed 0 B memory bw estimate: 6.524 GiB per second df-passed: 6.527 GiB per second total size: 10.731 GiB df-passed 10.000 GiB record DAMON intervals: sample 100 ms, aggr 1 s $ # Show DAMON-generated access pattern snapshot again $ sudo ./damo/damo report access --snapshot_damos_filter allow \ hugepage_size 2MiB 2MiB heatmap: # min/max temperatures: -1,100,000,000, 2,000, column size: 137.352 MiB intervals: sample 100 ms aggr 1 s (max access hz 10) # damos filters (df): reject none hugepage_size [2.000 MiB, 2.000 MiB] df-pass: # min/max temperatures: -900,000,000, 2,000, column size: 128.031 MiB 0 addr 85.373 TiB size 745.555 MiB access 0 hz age 11 s df-passed 0 B 1 addr 127.608 TiB size 579.715 MiB access 2.000 hz age 0 ns df-passed 580.000 MiB 2 addr 127.608 TiB size 144.930 MiB access 2.000 hz age 0 ns df-passed 146.000 MiB 3 addr 127.608 TiB size 452.453 MiB access 2.000 hz age 0 ns df-passed 452.000 MiB 4 addr 127.609 TiB size 113.117 MiB access 1.000 hz age 0 ns df-passed 114.000 MiB 5 addr 127.609 TiB size 182.367 MiB access 2.000 hz age 0 ns df-passed 182.000 MiB 6 addr 127.609 TiB size 182.371 MiB access 2.000 hz age 0 ns df-passed 182.000 MiB 7 addr 127.609 TiB size 350.488 MiB access 1.000 hz age 0 ns df-passed 350.000 MiB 8 addr 127.610 TiB size 525.738 MiB access 1.000 hz age 0 ns df-passed 526.000 MiB 9 addr 127.610 TiB size 401.352 MiB access 1.000 hz age 0 ns df-passed 402.000 MiB 10 addr 127.611 TiB size 100.340 MiB access 1.000 hz age 0 ns df-passed 100.000 MiB 11 addr 127.611 TiB size 19.523 MiB access 0 hz age 0 ns df-passed 20.000 MiB 12 addr 127.611 TiB size 175.727 MiB access 0 hz age 0 ns df-passed 176.000 MiB 13 addr 127.611 TiB size 106.629 MiB access 0 hz age 0 ns df-passed 106.000 MiB 14 addr 127.611 TiB size 959.676 MiB access 0 hz age 0 ns df-passed 960.000 MiB 15 addr 127.612 TiB size 424.469 MiB access 1.000 hz age 0 ns df-passed 424.000 MiB 16 addr 127.612 TiB size 424.469 MiB access 1.000 hz age 0 ns df-passed 424.000 MiB 17 addr 127.613 TiB size 201.648 MiB access 0 hz age 6 s df-passed 202.000 MiB 18 addr 127.613 TiB size 806.609 MiB access 0 hz age 6 s df-passed 806.000 MiB 19 addr 127.614 TiB size 862.125 MiB access 0 hz age 9 s df-passed 862.000 MiB 20 addr 127.614 TiB size 215.535 MiB access 0 hz age 9 s df-passed 216.000 MiB 21 addr 127.615 TiB size 104.500 MiB access 0 hz age 9 s df-passed 104.000 MiB 22 addr 127.615 TiB size 940.523 MiB access 0 hz age 9 s df-passed 942.000 MiB 23 addr 127.616 TiB size 640.281 MiB access 0 hz age 7 s df-passed 640.000 MiB 24 addr 127.616 TiB size 426.855 MiB access 0 hz age 7 s df-passed 426.000 MiB 25 addr 127.617 TiB size 90.105 MiB access 0 hz age 7 s df-passed 90.000 MiB 26 addr 127.617 TiB size 810.965 MiB access 0 hz age 7 s df-passed 808.000 MiB 27 addr 127.999 TiB size 132.000 KiB access 0 hz age 11 s df-passed 0 B memory bw estimate: 5.297 GiB per second df-passed: 5.297 GiB per second total size: 10.731 GiB df-passed 10.000 GiB record DAMON intervals: sample 100 ms, aggr 1 s As you can see the total df-passed region is 10GiB and the hot region moves as the seq read keeps going This patch (of 2): This patch moves damon_pa_scheme_has_filter to ops-common. renaming to damos_ops_has_filter. Doing so allows us to reuse its logic in the vaddr version of DAMOS_STAT. 
Link: https://lkml.kernel.org/r/cover.1754135312.git.pyyjason@gmail.com Link: https://lkml.kernel.org/r/cbe01740f7ac5ac7c9fd1ca367d297c3d7f2a69d.1754135312.git.pyyjason@gmail.com Signed-off-by: Yueyang Pan Reviewed-by: SeongJae Park Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 9 +++++++++ mm/damon/ops-common.h | 2 ++ mm/damon/paddr.c | 11 +---------- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 99321ff5cb92..2e3409a6c8a4 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -412,3 +412,12 @@ unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid) return nr_migrated; } + +bool damos_ops_has_filter(struct damos *s) +{ + struct damos_filter *f; + + damos_for_each_ops_filter(f, s) + return true; + return false; +} diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 61ad54aaf256..5efa5b5970de 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -21,3 +21,5 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, bool damos_folio_filter_match(struct damos_filter *filter, struct folio *folio); unsigned long damon_migrate_pages(struct list_head *folio_list, int target_nid); + +bool damos_ops_has_filter(struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 53a55c5114fb..0b67d9321460 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -262,22 +262,13 @@ put_folio: return applied * PAGE_SIZE; } -static bool damon_pa_scheme_has_filter(struct damos *s) -{ - struct damos_filter *f; - - damos_for_each_ops_filter(f, s) - return true; - return false; -} - static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, unsigned long *sz_filter_passed) { unsigned long addr; struct folio *folio; - if (!damon_pa_scheme_has_filter(s)) + if (!damos_ops_has_filter(s)) return 0; addr = r->ar.start; From 63f39737d1e3cfb3a2173fb4c56a64b4d8ce6fa8 Mon Sep 17 00:00:00 2001 From: Yueyang Pan Date: Sat, 2 Aug 2025 11:52:46 +0000 Subject: [PATCH 025/372] mm/damon/vaddr: support stat-purpose DAMOS filters This patch extends DAMOS_STAT handling of the DAMON operations sets for virtual address spaces for ops-level DAMOS filters. It leverages the walk_page_range to walk the page table and gets the folio from page table. The last folio scanned is stored in damos->last_applied to prevent double counting. 
Link: https://lkml.kernel.org/r/264a4b5ea202fd73c01b349c9694d8bf9978c441.1754135312.git.pyyjason@gmail.com Signed-off-by: Yueyang Pan Reviewed-by: SeongJae Park Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 87e825349bdf..66ef9869eafe 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -890,6 +890,107 @@ free_lists: return applied * PAGE_SIZE; } +struct damos_va_stat_private { + struct damos *scheme; + unsigned long *sz_filter_passed; +}; + +static inline bool damos_va_invalid_folio(struct folio *folio, + struct damos *s) +{ + return !folio || folio == s->last_applied; +} + +static int damos_va_stat_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct damos_va_stat_private *priv = walk->private; + struct damos *s = priv->scheme; + unsigned long *sz_filter_passed = priv->sz_filter_passed; + struct vm_area_struct *vma = walk->vma; + struct folio *folio; + spinlock_t *ptl; + pte_t *start_pte, *pte, ptent; + int nr; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (pmd_trans_huge(*pmd)) { + pmd_t pmde; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return 0; + pmde = pmdp_get(pmd); + if (!pmd_present(pmde)) + goto huge_unlock; + + folio = vm_normal_folio_pmd(vma, addr, pmde); + + if (damos_va_invalid_folio(folio, s)) + goto huge_unlock; + + if (!damos_va_filter_out(s, folio, vma, addr, NULL, pmd)) + *sz_filter_passed += folio_size(folio); + s->last_applied = folio; + +huge_unlock: + spin_unlock(ptl); + return 0; + } +#endif + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (!start_pte) + return 0; + + for (; addr < next; pte += nr, addr += nr * PAGE_SIZE) { + nr = 1; + ptent = ptep_get(pte); + + if (pte_none(ptent) || !pte_present(ptent)) + continue; + + folio = vm_normal_folio(vma, addr, ptent); + + if (damos_va_invalid_folio(folio, s)) + continue; + + if (!damos_va_filter_out(s, folio, vma, addr, pte, NULL)) + *sz_filter_passed += folio_size(folio); + nr = folio_nr_pages(folio); + s->last_applied = folio; + } + pte_unmap_unlock(start_pte, ptl); + return 0; +} + +static unsigned long damos_va_stat(struct damon_target *target, + struct damon_region *r, struct damos *s, + unsigned long *sz_filter_passed) +{ + struct damos_va_stat_private priv; + struct mm_struct *mm; + struct mm_walk_ops walk_ops = { + .pmd_entry = damos_va_stat_pmd_entry, + .walk_lock = PGWALK_RDLOCK, + }; + + priv.scheme = s; + priv.sz_filter_passed = sz_filter_passed; + + if (!damos_ops_has_filter(s)) + return 0; + + mm = damon_get_mm(target); + if (!mm) + return 0; + + mmap_read_lock(mm); + walk_page_range(mm, r->ar.start, r->ar.end, &walk_ops, &priv); + mmap_read_unlock(mm); + mmput(mm); + return 0; +} + static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed) @@ -916,7 +1017,7 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_MIGRATE_COLD: return damos_va_migrate(t, r, scheme, sz_filter_passed); case DAMOS_STAT: - return 0; + return damos_va_stat(t, r, scheme, sz_filter_passed); default: /* * DAMOS actions that are not yet supported by 'vaddr'. 
From 41f105581680c821cd0081c91805c0420cc1e0e4 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 8 Aug 2025 08:28:47 -0700 Subject: [PATCH 026/372] selftests/proc: test PROCMAP_QUERY ioctl while vma is concurrently modified MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "execute PROCMAP_QUERY ioctl under per-vma lock", v4. With /proc/pid/maps now being read under per-vma lock protection we can reuse parts of that code to execute PROCMAP_QUERY ioctl also without taking mmap_lock. The change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates. This patchset was split out of the original patchset [1] that introduced per-vma lock usage for /proc/pid/maps reading. It contains PROCMAP_QUERY tests, a code refactoring patch to simplify the main change, and the actual transition to per-vma lock. This patch (of 3): Extend /proc/pid/maps tearing tests to verify PROCMAP_QUERY ioctl operation correctness while the vma is being concurrently modified. Link: https://lkml.kernel.org/r/20250808152850.2580887-1-surenb@google.com Link: https://lkml.kernel.org/r/20250808152850.2580887-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: SeongJae Park Acked-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Cc: Alexey Dobriyan Cc: Andrii Nakryiko Cc: Christian Brauner Cc: Christophe Leroy Cc: David Hildenbrand Cc: Jann Horn Cc: Johannes Weiner Cc: Josef Bacik Cc: Kalesh Singh Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: "Paul E. McKenney" Cc: Peter Xu Cc: Ryan Roberts Cc: Shuah Khan Cc: Thomas Weißschuh Cc: T.J. Mercier Cc: Vlastimil Babka Cc: Ye Bin Signed-off-by: Andrew Morton --- tools/testing/selftests/proc/proc-maps-race.c | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tools/testing/selftests/proc/proc-maps-race.c b/tools/testing/selftests/proc/proc-maps-race.c index 94bba4553130..a546475db550 100644 --- a/tools/testing/selftests/proc/proc-maps-race.c +++ b/tools/testing/selftests/proc/proc-maps-race.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -317,6 +319,25 @@ static bool capture_mod_pattern(FIXTURE_DATA(proc_maps_race) *self, strcmp(restored_first_line->text, self->first_line.text) == 0; } +static bool query_addr_at(int maps_fd, void *addr, + unsigned long *vma_start, unsigned long *vma_end) +{ + struct procmap_query q; + + memset(&q, 0, sizeof(q)); + q.size = sizeof(q); + /* Find the VMA at the given address */ + q.query_addr = (unsigned long long)addr; + q.query_flags = 0; + if (ioctl(maps_fd, PROCMAP_QUERY, &q)) + return false; + + *vma_start = q.vma_start; + *vma_end = q.vma_end; + + return true; +} + static inline bool split_vma(FIXTURE_DATA(proc_maps_race) *self) { return mmap(self->mod_info->addr, self->page_size, self->mod_info->prot | PROT_EXEC, @@ -559,6 +580,8 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split) do { bool last_line_changed; bool first_line_changed; + unsigned long vma_start; + unsigned long vma_end; ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line)); @@ -595,6 +618,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_split) first_line_changed = strcmp(new_first_line.text, self->first_line.text) != 0; ASSERT_EQ(last_line_changed, first_line_changed); + /* Check if PROCMAP_QUERY ioctl() finds the right VMA */ + ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, + &vma_start,
&vma_end)); + /* + * The vma at the split address can be either the same as + * original one (if read before the split) or the same as the + * first line in the second page (if read after the split). + */ + ASSERT_TRUE((vma_start == self->last_line.start_addr && + vma_end == self->last_line.end_addr) || + (vma_start == split_first_line.start_addr && + vma_end == split_first_line.end_addr)); + clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts); end_test_iteration(&end_ts, self->verbose); } while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec); @@ -636,6 +672,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize) clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts); start_test_loop(&start_ts, self->verbose); do { + unsigned long vma_start; + unsigned long vma_end; + ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line)); /* Check if we read vmas after shrinking it */ @@ -662,6 +701,16 @@ TEST_F(proc_maps_race, test_maps_tearing_from_resize) "Expand result invalid", self)); } + /* Check if PROCMAP_QUERY ioctl() finds the right VMA */ + ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr, &vma_start, &vma_end)); + /* + * The vma should stay at the same address and have either the + * original size of 3 pages or 1 page if read after shrinking. + */ + ASSERT_TRUE(vma_start == self->last_line.start_addr && + (vma_end - vma_start == self->page_size * 3 || + vma_end - vma_start == self->page_size)); + clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts); end_test_iteration(&end_ts, self->verbose); } while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec); @@ -703,6 +752,9 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap) clock_gettime(CLOCK_MONOTONIC_COARSE, &start_ts); start_test_loop(&start_ts, self->verbose); do { + unsigned long vma_start; + unsigned long vma_end; + ASSERT_TRUE(read_boundary_lines(self, &new_last_line, &new_first_line)); /* Check if we read vmas after remapping it */ @@ -729,6 +781,19 @@ TEST_F(proc_maps_race, test_maps_tearing_from_remap) "Remap restore result invalid", self)); } + /* Check if PROCMAP_QUERY ioctl() finds the right VMA */ + ASSERT_TRUE(query_addr_at(self->maps_fd, mod_info->addr + self->page_size, + &vma_start, &vma_end)); + /* + * The vma should either stay at the same address and have the + * original size of 3 pages or we should find the remapped vma + * at the remap destination address with size of 1 page. + */ + ASSERT_TRUE((vma_start == self->last_line.start_addr && + vma_end - vma_start == self->page_size * 3) || + (vma_start == self->last_line.start_addr + self->page_size && + vma_end - vma_start == self->page_size)); + clock_gettime(CLOCK_MONOTONIC_COARSE, &end_ts); end_test_iteration(&end_ts, self->verbose); } while (end_ts.tv_sec - start_ts.tv_sec < self->duration_sec); From ee737a5a102c87cba43514483fcaa6fa75f866b8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 8 Aug 2025 08:28:48 -0700 Subject: [PATCH 027/372] fs/proc/task_mmu: factor out proc_maps_private fields used by PROCMAP_QUERY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor struct proc_maps_private so that the fields used by the PROCMAP_QUERY ioctl are moved into a separate structure. In the next patch, this allows the ioctl to reuse some of the functions used for reading /proc/pid/maps without using file->private_data. This prevents concurrent modification of file->private_data members by the ioctl and /proc/pid/maps readers. The change is pure code refactoring and has no functional changes.
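Concretely, the locking-related state is pulled out of struct proc_maps_private into its own structure (a simplified view of the change below):

	struct proc_maps_locking_ctx {
		struct mm_struct *mm;
	#ifdef CONFIG_PER_VMA_LOCK
		bool mmap_locked;
		struct vm_area_struct *locked_vma;
	#endif
	};

	struct proc_maps_private {
		struct inode *inode;
		struct task_struct *task;
		struct vma_iterator iter;
		loff_t last_pos;
		struct proc_maps_locking_ctx lock_ctx;
		/* ... */
	};

With this split, the ioctl path in the next patch can operate on a stack-local proc_maps_locking_ctx instead of reaching into file->private_data.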
Link: https://lkml.kernel.org/r/20250808152850.2580887-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Acked-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Cc: Alexey Dobriyan Cc: Andrii Nakryiko Cc: Christian Brauner Cc: Christophe Leroy Cc: David Hildenbrand Cc: Jann Horn Cc: Johannes Weiner Cc: Josef Bacik Cc: Kalesh Singh Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: "Paul E. McKenney" Cc: Peter Xu Cc: Ryan Roberts Cc: Shuah Khan Cc: Thomas Weißschuh Cc: T.J. Mercier Cc: Ye Bin Signed-off-by: Andrew Morton --- fs/proc/internal.h | 15 +++++--- fs/proc/task_mmu.c | 87 +++++++++++++++++++++++--------------------- fs/proc/task_nommu.c | 14 +++---- 3 files changed, 63 insertions(+), 53 deletions(-) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index e737401d7383..d1598576506c 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -378,16 +378,21 @@ extern void proc_self_init(void); * task_[no]mmu.c */ struct mem_size_stats; -struct proc_maps_private { - struct inode *inode; - struct task_struct *task; + +struct proc_maps_locking_ctx { struct mm_struct *mm; - struct vma_iterator iter; - loff_t last_pos; #ifdef CONFIG_PER_VMA_LOCK bool mmap_locked; struct vm_area_struct *locked_vma; #endif +}; + +struct proc_maps_private { + struct inode *inode; + struct task_struct *task; + struct vma_iterator iter; + loff_t last_pos; + struct proc_maps_locking_ctx lock_ctx; #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 29cca0e6d0ff..c0968d293b61 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -132,18 +132,18 @@ static void release_task_mempolicy(struct proc_maps_private *priv) #ifdef CONFIG_PER_VMA_LOCK -static void unlock_vma(struct proc_maps_private *priv) +static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx) { - if (priv->locked_vma) { - vma_end_read(priv->locked_vma); - priv->locked_vma = NULL; + if (lock_ctx->locked_vma) { + vma_end_read(lock_ctx->locked_vma); + lock_ctx->locked_vma = NULL; } } static const struct seq_operations proc_pid_maps_op; static inline bool lock_vma_range(struct seq_file *m, - struct proc_maps_private *priv) + struct proc_maps_locking_ctx *lock_ctx) { /* * smaps and numa_maps perform page table walk, therefore require @@ -151,25 +151,25 @@ static inline bool lock_vma_range(struct seq_file *m, * walking the vma tree under rcu read protection.
*/ if (m->op != &proc_pid_maps_op) { - if (mmap_read_lock_killable(priv->mm)) + if (mmap_read_lock_killable(lock_ctx->mm)) return false; - priv->mmap_locked = true; + lock_ctx->mmap_locked = true; } else { rcu_read_lock(); - priv->locked_vma = NULL; - priv->mmap_locked = false; + lock_ctx->locked_vma = NULL; + lock_ctx->mmap_locked = false; } return true; } -static inline void unlock_vma_range(struct proc_maps_private *priv) +static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) { - if (priv->mmap_locked) { - mmap_read_unlock(priv->mm); + if (lock_ctx->mmap_locked) { + mmap_read_unlock(lock_ctx->mm); } else { - unlock_vma(priv); + unlock_ctx_vma(lock_ctx); rcu_read_unlock(); } } @@ -177,15 +177,16 @@ static inline void unlock_vma_range(struct proc_maps_private *priv) static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, loff_t last_pos) { + struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx; struct vm_area_struct *vma; - if (priv->mmap_locked) + if (lock_ctx->mmap_locked) return vma_next(&priv->iter); - unlock_vma(priv); - vma = lock_next_vma(priv->mm, &priv->iter, last_pos); + unlock_ctx_vma(lock_ctx); + vma = lock_next_vma(lock_ctx->mm, &priv->iter, last_pos); if (!IS_ERR_OR_NULL(vma)) - priv->locked_vma = vma; + lock_ctx->locked_vma = vma; return vma; } @@ -193,14 +194,16 @@ static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, loff_t pos) { - if (priv->mmap_locked) + struct proc_maps_locking_ctx *lock_ctx = &priv->lock_ctx; + + if (lock_ctx->mmap_locked) return false; rcu_read_unlock(); - mmap_read_lock(priv->mm); + mmap_read_lock(lock_ctx->mm); /* Reinitialize the iterator after taking mmap_lock */ vma_iter_set(&priv->iter, pos); - priv->mmap_locked = true; + lock_ctx->mmap_locked = true; return true; } @@ -208,14 +211,14 @@ static inline bool fallback_to_mmap_lock(struct proc_maps_private *priv, #else /* CONFIG_PER_VMA_LOCK */ static inline bool lock_vma_range(struct seq_file *m, - struct proc_maps_private *priv) + struct proc_maps_locking_ctx *lock_ctx) { - return mmap_read_lock_killable(priv->mm) == 0; + return mmap_read_lock_killable(lock_ctx->mm) == 0; } -static inline void unlock_vma_range(struct proc_maps_private *priv) +static inline void unlock_vma_range(struct proc_maps_locking_ctx *lock_ctx) { - mmap_read_unlock(priv->mm); + mmap_read_unlock(lock_ctx->mm); } static struct vm_area_struct *get_next_vma(struct proc_maps_private *priv, @@ -258,7 +261,7 @@ retry: *ppos = vma->vm_end; } else { *ppos = SENTINEL_VMA_GATE; - vma = get_gate_vma(priv->mm); + vma = get_gate_vma(priv->lock_ctx.mm); } return vma; @@ -267,6 +270,7 @@ retry: static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; + struct proc_maps_locking_ctx *lock_ctx; loff_t last_addr = *ppos; struct mm_struct *mm; @@ -278,14 +282,15 @@ static void *m_start(struct seq_file *m, loff_t *ppos) if (!priv->task) return ERR_PTR(-ESRCH); - mm = priv->mm; + lock_ctx = &priv->lock_ctx; + mm = lock_ctx->mm; if (!mm || !mmget_not_zero(mm)) { put_task_struct(priv->task); priv->task = NULL; return NULL; } - if (!lock_vma_range(m, priv)) { + if (!lock_vma_range(m, lock_ctx)) { mmput(mm); put_task_struct(priv->task); priv->task = NULL; @@ -318,13 +323,13 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos) static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; - struct mm_struct *mm = priv->mm; + 
struct mm_struct *mm = priv->lock_ctx.mm; if (!priv->task) return; release_task_mempolicy(priv); - unlock_vma_range(priv); + unlock_vma_range(&priv->lock_ctx); mmput(mm); put_task_struct(priv->task); priv->task = NULL; @@ -339,9 +344,9 @@ static int proc_maps_open(struct inode *inode, struct file *file, return -ENOMEM; priv->inode = inode; - priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->lock_ctx.mm)) { + int err = PTR_ERR(priv->lock_ctx.mm); seq_release_private(inode, file); return err; @@ -355,8 +360,8 @@ static int proc_map_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; - if (priv->mm) - mmdrop(priv->mm); + if (priv->lock_ctx.mm) + mmdrop(priv->lock_ctx.mm); return seq_release_private(inode, file); } @@ -610,7 +615,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) if (!!karg.build_id_size != !!karg.build_id_addr) return -EINVAL; - mm = priv->mm; + mm = priv->lock_ctx.mm; if (!mm || !mmget_not_zero(mm)) return -ESRCH; @@ -1311,7 +1316,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss = {}; - struct mm_struct *mm = priv->mm; + struct mm_struct *mm = priv->lock_ctx.mm; struct vm_area_struct *vma; unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; @@ -1456,9 +1461,9 @@ static int smaps_rollup_open(struct inode *inode, struct file *file) goto out_free; priv->inode = inode; - priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR_OR_NULL(priv->mm)) { - ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; + priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) { + ret = priv->lock_ctx.mm ? PTR_ERR(priv->lock_ctx.mm) : -ESRCH; single_release(inode, file); goto out_free; @@ -1476,8 +1481,8 @@ static int smaps_rollup_release(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; - if (priv->mm) - mmdrop(priv->mm); + if (priv->lock_ctx.mm) + mmdrop(priv->lock_ctx.mm); kfree(priv); return single_release(inode, file); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 59bfd61d653a..d362919f4f68 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -204,7 +204,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos) if (!priv->task) return ERR_PTR(-ESRCH); - mm = priv->mm; + mm = priv->lock_ctx.mm; if (!mm || !mmget_not_zero(mm)) { put_task_struct(priv->task); priv->task = NULL; @@ -226,7 +226,7 @@ static void *m_start(struct seq_file *m, loff_t *ppos) static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; - struct mm_struct *mm = priv->mm; + struct mm_struct *mm = priv->lock_ctx.mm; if (!priv->task) return; @@ -259,9 +259,9 @@ static int maps_open(struct inode *inode, struct file *file, return -ENOMEM; priv->inode = inode; - priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR_OR_NULL(priv->mm)) { - int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; + priv->lock_ctx.mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR_OR_NULL(priv->lock_ctx.mm)) { + int err = priv->lock_ctx.mm ? 
PTR_ERR(priv->lock_ctx.mm) : -ESRCH; seq_release_private(inode, file); return err; @@ -276,8 +276,8 @@ static int map_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct proc_maps_private *priv = seq->private; - if (priv->mm) - mmdrop(priv->mm); + if (priv->lock_ctx.mm) + mmdrop(priv->lock_ctx.mm); return seq_release_private(inode, file); } From d9d1c2d81797250270babcf108d990e3a4799a9d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 8 Aug 2025 08:28:49 -0700 Subject: [PATCH 028/372] fs/proc/task_mmu: execute PROCMAP_QUERY ioctl under per-vma locks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Utilize per-vma locks to stabilize the vma after lookup without taking mmap_lock during PROCMAP_QUERY ioctl execution. If the vma lock is contended, we fall back to mmap_lock but take it only momentarily to lock the vma and release the mmap_lock. In the very unlikely case of a vm_refcnt overflow, this fallback path will fail and the ioctl is done under mmap_lock protection. This change is designed to reduce mmap_lock contention and prevent PROCMAP_QUERY ioctl calls from blocking address space updates. Link: https://lkml.kernel.org/r/20250808152850.2580887-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: SeongJae Park Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Alexey Dobriyan Cc: Andrii Nakryiko Cc: Christian Brauner Cc: Christophe Leroy Cc: David Hildenbrand Cc: Jann Horn Cc: Johannes Weiner Cc: Josef Bacik Cc: Kalesh Singh Cc: Liam Howlett Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Oscar Salvador Cc: "Paul E. McKenney" Cc: Peter Xu Cc: Ryan Roberts Cc: Shuah Khan Cc: Thomas Weißschuh Cc: T.J. Mercier Cc: Ye Bin Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 103 +++++++++++++++++++++++++++++++++-------- 1 file changed, 85 insertions(+), 18 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c0968d293b61..e64cf40ce9c4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -132,6 +132,12 @@ static void release_task_mempolicy(struct proc_maps_private *priv) #ifdef CONFIG_PER_VMA_LOCK +static void reset_lock_ctx(struct proc_maps_locking_ctx *lock_ctx) +{ + lock_ctx->locked_vma = NULL; + lock_ctx->mmap_locked = false; +} + static void unlock_ctx_vma(struct proc_maps_locking_ctx *lock_ctx) { if (lock_ctx->locked_vma) { @@ -157,8 +163,7 @@ static inline bool lock_vma_range(struct seq_file *m, lock_ctx->mmap_locked = true; } else { rcu_read_lock(); - lock_ctx->locked_vma = NULL; - lock_ctx->mmap_locked = false; + reset_lock_ctx(lock_ctx); } return true; @@ -522,28 +527,90 @@ static int pid_maps_open(struct inode *inode, struct file *file) PROCMAP_QUERY_VMA_FLAGS \ ) -static int query_vma_setup(struct mm_struct *mm) +#ifdef CONFIG_PER_VMA_LOCK + +static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) { - return mmap_read_lock_killable(mm); + reset_lock_ctx(lock_ctx); + + return 0; } -static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma) +static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) { - mmap_read_unlock(mm); + if (lock_ctx->mmap_locked) { + mmap_read_unlock(lock_ctx->mm); + lock_ctx->mmap_locked = false; + } else { + unlock_ctx_vma(lock_ctx); + } } -static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr) +static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, + unsigned long addr) { - return find_vma(mm,
addr); + struct mm_struct *mm = lock_ctx->mm; + struct vm_area_struct *vma; + struct vma_iterator vmi; + + if (lock_ctx->mmap_locked) + return find_vma(mm, addr); + + /* Unlock previously locked VMA and find the next one under RCU */ + unlock_ctx_vma(lock_ctx); + rcu_read_lock(); + vma_iter_init(&vmi, mm, addr); + vma = lock_next_vma(mm, &vmi, addr); + rcu_read_unlock(); + + if (!vma) + return NULL; + + if (!IS_ERR(vma)) { + lock_ctx->locked_vma = vma; + return vma; + } + + if (PTR_ERR(vma) == -EAGAIN) { + /* Fallback to mmap_lock on vma->vm_refcnt overflow */ + mmap_read_lock(mm); + vma = find_vma(mm, addr); + lock_ctx->mmap_locked = true; + } + + return vma; } -static struct vm_area_struct *query_matching_vma(struct mm_struct *mm, +#else /* CONFIG_PER_VMA_LOCK */ + +static int query_vma_setup(struct proc_maps_locking_ctx *lock_ctx) +{ + return mmap_read_lock_killable(lock_ctx->mm); +} + +static void query_vma_teardown(struct proc_maps_locking_ctx *lock_ctx) +{ + mmap_read_unlock(lock_ctx->mm); +} + +static struct vm_area_struct *query_vma_find_by_addr(struct proc_maps_locking_ctx *lock_ctx, + unsigned long addr) +{ + return find_vma(lock_ctx->mm, addr); +} + +#endif /* CONFIG_PER_VMA_LOCK */ + +static struct vm_area_struct *query_matching_vma(struct proc_maps_locking_ctx *lock_ctx, unsigned long addr, u32 flags) { struct vm_area_struct *vma; next_vma: - vma = query_vma_find_by_addr(mm, addr); + vma = query_vma_find_by_addr(lock_ctx, addr); + if (IS_ERR(vma)) + return vma; + if (!vma) goto no_vma; @@ -584,11 +651,11 @@ no_vma: return ERR_PTR(-ENOENT); } -static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) +static int do_procmap_query(struct mm_struct *mm, void __user *uarg) { + struct proc_maps_locking_ctx lock_ctx = { .mm = mm }; struct procmap_query karg; struct vm_area_struct *vma; - struct mm_struct *mm; const char *name = NULL; char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; __u64 usize; @@ -615,17 +682,16 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) if (!!karg.build_id_size != !!karg.build_id_addr) return -EINVAL; - mm = priv->lock_ctx.mm; if (!mm || !mmget_not_zero(mm)) return -ESRCH; - err = query_vma_setup(mm); + err = query_vma_setup(&lock_ctx); if (err) { mmput(mm); return err; } - vma = query_matching_vma(mm, karg.query_addr, karg.query_flags); + vma = query_matching_vma(&lock_ctx, karg.query_addr, karg.query_flags); if (IS_ERR(vma)) { err = PTR_ERR(vma); vma = NULL; @@ -710,7 +776,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) } /* unlock vma or mmap_lock, and put mm_struct before copying data to user */ - query_vma_teardown(mm, vma); + query_vma_teardown(&lock_ctx); mmput(mm); if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr), @@ -730,7 +796,7 @@ static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) return 0; out: - query_vma_teardown(mm, vma); + query_vma_teardown(&lock_ctx); mmput(mm); kfree(name_buf); return err; @@ -743,7 +809,8 @@ static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned l switch (cmd) { case PROCMAP_QUERY: - return do_procmap_query(priv, (void __user *)arg); + /* priv->lock_ctx.mm is set during file open operation */ + return do_procmap_query(priv->lock_ctx.mm, (void __user *)arg); default: return -ENOIOCTLCMD; } From 3e86861d00d5cff8185a17da7e9d3b2f8fb90a99 Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Mon, 11 Aug 2025 11:42:57 +0800 Subject: [PATCH 029/372] 
mm/kasan/init.c: remove unnecessary pointer variables Simplify the code to enhance readability and maintain a consistent coding style. Link: https://lkml.kernel.org/r/20250811034257.154862-1-zhao.xichao@vivo.com Signed-off-by: Xichao Zhao Reviewed-by: Anshuman Khandual Reviewed-by: Andrey Konovalov Reviewed-by: Vishal Moola (Oracle) Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitriy Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/init.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 8fce3370c84e..f084e7a5df1e 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -266,11 +266,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start, } if (pgd_none(*pgd)) { - p4d_t *p; if (slab_is_available()) { - p = p4d_alloc(&init_mm, pgd, addr); - if (!p) + if (!p4d_alloc(&init_mm, pgd, addr)) return -ENOMEM; } else { pgd_populate_kernel(addr, pgd, From e6d7d3502e00ed6f86e03dcdb282cb7785e55448 Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Tue, 5 Aug 2025 21:39:40 +0900 Subject: [PATCH 030/372] mm/damon: update outdated description of damos_action Nowadays, DAMOS operation actions support a larger set of operations, but the comments (and thus the generated documentation) were not updated accordingly. Fix the comments to reflect the current support status. Link: https://lkml.kernel.org/r/20250805123940.13691-1-ekffu200098@gmail.com Signed-off-by: Sang-Heon Jeon Reviewed-by: SeongJae Park Cc: Honggyu Kim Signed-off-by: Andrew Morton --- include/linux/damon.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index f13664c62ddd..d01bfee80bd6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -110,7 +110,7 @@ struct damon_target { * * @DAMOS_WILLNEED: Call ``madvise()`` for the region with MADV_WILLNEED. * @DAMOS_COLD: Call ``madvise()`` for the region with MADV_COLD. - * @DAMOS_PAGEOUT: Call ``madvise()`` for the region with MADV_PAGEOUT. + * @DAMOS_PAGEOUT: Reclaim the region. * @DAMOS_HUGEPAGE: Call ``madvise()`` for the region with MADV_HUGEPAGE. * @DAMOS_NOHUGEPAGE: Call ``madvise()`` for the region with MADV_NOHUGEPAGE. * @DAMOS_LRU_PRIO: Prioritize the region on its LRU lists. @@ -121,10 +121,10 @@ struct damon_target { * @NR_DAMOS_ACTIONS: Total number of DAMOS actions * * The support of each action is up to running &struct damon_operations. - * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR supports all actions except - * &enum DAMOS_LRU_PRIO and &enum DAMOS_LRU_DEPRIO. &enum DAMON_OPS_PADDR - * supports only &enum DAMOS_PAGEOUT, &enum DAMOS_LRU_PRIO, &enum - * DAMOS_LRU_DEPRIO, and &DAMOS_STAT. + * Refer to the 'Operation Action' section of Documentation/mm/damon/design.rst + * for the support status of each action. + * + * Note that DAMOS_PAGEOUT doesn't trigger demotions.
*/ enum damos_action { DAMOS_WILLNEED, From 7bca1760cd86b9ef62d4c2baf168b68a708011bd Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Sun, 10 Aug 2025 12:25:47 -0700 Subject: [PATCH 031/372] docs/mm/damon/design: fix typo: s/sz_trtied/sz_tried/ There is a typo in the statistics section of the DAMON design docs: - sz_trtied -> sz_tried Link: https://lkml.kernel.org/r/20250729144414.31958-1-ekffu200098@gmail.com Signed-off-by: Sang-Heon Jeon Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Reviewed-by: Dev Jain Reviewed-by: Honggyu Kim Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 03f8137256f5..2f6ba5c7f4c7 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -689,7 +689,7 @@ DAMOS accounts below statistics for each scheme, from the beginning of the scheme's execution. - ``nr_tried``: Total number of regions that the scheme is tried to be applied. -- ``sz_trtied``: Total size of regions that the scheme is tried to be applied. +- ``sz_tried``: Total size of regions that the scheme is tried to be applied. - ``sz_ops_filter_passed``: Total bytes that passed operations set layer-handled DAMOS filters. - ``nr_applied``: Total number of regions that the scheme is applied. From 27763edac9288bbb35a9feecb82652de04e637fd Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 12 Aug 2025 01:20:17 +0800 Subject: [PATCH 032/372] mm/mincore, swap: consolidate swap cache checking for mincore Patch series "mm/mincore: minor clean up for swap cache checking". This series cleans up a swap cache helper that is only used by mincore, moving it back into the mincore code. It also separates the swap cache related logic from the shmem / page cache logic in mincore. With this series we have fewer lines of code and better performance. Before this series: mincore on a swapped out 16G anon mmap range: Took 488220 us mincore on 16G shmem mmap range: Took 530272 us. After this series: mincore on a swapped out 16G anon mmap range: Took 446763 us mincore on 16G shmem mmap range: Took 460496 us. About 10% faster. This patch (of 2): The filemap_get_incore_folio (previously find_get_incore_page) helper was introduced by commit 61ef18655704 ("mm: factor find_get_incore_page out of mincore_page") to be used by later commit f5df8635c5a3 ("mm: use find_get_incore_page in memcontrol"), so memory cgroup charge move code could be simplified. But commit 6b611388b626 ("memcg-v1: remove charge move code") removed that user completely; it's only used by mincore now. So this commit basically reverts commit 61ef18655704 ("mm: factor find_get_incore_page out of mincore_page"). Move it back to the mincore side to simplify the code.
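After the move, the shmem swap-entry handling inside mincore_page() looks roughly like the following sketch (simplified from the hunk below; the swapin-error check and the non-shmem path are omitted):

	if (xa_is_value(folio)) {
		swp_entry_t swp = radix_to_swp_entry(folio);

		/* Pin the swap device across the swap cache lookup. */
		si = get_swap_device(swp);
		if (!si)
			return 0;
		folio = filemap_get_folio(swap_address_space(swp),
					  swap_cache_index(swp));
		put_swap_device(si);
	}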
Link: https://lkml.kernel.org/r/20250811172018.48901-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250811172018.48901-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Nhat Pham Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jann Horn Cc: Kemeng Shi Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mincore.c | 29 +++++++++++++++++++++++++++-- mm/swap.h | 10 ---------- mm/swap_state.c | 38 -------------------------------------- 3 files changed, 27 insertions(+), 50 deletions(-) diff --git a/mm/mincore.c b/mm/mincore.c index 10dabefc3acc..20fd0967d3cb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -64,8 +64,33 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - folio = filemap_get_incore_folio(mapping, index); - if (!IS_ERR(folio)) { + if (IS_ENABLED(CONFIG_SWAP) && shmem_mapping(mapping)) { + folio = filemap_get_entry(mapping, index); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. + */ + if (xa_is_value(folio)) { + struct swap_info_struct *si; + swp_entry_t swp = radix_to_swp_entry(folio); + /* There might be swapin error entries in shmem mapping. */ + if (non_swap_entry(swp)) + return 0; + /* Prevent the swap device from being swapped off under us */ + si = get_swap_device(swp); + if (si) { + folio = filemap_get_folio(swap_address_space(swp), + swap_cache_index(swp)); + put_swap_device(si); + } else { + return 0; + } + } + } else { + folio = filemap_get_folio(mapping, index); + } + + if (!IS_ERR_OR_NULL(folio)) { present = folio_test_uptodate(folio); folio_put(folio); } diff --git a/mm/swap.h b/mm/swap.h index 911ad5ff0f89..1ae44d4193b1 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -64,9 +64,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index); - struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); @@ -178,13 +175,6 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry, return NULL; } -static inline -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index) -{ - return filemap_get_folio(mapping, index); -} - static inline void *get_shadow_from_swap_cache(swp_entry_t entry) { return NULL; } diff --git a/mm/swap_state.c b/mm/swap_state.c index c354435a0923..99513b74b5d8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -323,44 +323,6 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, return folio; } -/** - * filemap_get_incore_folio - Find and get a folio from the page or swap caches. - * @mapping: The address_space to search. - * @index: The page cache index. - * - * This differs from filemap_get_folio() in that it will also look for the - * folio in the swap cache. - * - * Return: The found folio or %NULL.
- */ -struct folio *filemap_get_incore_folio(struct address_space *mapping, - pgoff_t index) -{ - swp_entry_t swp; - struct swap_info_struct *si; - struct folio *folio = filemap_get_entry(mapping, index); - - if (!folio) - return ERR_PTR(-ENOENT); - if (!xa_is_value(folio)) - return folio; - if (!shmem_mapping(mapping)) - return ERR_PTR(-ENOENT); - - swp = radix_to_swp_entry(folio); - /* There might be swapin error entries in shmem mapping. */ - if (non_swap_entry(swp)) - return ERR_PTR(-ENOENT); - /* Prevent swapoff from happening to us */ - si = get_swap_device(swp); - if (!si) - return ERR_PTR(-ENOENT); - index = swap_cache_index(swp); - folio = filemap_get_folio(swap_address_space(swp), index); - put_swap_device(si); - return folio; -} - struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) From 1f2052755c152940c336918bd73d13d5468f548b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 12 Aug 2025 01:20:18 +0800 Subject: [PATCH 033/372] mm/mincore: use a helper for checking the swap cache Introduce a mincore_swap helper for checking swap entries. Move all swap related logic and sanity debug checks into it, and separate them from page cache checking. The performance is better after this commit. mincore_page is never called on a swap cache space now, so the logic can be simpler. The sanity check also covers more potential cases now; previously the WARN_ON only caught a potentially corrupted page table, now if shmem contains a swap entry with !CONFIG_SWAP, a WARN will be triggered. This changes the mincore value when the WARN is triggered, but this shouldn't matter. The WARN_ON means the data is already corrupted or something is very wrong, so it really should not happen. Before this series: mincore on a swapped out 16G anon mmap range: Took 488220 us mincore on 16G shmem mmap range: Took 530272 us. After this commit: mincore on a swapped out 16G anon mmap range: Took 446763 us mincore on 16G shmem mmap range: Took 460496 us. About 10% faster. Link: https://lkml.kernel.org/r/20250811172018.48901-3-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jann Horn Cc: Kemeng Shi Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mincore.c | 90 ++++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/mm/mincore.c b/mm/mincore.c index 20fd0967d3cb..2f3e1816a30d 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -47,6 +47,48 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, return 0; } +static unsigned char mincore_swap(swp_entry_t entry, bool shmem) +{ + struct swap_info_struct *si; + struct folio *folio = NULL; + unsigned char present = 0; + + if (!IS_ENABLED(CONFIG_SWAP)) { + WARN_ON(1); + return 0; + } + + /* + * Shmem mapping may contain swapin error entries, which are + * absent. Page table may contain migration or hwpoison + * entries which are always uptodate. + */ + if (non_swap_entry(entry)) + return !shmem; + + /* + * Shmem mapping lookup is lockless, so we need to grab the swap + * device. The mincore page table walk locks the PTL, so the swap + * device is stable there; avoid touching the si for better performance.
+ */ + if (shmem) { + si = get_swap_device(entry); + if (!si) + return 0; + } + folio = filemap_get_entry(swap_address_space(entry), + swap_cache_index(entry)); + if (shmem) + put_swap_device(si); + /* The swap cache space contains either a folio, a shadow entry or NULL */ + if (folio && !xa_is_value(folio)) { + present = folio_test_uptodate(folio); + folio_put(folio); + } + + return present; +} + /* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, @@ -64,33 +106,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - if (IS_ENABLED(CONFIG_SWAP) && shmem_mapping(mapping)) { - folio = filemap_get_entry(mapping, index); - /* - * shmem/tmpfs may return swap: account for swapcache - * page too. - */ + folio = filemap_get_entry(mapping, index); + if (folio) { if (xa_is_value(folio)) { - struct swap_info_struct *si; - swp_entry_t swp = radix_to_swp_entry(folio); - /* There might be swapin error entries in shmem mapping. */ - if (non_swap_entry(swp)) + if (shmem_mapping(mapping)) + return mincore_swap(radix_to_swp_entry(folio), + true); + else return 0; - /* Prevent the swap device from being swapped off under us */ - si = get_swap_device(swp); - if (si) { - folio = filemap_get_folio(swap_address_space(swp), - swap_cache_index(swp)); - put_swap_device(si); - } else { - return 0; - } } - } else { - folio = filemap_get_folio(mapping, index); - } - - if (!IS_ERR_OR_NULL(folio)) { present = folio_test_uptodate(folio); folio_put(folio); } @@ -168,23 +192,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (i = 0; i < step; i++) vec[i] = 1; } else { /* pte is a swap entry */ - swp_entry_t entry = pte_to_swp_entry(pte); - - if (non_swap_entry(entry)) { - /* - * migration or hwpoison entries are always - * uptodate - */ - *vec = 1; - } else { -#ifdef CONFIG_SWAP - *vec = mincore_page(swap_address_space(entry), - swap_cache_index(entry)); -#else - WARN_ON(1); - *vec = 1; -#endif - } + *vec = mincore_swap(pte_to_swp_entry(pte), false); } vec += step; } From 95c2908f1a4fd608b1cdbb5acef3572e5d769e1c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 16:39:47 +0200 Subject: [PATCH 034/372] mm/migrate: remove MIGRATEPAGE_UNMAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migrate_folio_unmap() is the only user of MIGRATEPAGE_UNMAP. We want to remove MIGRATEPAGE_* completely. It's rather weird to have a generic MIGRATEPAGE_UNMAP, documented to be returned from address-space callbacks, when it's only used for an internal helper. Let's start by having only a single "success" return value for migrate_folio_unmap() -- 0 -- by moving the "folio was already freed" check into the single caller. There is a remaining comment for PG_isolated, which we renamed to PG_movable_ops_isolated recently and forgot to update. While we might still run into that case with zsmalloc, it's something we want to get rid of soon. So let's just focus that optimization on real folios only for now by excluding movable_ops pages. Note that concurrent freeing can happen at any time and this "already freed" check is not relevant for correctness.
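Hoisted into migrate_pages_batch(), the check looks roughly like this (simplified; the statistics accounting is omitted, see the hunk below for the full version):

	/*
	 * If we are holding the last folio reference, the folio was
	 * freed from under us, so just drop the reference and move on.
	 */
	if (likely(!page_has_movable_ops(&folio->page)) &&
	    folio_ref_count(folio) == 1) {
		folio_clear_active(folio);
		folio_clear_unevictable(folio);
		list_del(&folio->lru);
		migrate_folio_done(folio, reason);
		continue;
	}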
[david@redhat.com: no need to pass "reason" to migrate_folio_unmap(), per Lance] Link: https://lkml.kernel.org/r/3bb725f8-28d7-4aa2-b75f-af40d5cab280@redhat.com Link: https://lkml.kernel.org/r/20250811143949.1117439-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Lance Yang Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Benjamin LaHaise Cc: Byungchul Park Cc: Chris Mason Cc: Christian Brauner Cc: Christophe Leroy Cc: Dave Kleikamp Cc: David Sterba Cc: Eugenio Pérez Cc: Greg Kroah-Hartman Cc: Gregory Price Cc: "Huang, Ying" Cc: Jan Kara Cc: Jason Wang Cc: Jerrin Shaji George Cc: Josef Bacik Cc: Joshua Hahn Cc: Madhavan Srinivasan Cc: Matthew Brost Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Rakie Kim Cc: Sergey Senozhatsky Cc: Xuan Zhuo Cc: Dave Kleikamp Signed-off-by: Andrew Morton --- include/linux/migrate.h | 1 - mm/migrate.c | 45 ++++++++++++++++++++--------------------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9009e27b5f44..302f3e95faea 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -18,7 +18,6 @@ struct migration_target_control; * - zero on page migration success; */ #define MIGRATEPAGE_SUCCESS 0 -#define MIGRATEPAGE_UNMAP 1 /** * struct movable_operations - Driver page migration diff --git a/mm/migrate.c b/mm/migrate.c index 9e5ef39ce73a..2ef467eae31d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1189,7 +1189,7 @@ static void migrate_folio_done(struct folio *src, static int migrate_folio_unmap(new_folio_t get_new_folio, free_folio_t put_new_folio, unsigned long private, struct folio *src, struct folio **dstp, enum migrate_mode mode, - enum migrate_reason reason, struct list_head *ret) + struct list_head *ret) { struct folio *dst; int rc = -EAGAIN; @@ -1198,16 +1198,6 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, bool locked = false; bool dst_locked = false; - if (folio_ref_count(src) == 1) { - /* Folio was freed from under us. So we are done. */ - folio_clear_active(src); - folio_clear_unevictable(src); - /* free_pages_prepare() will clear PG_isolated. */ - list_del(&src->lru); - migrate_folio_done(src, reason); - return MIGRATEPAGE_SUCCESS; - } - dst = get_new_folio(src, private); if (!dst) return -ENOMEM; @@ -1297,7 +1287,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (unlikely(page_has_movable_ops(&src->page))) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } /* @@ -1327,7 +1317,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, if (!folio_mapped(src)) { __migrate_folio_record(dst, old_page_state, anon_vma); - return MIGRATEPAGE_UNMAP; + return 0; } out: @@ -1870,14 +1860,27 @@ static int migrate_pages_batch(struct list_head *from, continue; } + /* + * If we are holding the last folio reference, the folio + * was freed from under us, so just drop our reference.
+ */ + if (likely(!page_has_movable_ops(&folio->page)) && + folio_ref_count(folio) == 1) { + folio_clear_active(folio); + folio_clear_unevictable(folio); + list_del(&folio->lru); + migrate_folio_done(folio, reason); + stats->nr_succeeded += nr_pages; + stats->nr_thp_succeeded += is_thp; + continue; + } + rc = migrate_folio_unmap(get_new_folio, put_new_folio, - private, folio, &dst, mode, reason, - ret_folios); + private, folio, &dst, mode, ret_folios); /* * The rules are: - * Success: folio will be freed - * Unmap: folio will be put on unmap_folios list, - * dst folio put on dst_folios list + * 0: folio will be put on unmap_folios list, + * dst folio put on dst_folios list * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1927,11 +1930,7 @@ static int migrate_pages_batch(struct list_head *from, thp_retry += is_thp; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: - stats->nr_succeeded += nr_pages; - stats->nr_thp_succeeded += is_thp; - break; - case MIGRATEPAGE_UNMAP: + case 0: list_move_tail(&folio->lru, &unmap_folios); list_add_tail(&dst->lru, &dst_folios); break; From fb49a4425cfa163faccd91f913773d3401d3a7d4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 16:39:48 +0200 Subject: [PATCH 035/372] treewide: remove MIGRATEPAGE_SUCCESS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At this point MIGRATEPAGE_SUCCESS is misnamed for all folio users, and now that we remove MIGRATEPAGE_UNMAP, it's really the only "success" return value that the code uses and expects. Let's just get rid of MIGRATEPAGE_SUCCESS completely and just use "0" for success. Link: https://lkml.kernel.org/r/20250811143949.1117439-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan [mm] Acked-by: Dave Kleikamp [jfs] Acked-by: David Sterba [btrfs] Acked-by: Greg Kroah-Hartman Reviewed-by: Byungchul Park Cc: Alistair Popple Cc: Al Viro Cc: Arnd Bergmann Cc: Benjamin LaHaise Cc: Chris Mason Cc: Christian Brauner Cc: Christophe Leroy Cc: Dave Kleikamp Cc: Eugenio Pérez Cc: Gregory Price Cc: "Huang, Ying" Cc: Jan Kara Cc: Jason Wang Cc: Jerrin Shaji George Cc: Josef Bacik Cc: Joshua Hahn Cc: Madhavan Srinivasan Cc: Matthew Brost Cc: Michael Ellerman Cc: "Michael S.
Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Rakie Kim Cc: Sergey Senozhatsky Cc: Xuan Zhuo Cc: Lance Yang Signed-off-by: Andrew Morton --- arch/powerpc/platforms/pseries/cmm.c | 2 +- drivers/misc/vmw_balloon.c | 4 +-- drivers/virtio/virtio_balloon.c | 2 +- fs/aio.c | 2 +- fs/btrfs/inode.c | 4 +-- fs/hugetlbfs/inode.c | 4 +-- fs/jfs/jfs_metapage.c | 8 +++--- include/linux/migrate.h | 10 +------ mm/migrate.c | 40 +++++++++++++--------------- mm/migrate_device.c | 2 +- mm/zsmalloc.c | 4 +-- 11 files changed, 36 insertions(+), 46 deletions(-) diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 5e0a718d1be7..0823fa2da151 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -545,7 +545,7 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, /* balloon page list reference */ put_page(page); - return MIGRATEPAGE_SUCCESS; + return 0; } static void cmm_balloon_compaction_init(void) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 6653fc53c951..6df51ee8db62 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -1806,7 +1806,7 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, * the list after acquiring the lock. */ get_page(newpage); - ret = MIGRATEPAGE_SUCCESS; + ret = 0; } /* Update the balloon list under the @pages_lock */ @@ -1817,7 +1817,7 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, * If we succeed just insert it to the list and update the statistics * under the lock. */ - if (ret == MIGRATEPAGE_SUCCESS) { + if (!ret) { balloon_page_insert(&b->b_dev_info, newpage); __count_vm_event(BALLOON_MIGRATE); } diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index e299e18346a3..eae65136cdfb 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -875,7 +875,7 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, balloon_page_finalize(page); put_page(page); /* balloon reference */ - return MIGRATEPAGE_SUCCESS; + return 0; } #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/fs/aio.c b/fs/aio.c index 7fc7b6221312..059e03cfa088 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -445,7 +445,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, folio_get(dst); rc = folio_migrate_mapping(mapping, dst, src, 1); - if (rc != MIGRATEPAGE_SUCCESS) { + if (rc) { folio_put(dst); goto out_unlock; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dd82dcc7b2b7..0bb604dbd673 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7421,7 +7421,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, { int ret = filemap_migrate_folio(mapping, dst, src, mode); - if (ret != MIGRATEPAGE_SUCCESS) + if (ret) return ret; if (folio_test_ordered(src)) { @@ -7429,7 +7429,7 @@ static int btrfs_migrate_folio(struct address_space *mapping, folio_set_ordered(dst); } - return MIGRATEPAGE_SUCCESS; + return 0; } #else #define btrfs_migrate_folio NULL diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 09d4baef29cf..34d496a2b7de 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1052,7 +1052,7 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping, int rc; rc = migrate_huge_page_move_mapping(mapping, dst, src); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (hugetlb_folio_subpool(src)) { @@ -1063,7 +1063,7 @@ static int 
hugetlbfs_migrate_folio(struct address_space *mapping, folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return 0; } #else #define hugetlbfs_migrate_folio NULL diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index b98cf3bb6c1f..871cf4fb3636 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -169,7 +169,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, } rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; for (i = 0; i < MPS_PER_PAGE; i++) { @@ -199,7 +199,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, } } - return MIGRATEPAGE_SUCCESS; + return 0; } #endif /* CONFIG_MIGRATION */ @@ -242,7 +242,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, return -EAGAIN; rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (unlikely(insert_metapage(dst, mp))) @@ -253,7 +253,7 @@ static int __metapage_migrate_folio(struct address_space *mapping, mp->folio = dst; remove_metapage(src, mp); - return MIGRATEPAGE_SUCCESS; + return 0; } #endif /* CONFIG_MIGRATION */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 302f3e95faea..1f0ac122c3bf 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -12,13 +12,6 @@ typedef void free_folio_t(struct folio *folio, unsigned long private); struct migration_target_control; -/* - * Return values from addresss_space_operations.migratepage(): - * - negative errno on page migration failure; - * - zero on page migration success; - */ -#define MIGRATEPAGE_SUCCESS 0 - /** * struct movable_operations - Driver page migration * @isolate_page: @@ -34,8 +27,7 @@ struct migration_target_control; * @src page. The driver should copy the contents of the * @src page to the @dst page and set up the fields of @dst page. * Both pages are locked. - * If page migration is successful, the driver should - * return MIGRATEPAGE_SUCCESS. + * If page migration is successful, the driver should return 0. * If the driver cannot migrate the page at the moment, it can return * -EAGAIN. The VM interprets this as a temporary migration failure and * will retry it later. Any other error value is a permanent migration diff --git a/mm/migrate.c b/mm/migrate.c index 2ef467eae31d..8e435a078fc3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -231,18 +231,17 @@ static void putback_movable_ops_page(struct page *page) * src and dst are also released by migration core. These pages will not be * folios in the future, so that must be reworked. * - * Returns MIGRATEPAGE_SUCCESS on success, otherwise a negative error - * code. + * Returns 0 on success, otherwise a negative error code. 
*/ static int migrate_movable_ops_page(struct page *dst, struct page *src, enum migrate_mode mode) { - int rc = MIGRATEPAGE_SUCCESS; + int rc; VM_WARN_ON_ONCE_PAGE(!page_has_movable_ops(src), src); VM_WARN_ON_ONCE_PAGE(!PageMovableOpsIsolated(src), src); rc = page_movable_ops(src)->migrate_page(dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) ClearPageMovableOpsIsolated(src); return rc; } @@ -587,7 +586,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_swapbacked(folio)) __folio_set_swapbacked(newfolio); - return MIGRATEPAGE_SUCCESS; + return 0; } oldzone = folio_zone(folio); @@ -688,7 +687,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, } local_irq_enable(); - return MIGRATEPAGE_SUCCESS; + return 0; } int folio_migrate_mapping(struct address_space *mapping, @@ -737,7 +736,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, xas_unlock_irq(&xas); - return MIGRATEPAGE_SUCCESS; + return 0; } /* @@ -853,14 +852,14 @@ static int __migrate_folio(struct address_space *mapping, struct folio *dst, return rc; rc = __folio_migrate_mapping(mapping, dst, src, expected_count); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) return rc; if (src_private) folio_attach_private(dst, folio_detach_private(src)); folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return 0; } /** @@ -967,7 +966,7 @@ recheck_buffers: } rc = filemap_migrate_folio(mapping, dst, src, mode); - if (rc != MIGRATEPAGE_SUCCESS) + if (rc) goto unlock_buffers; bh = head; @@ -1071,7 +1070,7 @@ static int fallback_migrate_folio(struct address_space *mapping, * * Return value: * < 0 - error code - * MIGRATEPAGE_SUCCESS - success + * 0 - success */ static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -1099,7 +1098,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, else rc = fallback_migrate_folio(mapping, dst, src, mode); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { /* * For pagecache folios, src->mapping must be cleared before src * is freed. Anonymous folios must stay anonymous until freed. @@ -1449,7 +1448,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ folio_putback_hugetlb(src); - return MIGRATEPAGE_SUCCESS; + return 0; } dst = get_new_folio(src, private); @@ -1512,8 +1511,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) - remove_migration_ptes(src, - rc == MIGRATEPAGE_SUCCESS ? dst : src, 0); + remove_migration_ptes(src, !rc ? 
dst : src, 0); unlock_put_anon: folio_unlock(dst); put_anon: if (anon_vma) put_anon_vma(anon_vma); - if (rc == MIGRATEPAGE_SUCCESS) { + if (!rc) { move_hugetlb_state(src, dst, reason); put_new_folio = NULL; } @@ -1530,7 +1528,7 @@ put_anon: out_unlock: folio_unlock(src); out: - if (rc == MIGRATEPAGE_SUCCESS) + if (!rc) folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); @@ -1640,7 +1638,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, reason, ret_folios); /* * The rules are: - * Success: hugetlb folio will be put back + * 0: hugetlb folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * Other errno: put on ret_folios list @@ -1657,7 +1655,7 @@ static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio, retry++; nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; break; default: @@ -1711,7 +1709,7 @@ static void migrate_folios_move(struct list_head *src_folios, reason, ret_folios); /* * The rules are: - * Success: folio will be freed + * 0: folio will be freed * -EAGAIN: stay on the unmap_folios list * Other errno: put on ret_folios list */ @@ -1721,7 +1719,7 @@ static void migrate_folios_move(struct list_head *src_folios, *thp_retry += is_thp; *nr_retry_pages += nr_pages; break; - case MIGRATEPAGE_SUCCESS: + case 0: stats->nr_succeeded += nr_pages; stats->nr_thp_succeeded += is_thp; break; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index e05e14d6eacd..abd9f6850db6 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -778,7 +778,7 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (migrate && migrate->fault_page == page) extra_cnt = 1; r = folio_migrate_mapping(mapping, newfolio, folio, extra_cnt); - if (r != MIGRATEPAGE_SUCCESS) + if (r) src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; else folio_migrate_flags(newfolio, folio); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 805a10b41266..153783d49d34 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1746,7 +1746,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, * instead. */ if (!zpdesc->zspage) - return MIGRATEPAGE_SUCCESS; + return 0; /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(zpdesc); @@ -1813,7 +1813,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, reset_zpdesc(zpdesc); zpdesc_put(zpdesc); - return MIGRATEPAGE_SUCCESS; + return 0; } static void zs_page_putback(struct page *page) From b7298e418e298545b195aef51c01cceed816558e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:21 +0200 Subject: [PATCH 036/372] mm/huge_memory: move more common code into insert_pmd() Patch series "mm: vm_normal_page*() improvements", v3. Clean up and unify vm_normal_page_*() handling, also marking the huge zerofolio as special in the PMD. Add+use vm_normal_page_pud() and clean up that XEN vm_ops->find_special_page thingy. There are plans to use vm_normal_page_*() more widely soon. This patch (of 11): Let's clean it all further up. No functional change intended.
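In other words, the prologue that vmf_insert_pfn_pmd() and vmf_insert_folio_pmd() previously duplicated now lives in insert_pmd() itself; the resulting shape is roughly the following (error paths abbreviated, see the diff below for the full version):

	static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr,
			pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot,
			bool write)
	{
		pgtable_t pgtable = NULL;
		spinlock_t *ptl;

		if (addr < vma->vm_start || addr >= vma->vm_end)
			return VM_FAULT_SIGBUS;
		if (arch_needs_pgtable_deposit()) {
			pgtable = pte_alloc_one(vma->vm_mm);
			if (!pgtable)
				return VM_FAULT_OOM;
		}
		ptl = pmd_lock(vma->vm_mm, pmd);
		/* ... map the folio or pfn, then unlock ... */
	}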
Link: https://lkml.kernel.org/r/20250811112631.759341-1-david@redhat.com Link: https://lkml.kernel.org/r/20250811112631.759341-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Alistair Popple Reviewed-by: Lorenzo Stoakes Reviewed-by: Wei Yang Reviewed-by: Lance Yang Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juergen Gross Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Cc: David Vrabel Signed-off-by: Andrew Morton --- mm/huge_memory.c | 72 ++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 48 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2b4ea5a2ce7d..5314a89d676f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1379,15 +1379,25 @@ struct folio_or_pfn { bool is_folio; }; -static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, struct folio_or_pfn fop, pgprot_t prot, - bool write, pgtable_t pgtable) + bool write) { struct mm_struct *mm = vma->vm_mm; + pgtable_t pgtable = NULL; + spinlock_t *ptl; pmd_t entry; - lockdep_assert_held(pmd_lockptr(mm, pmd)); + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + + ptl = pmd_lock(mm, pmd); if (!pmd_none(*pmd)) { const unsigned long pfn = fop.is_folio ?
folio_pfn(fop.folio) : fop.pfn; @@ -1395,15 +1405,14 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, if (write) { if (pmd_pfn(*pmd) != pfn) { WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); - return -EEXIST; + goto out_unlock; } entry = pmd_mkyoung(*pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) update_mmu_cache_pmd(vma, addr, pmd); } - - return -EEXIST; + goto out_unlock; } if (fop.is_folio) { @@ -1424,11 +1433,17 @@ static int insert_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); mm_inc_nr_ptes(mm); + pgtable = NULL; } set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); - return 0; + +out_unlock: + spin_unlock(ptl); + if (pgtable) + pte_free(mm, pgtable); + return VM_FAULT_NOPAGE; } /** @@ -1450,9 +1465,6 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, struct folio_or_pfn fop = { .pfn = pfn, }; - pgtable_t pgtable = NULL; - spinlock_t *ptl; - int error; /* * If we had pmd_special, we could avoid all these restrictions, @@ -1464,25 +1476,9 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, unsigned long pfn, (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; - - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } - pfnmap_setup_cachemode_pfn(pfn, &pgprot); - ptl = pmd_lock(vma->vm_mm, vmf->pmd); - error = insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write, - pgtable); - spin_unlock(ptl); - if (error && pgtable) - pte_free(vma->vm_mm, pgtable); - - return VM_FAULT_NOPAGE; + return insert_pmd(vma, addr, vmf->pmd, fop, pgprot, write); } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); @@ -1491,35 +1487,15 @@ vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PMD_MASK; - struct mm_struct *mm = vma->vm_mm; struct folio_or_pfn fop = { .folio = folio, .is_folio = true, }; - spinlock_t *ptl; - pgtable_t pgtable = NULL; - int error; - - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER)) return VM_FAULT_SIGBUS; - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } - - ptl = pmd_lock(mm, vmf->pmd); - error = insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, - write, pgtable); - spin_unlock(ptl); - if (error && pgtable) - pte_free(mm, pgtable); - - return VM_FAULT_NOPAGE; + return insert_pmd(vma, addr, vmf->pmd, fop, vma->vm_page_prot, write); } EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd); From 77e493280e92301d3151d6a18a2e17f0c30a582c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:22 +0200 Subject: [PATCH 037/372] mm/huge_memory: move more common code into insert_pud() Let's clean it all further up. No functional change intended. 
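The resulting shape mirrors insert_pmd() from the previous patch, minus one
detail: there is no page table deposit at the PUD level, so the helper only
needs the range check and the PUD lock (simplified sketch; the real hunks are
in the diff below):

    static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr,
                                 pud_t *pud, struct folio_or_pfn fop,
                                 pgprot_t prot, bool write)
    {
            spinlock_t *ptl;

            if (addr < vma->vm_start || addr >= vma->vm_end)
                    return VM_FAULT_SIGBUS;
            ptl = pud_lock(vma->vm_mm, pud);
            /* ... construct and install the PUD entry ... */
            spin_unlock(ptl);
            return VM_FAULT_NOPAGE;
    }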
Link: https://lkml.kernel.org/r/20250811112631.759341-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Alistair Popple Reviewed-by: Lorenzo Stoakes Reviewed-by: Wei Yang Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5314a89d676f..7933791b75f4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1507,25 +1507,30 @@ static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) return pud; } -static void insert_pud(struct vm_area_struct *vma, unsigned long addr, +static vm_fault_t insert_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, struct folio_or_pfn fop, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pud_t entry; + if (addr < vma->vm_start || addr >= vma->vm_end) + return VM_FAULT_SIGBUS; + + ptl = pud_lock(mm, pud); if (!pud_none(*pud)) { const unsigned long pfn = fop.is_folio ? folio_pfn(fop.folio) : fop.pfn; if (write) { if (WARN_ON_ONCE(pud_pfn(*pud) != pfn)) - return; + goto out_unlock; entry = pud_mkyoung(*pud); entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); if (pudp_set_access_flags(vma, addr, pud, entry, 1)) update_mmu_cache_pud(vma, addr, pud); } - return; + goto out_unlock; } if (fop.is_folio) { @@ -1544,6 +1549,9 @@ static void insert_pud(struct vm_area_struct *vma, unsigned long addr, } set_pud_at(mm, addr, pud, entry); update_mmu_cache_pud(vma, addr, pud); +out_unlock: + spin_unlock(ptl); + return VM_FAULT_NOPAGE; } /** @@ -1565,7 +1573,6 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, struct folio_or_pfn fop = { .pfn = pfn, }; - spinlock_t *ptl; /* * If we had pud_special, we could avoid all these restrictions, @@ -1577,16 +1584,9 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, unsigned long pfn, (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; - pfnmap_setup_cachemode_pfn(pfn, &pgprot); - ptl = pud_lock(vma->vm_mm, vmf->pud); - insert_pud(vma, addr, vmf->pud, fop, pgprot, write); - spin_unlock(ptl); - - return VM_FAULT_NOPAGE; + return insert_pud(vma, addr, vmf->pud, fop, pgprot, write); } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); @@ -1603,25 +1603,15 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, { struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address & PUD_MASK; - pud_t *pud = vmf->pud; - struct mm_struct *mm = vma->vm_mm; struct folio_or_pfn fop = { .folio = folio, .is_folio = true, }; - spinlock_t *ptl; - - if (addr < vma->vm_start || addr >= vma->vm_end) - return VM_FAULT_SIGBUS; if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER)) return VM_FAULT_SIGBUS; - ptl = pud_lock(mm, pud); - insert_pud(vma, addr, vmf->pud, fop, vma->vm_page_prot, write); - spin_unlock(ptl); - - return VM_FAULT_NOPAGE; + return insert_pud(vma, addr, 
vmf->pud, fop, vma->vm_page_prot, write); } EXPORT_SYMBOL_GPL(vmf_insert_folio_pud); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ From 5528ef06da011aa35fc9704127e41a154649a6df Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:23 +0200 Subject: [PATCH 038/372] mm/huge_memory: support huge zero folio in vmf_insert_folio_pmd() Just like we do for vmf_insert_page_mkwrite() -> ... -> insert_page_into_pte_locked() with the shared zeropage, support the huge zero folio in vmf_insert_folio_pmd(). When (un)mapping the huge zero folio in page tables, we neither adjust the refcount nor the mapcount, just like for the shared zeropage. For now, the huge zero folio is not marked as special yet, although vm_normal_page_pmd() really wants to treat it as special. We'll change that next. Link: https://lkml.kernel.org/r/20250811112631.759341-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Lorenzo Stoakes Reviewed-by: Wei Yang Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7933791b75f4..ec89e0607424 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1418,9 +1418,11 @@ static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr, if (fop.is_folio) { entry = folio_mk_pmd(fop.folio, vma->vm_page_prot); - folio_get(fop.folio); - folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); - add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); + if (!is_huge_zero_folio(fop.folio)) { + folio_get(fop.folio); + folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); + add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); + } } else { entry = pmd_mkhuge(pfn_pmd(fop.pfn, prot)); entry = pmd_mkspecial(entry); From b0f86aaebed1fc2bdc7475b14675ebc9d542e5d3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:24 +0200 Subject: [PATCH 039/372] fs/dax: use vmf_insert_folio_pmd() to insert the huge zero folio Let's convert to vmf_insert_folio_pmd(). There is a theoretical change in behavior: in the unlikely case there is already something mapped, we'll now still call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE. Previously, we would have returned VM_FAULT_FALLBACK, and the caller would have zapped the PMD to try a PTE fault. However, that behavior was different to other PTE+PMD faults, when there would already be something mapped, and it's not even clear if it could be triggered. Assuming the huge zero folio is already mapped, all good, no need to fallback to PTEs. Assuming there is already a leaf page table ... the behavior would be just like when trying to insert a PMD mapping a folio through dax_fault_iter()->vmf_insert_folio_pmd(). Assuming there is already something else mapped as PMD? It sounds like a BUG, and the behavior would be just like when trying to insert a PMD mapping a folio through dax_fault_iter()->vmf_insert_folio_pmd(). 
So, it sounds reasonable to not handle huge zero folios differently to inserting PMDs mapping folios when there already is something mapped. Link: https://lkml.kernel.org/r/20250811112631.759341-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Alistair Popple Reviewed-by: Lorenzo Stoakes Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Oscar Salvador Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/dax.c | 47 ++++++++++------------------------------------- 1 file changed, 10 insertions(+), 37 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 20ecf652c129..89f071ba7b10 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1375,51 +1375,24 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, const struct iomap_iter *iter, void **entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; - unsigned long pmd_addr = vmf->address & PMD_MASK; - struct vm_area_struct *vma = vmf->vma; struct inode *inode = mapping->host; - pgtable_t pgtable = NULL; struct folio *zero_folio; - spinlock_t *ptl; - pmd_t pmd_entry; - unsigned long pfn; + vm_fault_t ret; zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm); - if (unlikely(!zero_folio)) - goto fallback; + if (unlikely(!zero_folio)) { + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry); + return VM_FAULT_FALLBACK; + } - pfn = page_to_pfn(&zero_folio->page); - *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, + *entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio), DAX_PMD | DAX_ZERO_PAGE); - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } - - ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); - if (!pmd_none(*(vmf->pmd))) { - spin_unlock(ptl); - goto fallback; - } - - if (pgtable) { - pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - mm_inc_nr_ptes(vma->vm_mm); - } - pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot); - set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); - spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); - return VM_FAULT_NOPAGE; - -fallback: - if (pgtable) - pte_free(vma->vm_mm, pgtable); - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry); - return VM_FAULT_FALLBACK; + ret = vmf_insert_folio_pmd(vmf, zero_folio, false); + if (ret == VM_FAULT_NOPAGE) + trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); + return ret; } #else static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, From d82d09e482199e6bbc204df10b2082f764cbe1f4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:25 +0200 Subject: [PATCH 040/372] mm/huge_memory: mark PMD mappings of the huge zero folio special The huge zero folio is refcounted (+mapcounted -- is that a word?) differently than "normal" folios, similarly (but different) to the ordinary shared zeropage. For this reason, we special-case these pages in vm_normal_page*/vm_normal_folio*, and only allow selected callers to still use them (e.g., GUP can still take a reference on them). 
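For the ordinary shared zeropage, that special-casing already leans on the
"special" page table bit where available; roughly (a condensed sketch of the
existing PTE-side check in vm_normal_page(), not new code):

    if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
            ...
            if (is_zero_pfn(pte_pfn(pte)))
                    return NULL;    /* shared zeropage: not a "normal" page */
            ...
    }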
vm_normal_page_pmd() already filters out the huge zero folio, to indicate it is special (returning NULL). However, so far we are not making use of pmd_special() on architectures that support it (CONFIG_ARCH_HAS_PTE_SPECIAL), like we would with the ordinary shared zeropage. Let's similarly mark PMD mappings of the huge zero folio as special, so we can avoid the manual check for the huge zero folio with CONFIG_ARCH_HAS_PTE_SPECIAL next, and only perform the check on !CONFIG_ARCH_HAS_PTE_SPECIAL. In copy_huge_pmd(), where we have a manual pmd_special() check to handle PFNMAP, we have to manually rule out the huge zero folio. That code needs a serious cleanup, but that's something for another day. While at it, update the doc regarding the shared zero folios. No functional change intended: vm_normal_page_pmd() still returns NULL when it encounters the huge zero folio. Link: https://lkml.kernel.org/r/20250811112631.759341-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juergen Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 8 ++++++-- mm/memory.c | 15 ++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ec89e0607424..58bac83e7fa3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1309,6 +1309,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, { pmd_t entry; entry = folio_mk_pmd(zero_folio, vma->vm_page_prot); + entry = pmd_mkspecial(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); @@ -1418,7 +1419,9 @@ static vm_fault_t insert_pmd(struct vm_area_struct *vma, unsigned long addr, if (fop.is_folio) { entry = folio_mk_pmd(fop.folio, vma->vm_page_prot); - if (!is_huge_zero_folio(fop.folio)) { + if (is_huge_zero_folio(fop.folio)) { + entry = pmd_mkspecial(entry); + } else { folio_get(fop.folio); folio_add_file_rmap_pmd(fop.folio, &fop.folio->page, vma); add_mm_counter(mm, mm_counter_file(fop.folio), HPAGE_PMD_NR); @@ -1643,7 +1646,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, int ret = -ENOMEM; pmd = pmdp_get_lockless(src_pmd); - if (unlikely(pmd_present(pmd) && pmd_special(pmd))) { + if (unlikely(pmd_present(pmd) && pmd_special(pmd) && + !is_huge_zero_pmd(pmd))) { dst_ptl = pmd_lock(dst_mm, dst_pmd); src_ptl = pmd_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); diff --git a/mm/memory.c b/mm/memory.c index 0ba4f6b71847..626caedce35e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -555,7 +555,14 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this - * case, NULL is returned here. "Normal" mappings do have a struct page. + * case, NULL is returned here. "Normal" mappings do have a struct page and + * are ordinarily refcounted.
+ * + * Page mappings of the shared zero folios are always considered "special", as + * they are not ordinarily refcounted: neither the refcount nor the mapcount + * of these folios is adjusted when mapping them into user page tables. + * Selected page table walkers (such as GUP) can still identify mappings of the + * shared zero folios and work with the underlying "struct page". * * There are 2 broad cases. Firstly, an architecture may define a pte_special() * pte bit, in which case this function is trivial. Secondly, an architecture @@ -585,9 +592,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct - * page (that is, those where pfn_valid is true) are refcounted and considered - * normal pages by the VM. The only exception are zeropages, which are - * *never* refcounted. + * page (that is, those where pfn_valid is true, except the shared zero + * folios) are refcounted and considered normal pages by the VM. * * The disadvantage is that pages are refcounted (which can be slower and * simply not an option for some PFNMAP users). The advantage is that we @@ -667,7 +673,6 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, { unsigned long pfn = pmd_pfn(pmd); - /* Currently it's only used for huge pfnmaps */ if (unlikely(pmd_special(pmd))) return NULL; From 30680d5ef0dc7a1032edc1519e85cbb504280d73 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:26 +0200 Subject: [PATCH 041/372] powerpc/ptdump: rename "struct pgtable_level" to "struct ptdump_pg_level" We want to make use of "pgtable_level" for an enum in core-mm. Other architectures seem to call "struct pgtable_level" either: * "struct pg_level" when not exposed in a header (riscv, arm) * "struct ptdump_pg_level" when exposed in a header (arm64) So let's follow what arm64 does.
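The powerpc declarations then mirror arm64's naming; the result looks like
this (sketch matching the diff below):

    struct ptdump_pg_level {
            const struct flag_info *flag;
            size_t num;
            u64 mask;
    };

    extern struct ptdump_pg_level pg_level[5];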
Link: https://lkml.kernel.org/r/20250811112631.759341-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Ritesh Harjani (IBM) Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Oscar Salvador Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/mm/ptdump/8xx.c | 2 +- arch/powerpc/mm/ptdump/book3s64.c | 2 +- arch/powerpc/mm/ptdump/ptdump.h | 4 ++-- arch/powerpc/mm/ptdump/shared.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index b5c79b11ea3c..4ca9cf7a90c9 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -69,7 +69,7 @@ static const struct flag_info flag_array[] = { } }; -struct pgtable_level pg_level[5] = { +struct ptdump_pg_level pg_level[5] = { { /* pgd */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c index 5ad92d9dc5d1..6b2da9241d4c 100644 --- a/arch/powerpc/mm/ptdump/book3s64.c +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -102,7 +102,7 @@ static const struct flag_info flag_array[] = { } }; -struct pgtable_level pg_level[5] = { +struct ptdump_pg_level pg_level[5] = { { /* pgd */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), diff --git a/arch/powerpc/mm/ptdump/ptdump.h b/arch/powerpc/mm/ptdump/ptdump.h index 154efae96ae0..4232aa4b57ea 100644 --- a/arch/powerpc/mm/ptdump/ptdump.h +++ b/arch/powerpc/mm/ptdump/ptdump.h @@ -11,12 +11,12 @@ struct flag_info { int shift; }; -struct pgtable_level { +struct ptdump_pg_level { const struct flag_info *flag; size_t num; u64 mask; }; -extern struct pgtable_level pg_level[5]; +extern struct ptdump_pg_level pg_level[5]; void pt_dump_size(struct seq_file *m, unsigned long delta); diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index 39c30c62b7ea..58998960eb9a 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -67,7 +67,7 @@ static const struct flag_info flag_array[] = { } }; -struct pgtable_level pg_level[5] = { +struct ptdump_pg_level pg_level[5] = { { /* pgd */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), From b22cc9a9c7ff0ad8998d58fdd7122de6038c46a7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:27 +0200 Subject: [PATCH 042/372] mm/rmap: convert "enum rmap_level" to "enum pgtable_level" Let's factor it out, and convert all checks for unsupported levels to BUILD_BUG(). The code is written in a way such that force-inlining will optimize out the levels. 
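The pattern works because every caller passes the level as a compile-time
constant into an __always_inline function: the switch is resolved at build
time, unreachable cases are discarded, and an unsupported level becomes a
build failure rather than a runtime warning. A minimal sketch of the idea
(__folio_op is a made-up name for illustration):

    static __always_inline void __folio_op(struct folio *folio,
                                           enum pgtable_level level)
    {
            switch (level) {
            case PGTABLE_LEVEL_PTE:
                    /* PTE-granularity handling */
                    break;
            case PGTABLE_LEVEL_PMD:
            case PGTABLE_LEVEL_PUD:
                    /* "entire mapping" handling */
                    break;
            default:
                    BUILD_BUG();    /* dead code for every constant level */
            }
    }

    void folio_op_pte(struct folio *folio)
    {
            /* level is constant here, so BUILD_BUG() compiles away */
            __folio_op(folio, PGTABLE_LEVEL_PTE);
    }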
[nathan@kernel.org: always inline __folio_rmap_sanity_checks()] Link: https://lkml.kernel.org/r/20250814-rmap-fix-build_bug-conversion-v1-1-fb7b10a0b362@kernel.org Link: https://lkml.kernel.org/r/20250811112631.759341-8-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Nathan Chancellor Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Oscar Salvador Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Cc: Nathan Chancellor Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 8 ++++++ include/linux/rmap.h | 62 +++++++++++++++++++---------------------- mm/rmap.c | 56 ++++++++++++++++++++----------------- 3 files changed, 67 insertions(+), 59 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 2b80fd456c8b..4f88e460eb9c 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1975,6 +1975,14 @@ static inline bool arch_has_pfn_modify_check(void) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; +enum pgtable_level { + PGTABLE_LEVEL_PTE = 0, + PGTABLE_LEVEL_PMD, + PGTABLE_LEVEL_PUD, + PGTABLE_LEVEL_P4D, + PGTABLE_LEVEL_PGD, +}; + #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 6cd020eea37a..e8aff6d2deda 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -394,18 +394,8 @@ typedef int __bitwise rmap_t; /* The anonymous (sub)page is exclusive to a single process. */ #define RMAP_EXCLUSIVE ((__force rmap_t)BIT(0)) -/* - * Internally, we're using an enum to specify the granularity. We make the - * compiler emit specialized code for each granularity. - */ -enum rmap_level { - RMAP_LEVEL_PTE = 0, - RMAP_LEVEL_PMD, - RMAP_LEVEL_PUD, -}; - -static inline void __folio_rmap_sanity_checks(const struct folio *folio, - const struct page *page, int nr_pages, enum rmap_level level) +static __always_inline void __folio_rmap_sanity_checks(const struct folio *folio, + const struct page *page, int nr_pages, enum pgtable_level level) { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); @@ -427,18 +417,18 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: break; - case RMAP_LEVEL_PMD: + case PGTABLE_LEVEL_PMD: /* * We don't support folios larger than a single PMD yet. So - * when RMAP_LEVEL_PMD is set, we assume that we are creating + * when PGTABLE_LEVEL_PMD is set, we assume that we are creating * a single "entire" mapping of the folio. */ VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio); VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio); break; - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PUD: /* * Assume that we are creating a single "entire" mapping of the * folio. 
@@ -447,7 +437,7 @@ static inline void __folio_rmap_sanity_checks(const struct folio *folio, VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio); break; default: - VM_WARN_ON_ONCE(true); + BUILD_BUG(); } /* @@ -567,14 +557,14 @@ static inline void hugetlb_remove_rmap(struct folio *folio) static __always_inline void __folio_dup_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, - enum rmap_level level) + enum pgtable_level level) { const int orig_nr_pages = nr_pages; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { atomic_inc(&folio->_mapcount); break; @@ -587,11 +577,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, } folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; + default: + BUILD_BUG(); } } @@ -609,13 +601,13 @@ static __always_inline void __folio_dup_file_rmap(struct folio *folio, static inline void folio_dup_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, PGTABLE_LEVEL_PTE); } static __always_inline void folio_dup_file_rmap_pte(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { - __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, 1, dst_vma, PGTABLE_LEVEL_PTE); } /** @@ -632,7 +624,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *dst_vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PTE); + __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, PGTABLE_LEVEL_PTE); #else WARN_ON_ONCE(true); #endif @@ -640,7 +632,7 @@ static inline void folio_dup_file_rmap_pmd(struct folio *folio, static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, enum rmap_level level) + struct vm_area_struct *src_vma, enum pgtable_level level) { const int orig_nr_pages = nr_pages; bool maybe_pinned; @@ -665,7 +657,7 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, * copying if the folio maybe pinned. 
*/ switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (unlikely(maybe_pinned)) { for (i = 0; i < nr_pages; i++) if (PageAnonExclusive(page + i)) @@ -687,8 +679,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, } while (page++, --nr_pages > 0); folio_add_large_mapcount(folio, orig_nr_pages, dst_vma); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: if (PageAnonExclusive(page)) { if (unlikely(maybe_pinned)) return -EBUSY; @@ -697,6 +689,8 @@ static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio, atomic_inc(&folio->_entire_mapcount); folio_inc_large_mapcount(folio, dst_vma); break; + default: + BUILD_BUG(); } return 0; } @@ -730,7 +724,7 @@ static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma, - src_vma, RMAP_LEVEL_PTE); + src_vma, PGTABLE_LEVEL_PTE); } static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, @@ -738,7 +732,7 @@ static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio, struct vm_area_struct *src_vma) { return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma, - RMAP_LEVEL_PTE); + PGTABLE_LEVEL_PTE); } /** @@ -770,7 +764,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma, - src_vma, RMAP_LEVEL_PMD); + src_vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; @@ -778,7 +772,7 @@ static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio, } static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) + struct page *page, int nr_pages, enum pgtable_level level) { VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio); @@ -873,7 +867,7 @@ static __always_inline int __folio_try_share_anon_rmap(struct folio *folio, static inline int folio_try_share_anon_rmap_pte(struct folio *folio, struct page *page) { - return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE); + return __folio_try_share_anon_rmap(folio, page, 1, PGTABLE_LEVEL_PTE); } /** @@ -904,7 +898,7 @@ static inline int folio_try_share_anon_rmap_pmd(struct folio *folio, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR, - RMAP_LEVEL_PMD); + PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); return -EBUSY; diff --git a/mm/rmap.c b/mm/rmap.c index 84a8d8b02ef7..0e9c4041f868 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1265,7 +1265,7 @@ static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) static __always_inline void __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; @@ -1274,7 +1274,7 @@ static __always_inline void __folio_add_rmap(struct folio *folio, __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_inc_and_test(&folio->_mapcount); break; @@ -1300,11 +1300,11 @@ static __always_inline void __folio_add_rmap(struct folio *folio, folio_add_large_mapcount(folio, orig_nr_pages, vma); break; - case RMAP_LEVEL_PMD: - case 
RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { - if (level == RMAP_LEVEL_PMD && first) + if (level == PGTABLE_LEVEL_PMD && first) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_inc_return_large_mapcount(folio, vma); if (nr == 1) @@ -1323,7 +1323,7 @@ static __always_inline void __folio_add_rmap(struct folio *folio, * We only track PMD mappings of PMD-sized * folios separately. */ - if (level == RMAP_LEVEL_PMD) + if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ @@ -1336,6 +1336,8 @@ static __always_inline void __folio_add_rmap(struct folio *folio, } folio_inc_large_mapcount(folio, vma); break; + default: + BUILD_BUG(); } __folio_mod_stat(folio, nr, nr_pmdmapped); } @@ -1427,7 +1429,7 @@ static void __page_check_anon_rmap(const struct folio *folio, static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - unsigned long address, rmap_t flags, enum rmap_level level) + unsigned long address, rmap_t flags, enum pgtable_level level) { int i; @@ -1440,20 +1442,22 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, if (flags & RMAP_EXCLUSIVE) { switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: for (i = 0; i < nr_pages; i++) SetPageAnonExclusive(page + i); break; - case RMAP_LEVEL_PMD: + case PGTABLE_LEVEL_PMD: SetPageAnonExclusive(page); break; - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PUD: /* * Keep the compiler happy, we don't support anonymous * PUD mappings. */ WARN_ON_ONCE(1); break; + default: + BUILD_BUG(); } } @@ -1507,7 +1511,7 @@ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, rmap_t flags) { __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, - RMAP_LEVEL_PTE); + PGTABLE_LEVEL_PTE); } /** @@ -1528,7 +1532,7 @@ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, - RMAP_LEVEL_PMD); + PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1609,7 +1613,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); @@ -1634,7 +1638,7 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_add_file_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1651,7 +1655,7 @@ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1672,7 +1676,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, 
RMAP_LEVEL_PUD); + __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif @@ -1680,7 +1684,7 @@ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, - enum rmap_level level) + enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; int last = 0, nr = 0, nr_pmdmapped = 0; @@ -1689,7 +1693,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { - case RMAP_LEVEL_PTE: + case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_add_negative(-1, &folio->_mapcount); break; @@ -1719,11 +1723,11 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && atomic_read(mapped); break; - case RMAP_LEVEL_PMD: - case RMAP_LEVEL_PUD: + case PGTABLE_LEVEL_PMD: + case PGTABLE_LEVEL_PUD: if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { last = atomic_add_negative(-1, &folio->_entire_mapcount); - if (level == RMAP_LEVEL_PMD && last) + if (level == PGTABLE_LEVEL_PMD && last) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_dec_return_large_mapcount(folio, vma); if (!nr) { @@ -1743,7 +1747,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); - if (level == RMAP_LEVEL_PMD) + if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of another remove and an add? */ @@ -1757,6 +1761,8 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, partially_mapped = nr && nr < nr_pmdmapped; break; + default: + BUILD_BUG(); } /* @@ -1796,7 +1802,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, void folio_remove_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { - __folio_remove_rmap(folio, page, nr_pages, vma, RMAP_LEVEL_PTE); + __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** @@ -1813,7 +1819,7 @@ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, RMAP_LEVEL_PMD); + __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif @@ -1834,7 +1840,7 @@ void folio_remove_rmap_pud(struct folio *folio, struct page *page, { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) - __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, RMAP_LEVEL_PUD); + __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif From ec63a44011dccebca24e7ef7e8a9521306de1bc9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:28 +0200 Subject: [PATCH 043/372] mm/memory: convert print_bad_pte() to print_bad_page_map() print_bad_pte() looks like something that should actually be a WARN or similar, but historically it apparently has proven to be useful to detect corruption of page tables even on production systems -- report the issue and keep the system running to make it easier to actually detect what is going wrong (e.g., multiple such messages might shed a light). 
As we want to unify vm_normal_page_*() handling for PTE/PMD/PUD, we'll have to take care of print_bad_pte() as well. Let's prepare for using print_bad_pte() also for non-PTEs by adjusting the implementation and renaming the function to print_bad_page_map(). Provide print_bad_pte() as a simple wrapper. Document the implicit locking requirements for the page table re-walk. To make the function a bit more readable, factor out the ratelimit check into is_bad_page_map_ratelimited() and place the printing of page table content into __print_bad_page_map_pgtable(). We'll now dump information from each level in a single line, and just stop the table walk once we hit something that is not a present page table. The report will now look something like (dumping pgd to pmd values): [ 77.943408] BUG: Bad page map in process XXX pte:80000001233f5867 [ 77.944077] addr:00007fd84bb1c000 vm_flags:08100071 anon_vma: ... [ 77.945186] pgd:10a89f067 p4d:10a89f067 pud:10e5a2067 pmd:105327067 Not using pgdp_get(), because that does not work properly on some arm configs where pgd_t is an array. Note that we are dumping all levels even when levels are folded for simplicity. [david@redhat.com: drop warning] Link: https://lkml.kernel.org/r/923b279c-de33-44dd-a923-2959afad8626@redhat.com Link: https://lkml.kernel.org/r/20250811112631.759341-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Oscar Salvador Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 18 +++++++ mm/memory.c | 104 ++++++++++++++++++++++++++++++++-------- 2 files changed, 102 insertions(+), 20 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 4f88e460eb9c..94249e671a7e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1983,6 +1983,24 @@ enum pgtable_level { PGTABLE_LEVEL_PGD, }; +static inline const char *pgtable_level_to_str(enum pgtable_level level) +{ + switch (level) { + case PGTABLE_LEVEL_PTE: + return "pte"; + case PGTABLE_LEVEL_PMD: + return "pmd"; + case PGTABLE_LEVEL_PUD: + return "pud"; + case PGTABLE_LEVEL_P4D: + return "p4d"; + case PGTABLE_LEVEL_PGD: + return "pgd"; + default: + return "unknown"; + } +} + #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) diff --git a/mm/memory.c b/mm/memory.c index 626caedce35e..dc0107354d37 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -491,22 +491,8 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) add_mm_counter(mm, i, rss[i]); } -/* - * This function is called to print an error when a bad pte - * is found. For example, we might have a PFN-mapped pte in - * a region that doesn't allow it. - * - * The calling function must still handle the error. 
- */ -static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, struct page *page) +static bool is_bad_page_map_ratelimited(void) { - pgd_t *pgd = pgd_offset(vma->vm_mm, addr); - p4d_t *p4d = p4d_offset(pgd, addr); - pud_t *pud = pud_offset(p4d, addr); - pmd_t *pmd = pmd_offset(pud, addr); - struct address_space *mapping; - pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; @@ -518,7 +504,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; - return; + return true; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", @@ -529,15 +515,91 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; + return false; +} + +static void __print_bad_page_map_pgtable(struct mm_struct *mm, unsigned long addr) +{ + unsigned long long pgdv, p4dv, pudv, pmdv; + p4d_t p4d, *p4dp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pgd_t *pgdp; + + /* + * Although this looks like a fully lockless pgtable walk, it is not: + * see locking requirements for print_bad_page_map(). + */ + pgdp = pgd_offset(mm, addr); + pgdv = pgd_val(*pgdp); + + if (!pgd_present(*pgdp) || pgd_leaf(*pgdp)) { + pr_alert("pgd:%08llx\n", pgdv); + return; + } + + p4dp = p4d_offset(pgdp, addr); + p4d = p4dp_get(p4dp); + p4dv = p4d_val(p4d); + + if (!p4d_present(p4d) || p4d_leaf(p4d)) { + pr_alert("pgd:%08llx p4d:%08llx\n", pgdv, p4dv); + return; + } + + pudp = pud_offset(p4dp, addr); + pud = pudp_get(pudp); + pudv = pud_val(pud); + + if (!pud_present(pud) || pud_leaf(pud)) { + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx\n", pgdv, p4dv, pudv); + return; + } + + pmdp = pmd_offset(pudp, addr); + pmd = pmdp_get(pmdp); + pmdv = pmd_val(pmd); + + /* + * Dumping the PTE would be nice, but it's tricky with CONFIG_HIGHPTE, + * because the table should already be mapped by the caller and + * doing another map would be bad. print_bad_page_map() should + * already take care of printing the PTE. + */ + pr_alert("pgd:%08llx p4d:%08llx pud:%08llx pmd:%08llx\n", pgdv, + p4dv, pudv, pmdv); +} + +/* + * This function is called to print an error when a bad page table entry (e.g., + * corrupted page table entry) is found. For example, we might have a + * PFN-mapped pte in a region that doesn't allow it. + * + * The calling function must still handle the error. + * + * This function must be called during a proper page table walk, as it will + * re-walk the page table to dump information: the caller MUST prevent page + * table teardown (by holding mmap, vma or rmap lock) and MUST hold the leaf + * page table lock. + */ +static void print_bad_page_map(struct vm_area_struct *vma, + unsigned long addr, unsigned long long entry, struct page *page, + enum pgtable_level level) +{ + struct address_space *mapping; + pgoff_t index; + + if (is_bad_page_map_ratelimited()) + return; mapping = vma->vm_file ? 
vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s %s:%08llx", current->comm, + pgtable_level_to_str(level), entry); + __print_bad_page_map_pgtable(vma->vm_mm, addr); if (page) - dump_page(page, "bad pte"); + dump_page(page, "bad page map"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps mmap_prepare: %ps read_folio:%ps\n", @@ -549,6 +611,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } +#define print_bad_pte(vma, addr, pte, page) \ + print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) /* * vm_normal_page -- This function gets the "struct page" associated with a pte. From af38538801c6a97565b44700ee6695d7d60ad779 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:29 +0200 Subject: [PATCH 044/372] mm/memory: factor out common code from vm_normal_page_*() Let's reduce the code duplication and factor out the non-pte/pmd related magic into __vm_normal_page(). To keep it simpler, check the pfn against both zero folios, which shouldn't really make a difference. It's a good question if we can even hit the !CONFIG_ARCH_HAS_PTE_SPECIAL scenario in the PMD case in practice: but doesn't really matter, as it's now all unified in vm_normal_page_pfn(). Add kerneldoc for all involved functions. Note that, as a side product, we now: * Support the find_special_page special thingy also for PMD * Don't check for is_huge_zero_pfn() anymore if we have CONFIG_ARCH_HAS_PTE_SPECIAL and the PMD is not special. The VM_WARN_ON_ONCE would catch any abuse No functional change intended. Link: https://lkml.kernel.org/r/20250811112631.759341-10-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 208 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 120 insertions(+), 88 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index dc0107354d37..78af3f243cee 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -614,8 +614,14 @@ static void print_bad_page_map(struct vm_area_struct *vma, #define print_bad_pte(vma, addr, pte, page) \ print_bad_page_map(vma, addr, pte_val(pte), page, PGTABLE_LEVEL_PTE) -/* - * vm_normal_page -- This function gets the "struct page" associated with a pte. +/** + * __vm_normal_page() - Get the "struct page" associated with a page table entry. + * @vma: The VMA mapping the page table entry. + * @addr: The address where the page table entry is mapped. + * @pfn: The PFN stored in the page table entry. + * @special: Whether the page table entry is marked "special". 
+ * @level: The page table level for error reporting purposes only. + * @entry: The page table entry value for error reporting purposes only. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this @@ -628,10 +634,10 @@ static void print_bad_page_map(struct vm_area_struct *vma, * Selected page table walkers (such as GUP) can still identify mappings of the * shared zero folios and work with the underlying "struct page". * - * There are 2 broad cases. Firstly, an architecture may define a pte_special() - * pte bit, in which case this function is trivial. Secondly, an architecture - * may not have a spare pte bit, which requires a more complicated scheme, - * described below. + * There are 2 broad cases. Firstly, an architecture may define a "special" + * page table entry bit, such as pte_special(), in which case this function is + * trivial. Secondly, an architecture may not have a spare page table + * entry bit, which requires a more complicated scheme, described below. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). @@ -664,63 +670,94 @@ static void print_bad_page_map(struct vm_area_struct *vma, * don't have to follow the strict linearity rule of PFNMAP mappings in * order to support COWable mappings. * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ +static inline struct page *__vm_normal_page(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn, bool special, + unsigned long long entry, enum pgtable_level level) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { + if (unlikely(special)) { + if (vma->vm_ops && vma->vm_ops->find_special_page) + return vma->vm_ops->find_special_page(vma, addr); + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return NULL; + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) + return NULL; + + print_bad_page_map(vma, addr, entry, NULL, level); + return NULL; + } + /* + * With CONFIG_ARCH_HAS_PTE_SPECIAL, any special page table + * mappings (incl. shared zero folios) are marked accordingly. + */ + } else { + if (unlikely(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + /* If it has a "struct page", it's "normal". */ + if (!pfn_valid(pfn)) + return NULL; + } else { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; + + /* Only CoW'ed anon folios are "normal". */ + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)) + return NULL; + } + + if (unlikely(pfn > highest_memmap_pfn)) { + /* Corrupted page table entry. */ + print_bad_page_map(vma, addr, entry, NULL, level); + return NULL; + } + /* + * NOTE! We still have PageReserved() pages in the page tables. + * For example, VDSO mappings can cause them to exist. + */ + VM_WARN_ON_ONCE(is_zero_pfn(pfn) || is_huge_zero_pfn(pfn)); + return pfn_to_page(pfn); +} + +/** + * vm_normal_page() - Get the "struct page" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct page" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. 
Returns + * NULL if this is a "special" mapping. */ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - - if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { - if (likely(!pte_special(pte))) - goto check_pfn; - if (vma->vm_ops && vma->vm_ops->find_special_page) - return vma->vm_ops->find_special_page(vma, addr); - if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) - return NULL; - if (is_zero_pfn(pfn)) - return NULL; - - print_bad_pte(vma, addr, pte, NULL); - return NULL; - } - - /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ - - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - if (is_zero_pfn(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; - } - } - - if (is_zero_pfn(pfn)) - return NULL; - -check_pfn: - if (unlikely(pfn > highest_memmap_pfn)) { - print_bad_pte(vma, addr, pte, NULL); - return NULL; - } - - /* - * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. - */ -out: - VM_WARN_ON_ONCE(is_zero_pfn(pfn)); - return pfn_to_page(pfn); + return __vm_normal_page(vma, addr, pte_pfn(pte), pte_special(pte), + pte_val(pte), PGTABLE_LEVEL_PTE); } +/** + * vm_normal_folio() - Get the "struct folio" associated with a PTE + * @vma: The VMA mapping the @pte. + * @addr: The address where the @pte is mapped. + * @pte: The PTE. + * + * Get the "struct folio" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { @@ -732,42 +769,37 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/** + * vm_normal_page_pmd() - Get the "struct page" associated with a PMD + * @vma: The VMA mapping the @pmd. + * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct page" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { - unsigned long pfn = pmd_pfn(pmd); - - if (unlikely(pmd_special(pmd))) - return NULL; - - if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { - if (vma->vm_flags & VM_MIXEDMAP) { - if (!pfn_valid(pfn)) - return NULL; - goto out; - } else { - unsigned long off; - off = (addr - vma->vm_start) >> PAGE_SHIFT; - if (pfn == vma->vm_pgoff + off) - return NULL; - if (!is_cow_mapping(vma->vm_flags)) - return NULL; - } - } - - if (is_huge_zero_pfn(pfn)) - return NULL; - if (unlikely(pfn > highest_memmap_pfn)) - return NULL; - - /* - * NOTE! We still have PageReserved() pages in the page tables. - * eg. VDSO mappings can cause them to exist. - */ -out: - return pfn_to_page(pfn); + return __vm_normal_page(vma, addr, pmd_pfn(pmd), pmd_special(pmd), + pmd_val(pmd), PGTABLE_LEVEL_PMD); } +/** + * vm_normal_folio_pmd() - Get the "struct folio" associated with a PMD + * @vma: The VMA mapping the @pmd. 
+ * @addr: The address where the @pmd is mapped. + * @pmd: The PMD. + * + * Get the "struct folio" associated with a PTE. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct folio" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. + */ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { From 2db308160b5a191b494746fd167dbbaaead3fb26 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:30 +0200 Subject: [PATCH 045/372] mm: introduce and use vm_normal_page_pud() Let's introduce vm_normal_page_pud(), which ends up being fairly simple because of our new common helpers and there not being a PUD-sized zero folio. Use vm_normal_page_pud() in folio_walk_start() to resolve a TODO, structuring the code like the other (pmd/pte) cases. Defer introducing vm_normal_folio_pud() until really used. Note that we can so far get PUDs with hugetlb, daxfs and PFNMAP entries. Link: https://lkml.kernel.org/r/20250811112631.759341-11-david@redhat.com Reviewed-by: Wei Yang Reviewed-by: Oscar Salvador Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: David Vrabel Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/memory.c | 19 +++++++++++++++++++ mm/pagewalk.c | 26 +++++++++++++------------- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b626d1bacef5..8ca7d2fa7134 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2360,6 +2360,8 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); +struct page *vm_normal_page_pud(struct vm_area_struct *vma, unsigned long addr, + pud_t pud); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); diff --git a/mm/memory.c b/mm/memory.c index 78af3f243cee..6f806bf3cc99 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -809,6 +809,25 @@ struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, return page_folio(page); return NULL; } + +/** + * vm_normal_page_pud() - Get the "struct page" associated with a PUD + * @vma: The VMA mapping the @pud. + * @addr: The address where the @pud is mapped. + * @pud: The PUD. + * + * Get the "struct page" associated with a PUD. See __vm_normal_page() + * for details on "normal" and "special" mappings. + * + * Return: Returns the "struct page" if this is a "normal" mapping. Returns + * NULL if this is a "special" mapping. 
+ */ +struct page *vm_normal_page_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t pud) +{ + return __vm_normal_page(vma, addr, pud_pfn(pud), pud_special(pud), + pud_val(pud), PGTABLE_LEVEL_PUD); +} #endif /** diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 648038247a8d..c6753d370ff4 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -902,23 +902,23 @@ struct folio *folio_walk_start(struct folio_walk *fw, fw->pudp = pudp; fw->pud = pud; + if (pud_none(pud)) { + spin_unlock(ptl); + goto not_found; + } else if (pud_present(pud) && !pud_leaf(pud)) { + spin_unlock(ptl); + goto pmd_table; + } else if (pud_present(pud)) { + page = vm_normal_page_pud(vma, addr, pud); + if (page) + goto found; + } /* * TODO: FW_MIGRATION support for PUD migration entries * once there are relevant users. */ - if (!pud_present(pud) || pud_special(pud)) { - spin_unlock(ptl); - goto not_found; - } else if (!pud_leaf(pud)) { - spin_unlock(ptl); - goto pmd_table; - } - /* - * TODO: vm_normal_page_pud() will be handy once we want to - * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs. - */ - page = pud_page(pud); - goto found; + spin_unlock(ptl); + goto not_found; } pmd_table: From 4c89792ea0a224340ff198abc7caffa211baccd6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 11 Aug 2025 13:26:31 +0200 Subject: [PATCH 046/372] mm: rename vm_ops->find_special_page() to vm_ops->find_normal_page() ... and hide it behind a kconfig option. There is really no need for any !xen code to perform this check. The naming is a bit off: we want to find the "normal" page when a PTE was marked "special". So it's really not "finding a special" page. Improve the documentation, and add a comment in the code where XEN ends up performing the pte_mkspecial() through a hypercall. More details can be found in commit 923b2919e2c3 ("xen/gntdev: mark userspace PTEs as special on x86 PV guests"). Link: https://lkml.kernel.org/r/20250811112631.759341-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Reviewed-by: Lorenzo Stoakes Reviewed-by: Wei Yang Cc: David Vrabel Cc: Alistair Popple Cc: Al Viro Cc: Baolin Wang Cc: Barry Song Cc: Christian Brauner Cc: Christophe Leroy Cc: Dan Williams Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Jann Horn Cc: Juegren Gross Cc: Lance Yang Cc: Liam Howlett Cc: Madhavan Srinivasan Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Michal Hocko Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oleksandr Tyshchenko Cc: Ryan Roberts Cc: Stefano Stabellini Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/xen/Kconfig | 1 + drivers/xen/gntdev.c | 5 +++-- include/linux/mm.h | 18 +++++++++++++----- mm/Kconfig | 2 ++ mm/memory.c | 12 ++++++++++-- tools/testing/vma/vma_internal.h | 18 +++++++++++++----- 6 files changed, 42 insertions(+), 14 deletions(-) diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 24f485827e03..f9a35ed266ec 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -138,6 +138,7 @@ config XEN_GNTDEV depends on XEN default m select MMU_NOTIFIER + select FIND_NORMAL_PAGE help Allows userspace processes to use grants. 
diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 1f2160765618..26f13b37c78e 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -321,6 +321,7 @@ static int find_grant_ptes(pte_t *pte, unsigned long addr, void *data) BUG_ON(pgnr >= map->count); pte_maddr = arbitrary_virt_to_machine(pte).maddr; + /* Note: this will perform a pte_mkspecial() through the hypercall. */ gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags, map->grants[pgnr].ref, map->grants[pgnr].domid); @@ -528,7 +529,7 @@ static void gntdev_vma_close(struct vm_area_struct *vma) gntdev_put_map(priv, map); } -static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma, +static struct page *gntdev_vma_find_normal_page(struct vm_area_struct *vma, unsigned long addr) { struct gntdev_grant_map *map = vma->vm_private_data; @@ -539,7 +540,7 @@ static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma, static const struct vm_operations_struct gntdev_vmops = { .open = gntdev_vma_open, .close = gntdev_vma_close, - .find_special_page = gntdev_vma_find_special_page, + .find_normal_page = gntdev_vma_find_normal_page, }; /* ------------------------------------------------------------------ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 8ca7d2fa7134..3868ca1a25f9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -657,13 +657,21 @@ struct vm_operations_struct { struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif +#ifdef CONFIG_FIND_NORMAL_PAGE /* - * Called by vm_normal_page() for special PTEs to find the - * page for @addr. This is useful if the default behavior - * (using pte_page()) would not find the correct page. + * Called by vm_normal_page() for special PTEs in @vma at @addr. This + * allows for returning a "normal" page from vm_normal_page() even + * though the PTE indicates that the "struct page" either does not exist + * or should not be touched: "special". + * + * Do not add new users: this really only works when a "normal" page + * was mapped, but then the PTE got changed to something weird (+ + * marked special) that would not make pte_pfn() identify the originally + * inserted page. */ - struct page *(*find_special_page)(struct vm_area_struct *vma, - unsigned long addr); + struct page *(*find_normal_page)(struct vm_area_struct *vma, + unsigned long addr); +#endif /* CONFIG_FIND_NORMAL_PAGE */ }; #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/Kconfig b/mm/Kconfig index e443fe8cd6cf..59a04d0b2e27 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1381,6 +1381,8 @@ config PT_RECLAIM Note: now only empty user PTE page table pages will be reclaimed. +config FIND_NORMAL_PAGE + def_bool n source "mm/damon/Kconfig" diff --git a/mm/memory.c b/mm/memory.c index 6f806bf3cc99..002c28795d8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -639,6 +639,12 @@ static void print_bad_page_map(struct vm_area_struct *vma, * trivial. Secondly, an architecture may not have a spare page table * entry bit, which requires a more complicated scheme, described below. * + * With CONFIG_FIND_NORMAL_PAGE, we might have the "special" bit set on + * page table entries that actually map "normal" pages: however, that page + * cannot be looked up through the PFN stored in the page table entry, but + * instead will be looked up through vm_ops->find_normal_page(). So far, this + * only applies to PTEs. + * * A raw VM_PFNMAP mapping (ie. 
one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
@@ -679,8 +685,10 @@ static inline struct page *__vm_normal_page(struct vm_area_struct *vma,
 {
	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
		if (unlikely(special)) {
-			if (vma->vm_ops && vma->vm_ops->find_special_page)
-				return vma->vm_ops->find_special_page(vma, addr);
+#ifdef CONFIG_FIND_NORMAL_PAGE
+			if (vma->vm_ops && vma->vm_ops->find_normal_page)
+				return vma->vm_ops->find_normal_page(vma, addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
			if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
				return NULL;
			if (is_zero_pfn(pfn) || is_huge_zero_pfn(pfn))
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 3639aa8dd2b0..cb1c2a8afe26 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -467,13 +467,21 @@ struct vm_operations_struct {
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr, pgoff_t *ilx);
 #endif
+#ifdef CONFIG_FIND_NORMAL_PAGE
	/*
-	 * Called by vm_normal_page() for special PTEs to find the
-	 * page for @addr. This is useful if the default behavior
-	 * (using pte_page()) would not find the correct page.
+	 * Called by vm_normal_page() for special PTEs in @vma at @addr. This
+	 * allows for returning a "normal" page from vm_normal_page() even
+	 * though the PTE indicates that the "struct page" either does not exist
+	 * or should not be touched: "special".
+	 *
+	 * Do not add new users: this really only works when a "normal" page
+	 * was mapped, but then the PTE got changed to something weird (+
+	 * marked special) that would not make pte_pfn() identify the originally
+	 * inserted page.
	 */
-	struct page *(*find_special_page)(struct vm_area_struct *vma,
-					  unsigned long addr);
+	struct page *(*find_normal_page)(struct vm_area_struct *vma,
+					 unsigned long addr);
+#endif /* CONFIG_FIND_NORMAL_PAGE */
 };

 struct vm_unmapped_area_info {

From b912586ba7cffabcfc1af7ff0f0b1569a73b21f4 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav
Date: Mon, 11 Aug 2025 10:41:09 +0200
Subject: [PATCH 047/372] mm: rename huge_zero_page to huge_zero_folio

Patch series "add persistent huge zero folio support", v3.

Many places in the kernel need to zero out larger chunks, but the maximum
segment we can zero out at a time by ZERO_PAGE is limited by PAGE_SIZE.

This concern was raised during the review of adding Large Block Size
support to XFS[1][2].

This is especially annoying in block devices and filesystems where
multiple ZERO_PAGEs are attached to the bio in different bvecs. With
multipage bvec support in the block layer, it is much more efficient to
send out larger zero pages as part of a single bvec.

Some examples of places in the kernel where this could be useful:
- blkdev_issue_zero_pages()
- iomap_dio_zero()
- vmalloc.c:zero_iter()
- rxperf_process_call()
- fscrypt_zeroout_range_inline_crypt()
- bch2_checksum_update()
...

Usually huge_zero_folio is allocated on demand, and it will be deallocated
by the shrinker if there are no users of it left. At the moment, the
huge_zero_folio infrastructure's refcount is tied to the lifetime of the
process that created it. This might not work for the bio layer, as the
completions can be async and the process that created the huge_zero_folio
might no longer be alive.

And, one of the main points that came up during discussion is to have
something bigger than the zero page as a drop-in replacement.
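To make the lifetime problem concrete, this is roughly what the current
mm-tied interface demands of a caller (a minimal sketch, not taken from
any particular user; error handling omitted):

	/* The folio's lifetime is pinned to the mm that requested it. */
	struct folio *folio = mm_get_huge_zero_folio(current->mm);

	if (folio) {
		/* ... attach the folio to a bio and submit it ... */
	}
	/*
	 * The reference is dropped via mm_put_huge_zero_folio() when the
	 * mm goes away - potentially before an async bio completion has
	 * finished with the folio.
	 */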
Add a config option PERSISTENT_HUGE_ZERO_FOLIO that will always allocate
the huge_zero_folio, and disable the shrinker so that huge_zero_folio is
never freed.

This allows using the huge_zero_folio without having to pass any mm
struct, and does not tie the lifetime of the zero folio to anything,
making it a drop-in replacement for ZERO_PAGE.

I have converted blkdev_issue_zero_pages() as an example as part of this
series. I also noticed a close to 4% performance improvement just by
replacing ZERO_PAGE with the persistent huge_zero_folio.

I will send patches to individual subsystems using the huge_zero_folio
once this gets upstreamed.

[1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/
[2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/

As the transition already happened from exposing huge_zero_page to
huge_zero_folio, change the names of the shrinker and the other helper
functions to reflect that.

No functional changes.

Link: https://lkml.kernel.org/r/20250811084113.647267-1-kernel@pankajraghav.com
Link: https://lkml.kernel.org/r/20250811084113.647267-2-kernel@pankajraghav.com
Signed-off-by: Pankaj Raghav
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Zi Yan
Suggested-by: David Hildenbrand
Acked-by: David Hildenbrand
Reviewed-by: Hannes Reinecke
Cc: Baolin Wang
Cc: Christoph Hellwig
Cc: "Darrick J. Wong"
Cc: Dev Jain
Cc: Jens Axboe
Cc: Liam Howlett
Cc: Luis Chamberlain
Cc: Mariano Pache
Cc: Matthew Wilcox (Oracle)
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: "Ritesh Harjani (IBM)"
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Thomas Gleixner
Cc: Vlastimil Babka
Cc: Kiryl Shutsemau
Signed-off-by: Andrew Morton
---
 mm/huge_memory.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 58bac83e7fa3..3f0c8c2856d3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -207,7 +207,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
	return orders;
 }

-static bool get_huge_zero_page(void)
+static bool get_huge_zero_folio(void)
 {
	struct folio *zero_folio;
 retry:
@@ -237,7 +237,7 @@ retry:
	return true;
 }

-static void put_huge_zero_page(void)
+static void put_huge_zero_folio(void)
 {
	/*
	 * Counter should never go to zero here. Only shrinker can put
@@ -251,11 +251,11 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
		return READ_ONCE(huge_zero_folio);

-	if (!get_huge_zero_page())
+	if (!get_huge_zero_folio())
		return NULL;

	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
-		put_huge_zero_page();
+		put_huge_zero_folio();

	return READ_ONCE(huge_zero_folio);
 }
@@ -263,18 +263,18 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 void mm_put_huge_zero_folio(struct mm_struct *mm)
 {
	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
-		put_huge_zero_page();
+		put_huge_zero_folio();
 }

-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
-						 struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
+						  struct shrink_control *sc)
 {
	/* we can free zero page only if last reference remains */
	return atomic_read(&huge_zero_refcount) == 1 ?
HPAGE_PMD_NR : 0;
 }

-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
-						struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
+						 struct shrink_control *sc)
 {
	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
@@ -287,7 +287,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
	return 0;
 }

-static struct shrinker *huge_zero_page_shrinker;
+static struct shrinker *huge_zero_folio_shrinker;

 #ifdef CONFIG_SYSFS
 static ssize_t enabled_show(struct kobject *kobj,
@@ -849,8 +849,8 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)

 static int __init thp_shrinker_init(void)
 {
-	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
-	if (!huge_zero_page_shrinker)
+	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
+	if (!huge_zero_folio_shrinker)
		return -ENOMEM;

	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
@@ -858,13 +858,13 @@ static int __init thp_shrinker_init(void)
						 SHRINKER_NONSLAB,
						 "thp-deferred_split");
	if (!deferred_split_shrinker) {
-		shrinker_free(huge_zero_page_shrinker);
+		shrinker_free(huge_zero_folio_shrinker);
		return -ENOMEM;
	}

-	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
-	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
-	shrinker_register(huge_zero_page_shrinker);
+	huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
+	huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
+	shrinker_register(huge_zero_folio_shrinker);

	deferred_split_shrinker->count_objects = deferred_split_count;
	deferred_split_shrinker->scan_objects = deferred_split_scan;
@@ -875,7 +875,7 @@ static int __init thp_shrinker_init(void)

 static void __init thp_shrinker_exit(void)
 {
-	shrinker_free(huge_zero_page_shrinker);
+	shrinker_free(huge_zero_folio_shrinker);
	shrinker_free(deferred_split_shrinker);
 }

From 2843408ca971d1472a6a8b32ee4647f55ecab598 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav
Date: Mon, 11 Aug 2025 10:41:10 +0200
Subject: [PATCH 048/372] mm: rename MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO

As all the helper functions have been renamed from *_page to *_folio,
rename the MM flag from MMF_HUGE_ZERO_PAGE to MMF_HUGE_ZERO_FOLIO.

No functional changes.

Link: https://lkml.kernel.org/r/20250811084113.647267-3-kernel@pankajraghav.com
Signed-off-by: Pankaj Raghav
Reviewed-by: Lorenzo Stoakes
Reviewed-by: Zi Yan
Suggested-by: David Hildenbrand
Acked-by: David Hildenbrand
Reviewed-by: Hannes Reinecke
Cc: Baolin Wang
Cc: Christoph Hellwig
Cc: "Darrick J.
Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 2 +- mm/huge_memory.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3ed763e7ec6f..cf94df4955c7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1758,7 +1758,7 @@ enum { #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3f0c8c2856d3..2801ce9bbde9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -248,13 +248,13 @@ static void put_huge_zero_folio(void) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_folio()) return NULL; - if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (test_and_set_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); @@ -262,7 +262,7 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) void mm_put_huge_zero_folio(struct mm_struct *mm) { - if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) put_huge_zero_folio(); } From 2d8bd8049e89efe42a5397de4effd899e8dd2249 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 11 Aug 2025 10:41:11 +0200 Subject: [PATCH 049/372] mm: add persistent huge zero folio Many places in the kernel need to zero out larger chunks, but the maximum segment that can be zeroed out at a time by ZERO_PAGE is limited by PAGE_SIZE. This is especially annoying in block devices and filesystems where multiple ZERO_PAGEs are attached to the bio in different bvecs. With multipage bvec support in block layer, it is much more efficient to send out larger zero pages as a part of single bvec. This concern was raised during the review of adding Large Block Size support to XFS[1][2]. Usually huge_zero_folio is allocated on demand, and it will be deallocated by the shrinker if there are no users of it left. At moment, huge_zero_folio infrastructure refcount is tied to the process lifetime that created it. This might not work for bio layer as the completions can be async and the process that created the huge_zero_folio might no longer be alive. And, one of the main points that came up during discussion is to have something bigger than zero page as a drop-in replacement. Add a config option PERSISTENT_HUGE_ZERO_FOLIO that will result in allocating the huge zero folio during early init and never free the memory by disabling the shrinker. This makes using the huge_zero_folio without having to pass any mm struct and does not tie the lifetime of the zero folio to anything, making it a drop-in replacement for ZERO_PAGE. 
If PERSISTENT_HUGE_ZERO_FOLIO config option is enabled, then mm_get_huge_zero_folio() will simply return the allocated page instead of dynamically allocating a new PMD page. Use this option carefully in resource constrained systems as it uses one full PMD sized page for zeroing purposes. [1] https://lore.kernel.org/linux-xfs/20231027051847.GA7885@lst.de/ [2] https://lore.kernel.org/linux-xfs/ZitIK5OnR7ZNY0IG@infradead.org/ Link: https://lkml.kernel.org/r/20250811084113.647267-4-kernel@pankajraghav.com Signed-off-by: David Hildenbrand Signed-off-by: Pankaj Raghav Reviewed-by: Lorenzo Stoakes Co-developed-by: David Hildenbrand Reviewed-by: Hannes Reinecke Cc: Baolin Wang Cc: Christoph Hellwig Cc: "Darrick J. Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Zi Yan Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 16 ++++++++++++++++ mm/Kconfig | 16 ++++++++++++++++ mm/huge_memory.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 7748489fde1b..bd547857c6c1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -495,6 +495,17 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm); void mm_put_huge_zero_folio(struct mm_struct *mm); +static inline struct folio *get_persistent_huge_zero_folio(void) +{ + if (!IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return NULL; + + if (unlikely(!huge_zero_folio)) + return NULL; + + return huge_zero_folio; +} + static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); @@ -685,6 +696,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb, { return 0; } + +static inline struct folio *get_persistent_huge_zero_folio(void) +{ + return NULL; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, diff --git a/mm/Kconfig b/mm/Kconfig index 59a04d0b2e27..4108bcd96784 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -823,6 +823,22 @@ config ARCH_WANT_GENERAL_HUGETLB config ARCH_WANTS_THP_SWAP def_bool n +config PERSISTENT_HUGE_ZERO_FOLIO + bool "Allocate a PMD sized folio for zeroing" + depends on TRANSPARENT_HUGEPAGE + help + Enable this option to reduce the runtime refcounting overhead + of the huge zero folio and expand the places in the kernel + that can use huge zero folios. For instance, block I/O benefits + from access to large folios for zeroing memory. + + With this option enabled, the huge zero folio is allocated + once and never freed. One full huge page's worth of memory shall + be used. + + Say Y if your system has lots of memory. Say N if you are + memory constrained. 
+ config MM_ID def_bool n diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2801ce9bbde9..b8bb078a1a34 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -248,6 +248,9 @@ static void put_huge_zero_folio(void) struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) { + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return huge_zero_folio; + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) return READ_ONCE(huge_zero_folio); @@ -262,6 +265,9 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) void mm_put_huge_zero_folio(struct mm_struct *mm) { + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) + return; + if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) put_huge_zero_folio(); } @@ -849,16 +855,34 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj) static int __init thp_shrinker_init(void) { - huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); - if (!huge_zero_folio_shrinker) - return -ENOMEM; - deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB, "thp-deferred_split"); - if (!deferred_split_shrinker) { - shrinker_free(huge_zero_folio_shrinker); + if (!deferred_split_shrinker) + return -ENOMEM; + + deferred_split_shrinker->count_objects = deferred_split_count; + deferred_split_shrinker->scan_objects = deferred_split_scan; + shrinker_register(deferred_split_shrinker); + + if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) { + /* + * Bump the reference of the huge_zero_folio and do not + * initialize the shrinker. + * + * huge_zero_folio will always be NULL on failure. We assume + * that get_huge_zero_folio() will most likely not fail as + * thp_shrinker_init() is invoked early on during boot. + */ + if (!get_huge_zero_folio()) + pr_warn("Allocating persistent huge zero folio failed\n"); + return 0; + } + + huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero"); + if (!huge_zero_folio_shrinker) { + shrinker_free(deferred_split_shrinker); return -ENOMEM; } @@ -866,10 +890,6 @@ static int __init thp_shrinker_init(void) huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan; shrinker_register(huge_zero_folio_shrinker); - deferred_split_shrinker->count_objects = deferred_split_count; - deferred_split_shrinker->scan_objects = deferred_split_scan; - shrinker_register(deferred_split_shrinker); - return 0; } From 415a0fd62f1899fe2bb81d661e427194b1c97201 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 11 Aug 2025 10:41:12 +0200 Subject: [PATCH 050/372] mm: add largest_zero_folio() routine The callers of mm_get_huge_zero_folio() have access to a mm struct and the lifetime of the huge_zero_folio is tied to the lifetime of the mm struct. largest_zero_folio() will give access to huge_zero_folio when PERSISTENT_HUGE_ZERO_FOLIO config option is enabled for callers that do not want to tie the lifetime to a mm struct. This is very useful for filesystem and block layers where the request completions can be async and there is no guarantee on the mm struct lifetime. This function will return a ZERO_PAGE folio if PERSISTENT_HUGE_ZERO_FOLIO is disabled or if we failed to allocate a huge_zero_folio during early init. Link: https://lkml.kernel.org/r/20250811084113.647267-5-kernel@pankajraghav.com Signed-off-by: David Hildenbrand Signed-off-by: Pankaj Raghav Reviewed-by: Lorenzo Stoakes Co-developed-by: David Hildenbrand Reviewed-by: Hannes Reinecke Cc: Baolin Wang Cc: Christoph Hellwig Cc: "Darrick J. 
Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Zi Yan Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bd547857c6c1..14d424830fa8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -714,4 +714,26 @@ static inline int split_folio_to_order(struct folio *folio, int new_order) return split_folio_to_list_to_order(folio, NULL, new_order); } +/** + * largest_zero_folio - Get the largest zero size folio available + * + * This function shall be used when mm_get_huge_zero_folio() cannot be + * used as there is no appropriate mm lifetime to tie the huge zero folio + * from the caller. + * + * Deduce the size of the folio with folio_size instead of assuming the + * folio size. + * + * Return: pointer to PMD sized zero folio if CONFIG_PERSISTENT_HUGE_ZERO_FOLIO + * is enabled or a single page sized zero folio + */ +static inline struct folio *largest_zero_folio(void) +{ + struct folio *folio = get_persistent_huge_zero_folio(); + + if (folio) + return folio; + + return page_folio(ZERO_PAGE(0)); +} #endif /* _LINUX_HUGE_MM_H */ From ea5e101fb6018486ae706b905665b2fe63f9c979 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Mon, 11 Aug 2025 10:41:13 +0200 Subject: [PATCH 051/372] block: use largest_zero_folio in __blkdev_issue_zero_pages() Use largest_zero_folio() in __blkdev_issue_zero_pages(). On systems with CONFIG_PERSISTENT_HUGE_ZERO_FOLIO enabled, we will end up sending larger bvecs instead of multiple small ones. Noticed a 4% increase in performance on a commercial NVMe SSD which does not support OP_WRITE_ZEROES. The device's MDTS was 128K. The performance gains might be bigger if the device supports bigger MDTS. Link: https://lkml.kernel.org/r/20250811084113.647267-6-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Acked-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Hannes Reinecke Cc: Kiryl Shutsemau Cc: Baolin Wang Cc: Christoph Hellwig Cc: "Darrick J. 
Wong" Cc: Dev Jain Cc: Jens Axboe Cc: Liam Howlett Cc: Luis Chamberalin Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Mike Rapoport Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Thomas Gleinxer Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- block/blk-lib.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index 4c9f20a689f7..3030a772d3aa 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -196,6 +196,8 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned int flags) { + struct folio *zero_folio = largest_zero_folio(); + while (nr_sects) { unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects); struct bio *bio; @@ -208,15 +210,14 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev, break; do { - unsigned int len, added; + unsigned int len; - len = min_t(sector_t, - PAGE_SIZE, nr_sects << SECTOR_SHIFT); - added = bio_add_page(bio, ZERO_PAGE(0), len, 0); - if (added < len) + len = min_t(sector_t, folio_size(zero_folio), + nr_sects << SECTOR_SHIFT); + if (!bio_add_folio(bio, zero_folio, len, 0)) break; - nr_sects -= added >> SECTOR_SHIFT; - sector += added >> SECTOR_SHIFT; + nr_sects -= len >> SECTOR_SHIFT; + sector += len >> SECTOR_SHIFT; } while (nr_sects); *biop = bio_chain_and_submit(*biop, bio); From be564840bbc2bdd803794a7c1a3b5195a901b0d4 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 11 Aug 2025 11:25:08 +0300 Subject: [PATCH 052/372] kho: allow scratch areas with zero size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "kho: fixes and cleanups", v3. These are small KHO and KHO test fixes and cleanups. This patch (of 3): Parsing of kho_scratch parameter treats zero size as an invalid value, although it should be fine for user to request zero sized scratch area for some types if scratch memory, when for example there is no need to create scratch area in the low memory. Treat zero as a valid value for a scratch area size but reject kho_scratch parameter that defines no scratch memory at all. 
Link: https://lkml.kernel.org/r/20250811082510.4154080-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20250811082510.4154080-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Pasha Tatashin Cc: Shuah Khan Cc: Thomas Weißschuh Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index ecd1ac210dbd..1a65419e3756 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -405,6 +405,7 @@ static int __init kho_parse_scratch_size(char *p) { size_t len; unsigned long sizes[3]; + size_t total_size = 0; int i; if (!p) @@ -441,11 +442,15 @@ static int __init kho_parse_scratch_size(char *p) } sizes[i] = memparse(p, &endp); - if (!sizes[i] || endp == p) + if (endp == p) return -EINVAL; p = endp; + total_size += sizes[i]; } + if (!total_size) + return -EINVAL; + scratch_size_lowmem = sizes[0]; scratch_size_global = sizes[1]; scratch_size_pernode = sizes[2]; From 950c31e8f1280c45697e9353e2d4973ff79312bf Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 11 Aug 2025 11:25:09 +0300 Subject: [PATCH 053/372] lib/test_kho: fixes for error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update kho_test_save() so that folios array won't be freed when returning from the function and the fdt will be freed on error * Reset state->nr_folios to 0 in kho_test_generate_data() on error * Simplify allocation of folios info in fdt. Link: https://lkml.kernel.org/r/20250811082510.4154080-3-rppt@kernel.org Fixes: b753522bed0b ("kho: add test for kexec handover") Signed-off-by: Mike Rapoport (Microsoft) Reported-by: Pratyush Yadav Closes: https://lore.kernel.org/all/mafs0zfcjcepf.fsf@kernel.org Reviewed-by: Pratyush Yadav Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Pasha Tatashin Cc: Shuah Khan Cc: Thomas Weißschuh Signed-off-by: Andrew Morton --- lib/test_kho.c | 52 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/lib/test_kho.c b/lib/test_kho.c index c2eb899c3b45..fe8504e3407b 100644 --- a/lib/test_kho.c +++ b/lib/test_kho.c @@ -67,13 +67,20 @@ static struct notifier_block kho_test_nb = { static int kho_test_save_data(struct kho_test_state *state, void *fdt) { - phys_addr_t *folios_info __free(kvfree) = NULL; + phys_addr_t *folios_info; int err = 0; - folios_info = kvmalloc_array(state->nr_folios, sizeof(*folios_info), - GFP_KERNEL); - if (!folios_info) - return -ENOMEM; + err |= fdt_begin_node(fdt, "data"); + err |= fdt_property(fdt, "nr_folios", &state->nr_folios, + sizeof(state->nr_folios)); + err |= fdt_property_placeholder(fdt, "folios_info", + state->nr_folios * sizeof(*folios_info), + (void **)&folios_info); + err |= fdt_property(fdt, "csum", &state->csum, sizeof(state->csum)); + err |= fdt_end_node(fdt); + + if (err) + return err; for (int i = 0; i < state->nr_folios; i++) { struct folio *folio = state->folios[i]; @@ -83,17 +90,9 @@ static int kho_test_save_data(struct kho_test_state *state, void *fdt) err = kho_preserve_folio(folio); if (err) - return err; + break; } - err |= fdt_begin_node(fdt, "data"); - err |= fdt_property(fdt, "nr_folios", &state->nr_folios, - sizeof(state->nr_folios)); - err |= fdt_property(fdt, "folios_info", folios_info, - state->nr_folios * sizeof(*folios_info)); - err |= fdt_property(fdt, "csum", &state->csum, 
sizeof(state->csum)); - err |= fdt_end_node(fdt); - return err; } @@ -140,7 +139,10 @@ static int kho_test_generate_data(struct kho_test_state *state) unsigned int size; void *addr; - /* cap allocation so that we won't exceed max_mem */ + /* + * Since get_order() rounds up, make sure that actual + * allocation is smaller so that we won't exceed max_mem + */ if (alloc_size + (PAGE_SIZE << order) > max_mem) { order = get_order(max_mem - alloc_size); if (order) @@ -165,13 +167,14 @@ static int kho_test_generate_data(struct kho_test_state *state) err_free_folios: for (int i = 0; i < state->nr_folios; i++) folio_put(state->folios[i]); + state->nr_folios = 0; return -ENOMEM; } static int kho_test_save(void) { struct kho_test_state *state = &kho_test_state; - struct folio **folios __free(kvfree) = NULL; + struct folio **folios; unsigned long max_nr; int err; @@ -185,13 +188,23 @@ static int kho_test_save(void) err = kho_test_generate_data(state); if (err) - return err; + goto err_free_folios; err = kho_test_prepare_fdt(state); if (err) - return err; + goto err_free_folios; - return register_kho_notifier(&kho_test_nb); + err = register_kho_notifier(&kho_test_nb); + if (err) + goto err_free_fdt; + + return 0; + +err_free_fdt: + folio_put(state->fdt); +err_free_folios: + kvfree(folios); + return err; } static int kho_test_restore_data(const void *fdt, int node) @@ -291,6 +304,7 @@ static void kho_test_cleanup(void) folio_put(kho_test_state.folios[i]); kvfree(kho_test_state.folios); + folio_put(kho_test_state.fdt); } static void __exit kho_test_exit(void) From 801295be015fc53557a6ef221ab74df6950d1b80 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Mon, 11 Aug 2025 11:25:10 +0300 Subject: [PATCH 054/372] selftest/kho: update generation of initrd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use nolibc include directory rather than include a cumulative nolibc.h on the compiler command line and replace use of 'sudo cpio' with usr/gen_init_cpio. 
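For reference, gen_init_cpio consumes a plain-text description of the
archive contents instead of a populated directory tree, which is why no
'sudo mknod' is needed any more. The entries below illustrate the format
only; they are not the exact list the script generates:

	# type  name          mode  uid  gid  [extra args]
	dir     /dev          0755  0    0
	nod     /dev/console  0600  0    0    c 5 1
	file    /init         /path/to/init   0755 0 0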
While on it fix spelling of KHO_FINALIZE Link: https://lkml.kernel.org/r/20250811082510.4154080-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Suggested-by: Thomas Weißschuh Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Pasha Tatashin Cc: Pratyush Yadav Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/kho/init.c | 13 ++++--------- tools/testing/selftests/kho/vmtest.sh | 26 ++++++++++++++------------ 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/kho/init.c b/tools/testing/selftests/kho/init.c index 8034e24c6bf6..6d9e91d55d68 100644 --- a/tools/testing/selftests/kho/init.c +++ b/tools/testing/selftests/kho/init.c @@ -1,22 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 -#ifndef NOLIBC -#include #include #include #include -#include +#include #include #include -#endif +#include /* from arch/x86/include/asm/setup.h */ #define COMMAND_LINE_SIZE 2048 -/* from include/linux/kexex.h */ -#define KEXEC_FILE_NO_INITRAMFS 0x00000004 - -#define KHO_FINILIZE "/debugfs/kho/out/finalize" +#define KHO_FINALIZE "/debugfs/kho/out/finalize" #define KERNEL_IMAGE "/kernel" static int mount_filesystems(void) @@ -32,7 +27,7 @@ static int kho_enable(void) const char enable[] = "1"; int fd; - fd = open(KHO_FINILIZE, O_RDWR); + fd = open(KHO_FINALIZE, O_RDWR); if (fd < 0) return -1; diff --git a/tools/testing/selftests/kho/vmtest.sh b/tools/testing/selftests/kho/vmtest.sh index ec70a17bd476..3f6c17166846 100755 --- a/tools/testing/selftests/kho/vmtest.sh +++ b/tools/testing/selftests/kho/vmtest.sh @@ -10,7 +10,6 @@ kernel_dir=$(realpath "$test_dir/../../../..") tmp_dir=$(mktemp -d /tmp/kho-test.XXXXXXXX) headers_dir="$tmp_dir/usr" -initrd_dir="$tmp_dir/initrd" initrd="$tmp_dir/initrd.cpio" source "$test_dir/../kselftest/ktap_helpers.sh" @@ -81,19 +80,22 @@ EOF function mkinitrd() { local kernel=$1 - mkdir -p "$initrd_dir"/{dev,debugfs,proc} - sudo mknod "$initrd_dir/dev/console" c 5 1 + "$CROSS_COMPILE"gcc -s -static -Os -nostdinc -nostdlib \ + -fno-asynchronous-unwind-tables -fno-ident \ + -I "$headers_dir/include" \ + -I "$kernel_dir/tools/include/nolibc" \ + -o "$tmp_dir/init" "$test_dir/init.c" - "$CROSS_COMPILE"gcc -s -static -Os -nostdinc -I"$headers_dir/include" \ - -fno-asynchronous-unwind-tables -fno-ident -nostdlib \ - -include "$test_dir/../../../include/nolibc/nolibc.h" \ - -o "$initrd_dir/init" "$test_dir/init.c" \ + cat > "$tmp_dir/cpio_list" </dev/null - find . | cpio -H newc --create > "$initrd" 2>/dev/null - popd &>/dev/null + "$build_dir/usr/gen_init_cpio" "$tmp_dir/cpio_list" > "$initrd" } function run_qemu() { From 10725cd2b09afe512ff60df651b9dd0e11720801 Mon Sep 17 00:00:00 2001 From: Sang-Heon Jeon Date: Sun, 10 Aug 2025 21:43:54 +0900 Subject: [PATCH 055/372] selftests/damon: test no-op commit broke DAMON status Add test to verify that DAMON status is not changed after a no-op commit. 
[ekffu200098@gmail.com: change wrong json.dump usage to json.dumps] Link: https://lkml.kernel.org/r/20250816014033.190451-1-ekffu200098@gmail.com Link: https://lkml.kernel.org/r/20250810124354.16456-1-ekffu200098@gmail.com Signed-off-by: Sang-Heon Jeon Reviewed-by: SeongJae Park Cc: Honggyu Kim Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + .../damon/sysfs_no_op_commit_break.py | 72 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100755 tools/testing/selftests/damon/sysfs_no_op_commit_break.py diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 9a3499827d4b..029de547f31c 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -18,6 +18,7 @@ TEST_PROGS += reclaim.sh lru_sort.sh TEST_PROGS += sysfs_update_removed_scheme_dir.sh TEST_PROGS += sysfs_update_schemes_tried_regions_hang.py TEST_PROGS += sysfs_memcg_path_leak.sh +TEST_PROGS += sysfs_no_op_commit_break.py EXTRA_CLEAN = __pycache__ diff --git a/tools/testing/selftests/damon/sysfs_no_op_commit_break.py b/tools/testing/selftests/damon/sysfs_no_op_commit_break.py new file mode 100755 index 000000000000..2c65cffe6b54 --- /dev/null +++ b/tools/testing/selftests/damon/sysfs_no_op_commit_break.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +import json +import os +import subprocess +import sys + +import _damon_sysfs + +def dump_damon_status_dict(pid): + try: + subprocess.check_output(['which', 'drgn'], stderr=subprocess.DEVNULL) + except: + return None, 'drgn not found' + file_dir = os.path.dirname(os.path.abspath(__file__)) + dump_script = os.path.join(file_dir, 'drgn_dump_damon_status.py') + rc = subprocess.call(['drgn', dump_script, pid, 'damon_dump_output'], + stderr=subprocess.DEVNULL) + + if rc != 0: + return None, f'drgn fail: return code({rc})' + try: + with open('damon_dump_output', 'r') as f: + return json.load(f), None + except Exception as e: + return None, 'json.load fail (%s)' % e + +def main(): + kdamonds = _damon_sysfs.Kdamonds( + [_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + schemes=[_damon_sysfs.Damos( + ops_filters=[ + _damon_sysfs.DamosFilter( + type_='anon', + matching=True, + allow=True, + ) + ] + )], + )])] + ) + + err = kdamonds.start() + if err is not None: + print('kdamond start failed: %s' % err) + exit(1) + + before_commit_status, err = \ + dump_damon_status_dict(kdamonds.kdamonds[0].pid) + if err is not None: + print('before-commit status dump failed: %s' % err) + exit(1) + + kdamonds.kdamonds[0].commit() + + after_commit_status, err = \ + dump_damon_status_dict(kdamonds.kdamonds[0].pid) + if err is not None: + print('after-commit status dump failed: %s' % err) + exit(1) + + if before_commit_status != after_commit_status: + print(f'before: {json.dumps(before_commit_status, indent=2)}') + print(f'after: {json.dumps(after_commit_status, indent=2)}') + exit(1) + + kdamonds.stop() + +if __name__ == '__main__': + main() From 4a12633e87abec277a9911614d4aee3ef66b9c4c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 9 Aug 2025 19:42:09 +0000 Subject: [PATCH 056/372] selftests/mm: do check_huge_anon() with a number been passed in Currently it hard codes the number of hugepage to check for check_huge_anon(), but it would be more reasonable to do the check based on a number passed in. Pass in the hugepage number and do the check based on it. 
Link: https://lkml.kernel.org/r/20250809194209.30484-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang
Reviewed-by: Donet Tom
Reviewed-by: Baolin Wang
Acked-by: David Hildenbrand
Reviewed-by: Zi Yan
Reviewed-by: Vishal Moola (Oracle)
Reviewed-by: wang lian
Cc: Dev Jain
Cc: Lorenzo Stoakes
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/mm/split_huge_page_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 44a3f8a58806..bf40e6b121ab 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -111,7 +111,7 @@ static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hp
	unsigned long rss_anon_before, rss_anon_after;
	size_t i;

-	if (!check_huge_anon(one_page, 4, pmd_pagesize))
+	if (!check_huge_anon(one_page, nr_hpages, pmd_pagesize))
		ksft_exit_fail_msg("No THP is allocated\n");

	rss_anon_before = rss_anon();

From bb6525f2f8c41e89ba3fc506bc1705c68cf845ae Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes
Date: Tue, 12 Aug 2025 16:44:10 +0100
Subject: [PATCH 057/372] mm: add bitmap mm->flags field

Patch series "mm: make mm->flags a bitmap and 64-bit on all arches".

We are currently in the bizarre situation where we are constrained on the
number of flags we can set in an mm_struct based on whether this is a
32-bit or 64-bit kernel. This is because mm->flags is an unsigned long
field, which is 32-bits on a 32-bit system and 64-bits on a 64-bit
system.

In order to keep things functional across both architectures, we do not
permit mm flag bits to be set above flag 31 (i.e. the 32nd bit).

This is a silly situation, especially given how profligate we are in
storing metadata in mm_struct, so let's convert mm->flags into a bitmap
and allow ourselves as many bits as we like.

In order to execute this change, we introduce a new opaque type -
mm_flags_t - which wraps a bitmap. We go further and mark the bitmap
field __private, which forces users to have to use accessors, which
allows us to enforce atomicity rules around mm->flags (except on those
occasions they are not required - fork, etc.) and makes it far easier to
keep track of how mm flags are being utilised.

In order to implement this change sensibly and in an iterative way, we
start by introducing the type with the same bitsize as the current mm
flags (system word size) and place it in union with mm->flags. We are
then able to gradually update users as we go without being forced to do
everything in a single patch.

In the course of working on this series I noticed the MMF_* flag masks
encounter a sign extension bug that, due to the 32-bit limit on mm->flags
thus far, has not caused any issues in practice, but required fixing for
this series.

We must make special dispensation for two cases - coredump and
initialisation on fork, both of which use masks extensively.

Since coredump flags are set in stone, we can safely assume they will
remain in the first 32-bits of the flags. We therefore provide special
non-atomic accessors for this case that access the first system word of
flags, keeping everything there essentially the same.

For mm->flags initialisation on fork, we adjust the logic to ensure all
bits are cleared correctly, and then adjust the existing initialisation
logic, dubbing the implementation utilising flags as legacy.
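For illustration, the conversion pattern this series applies to callers
looks like this (a sketch using flags and helpers that appear in later
patches of the series):

	/* Before: open-coded atomic bitops directly on mm->flags. */
	if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
		__khugepaged_enter(mm);
	set_bit(MMF_HAS_PINNED, &mm->flags);

	/* After: the bitmap is opaque, accessed via mm_flags_*() helpers. */
	if (mm_flags_test(MMF_VM_HUGEPAGE, mm))
		__khugepaged_enter(mm);
	mm_flags_set(MMF_HAS_PINNED, mm);

The legacy fork path, meanwhile, can keep initialising the first word
directly through a non-atomic helper such as __mm_flags_set_word(),
introduced below.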
This means we get the same fast operations as we do now, but in future we
can also choose to update the forking logic to additionally propagate
flags beyond 32-bits across fork.

With this change in place we can, in future, decide to have as many bits
as we please. Since the size of the bitmap will scale in system word
multiples, there should be no issues with changes in alignment in
mm_struct.

Additionally, the really sensitive field (mmap_lock) is located prior to
the flags field so this should have no impact on that either.

This patch (of 10):

We are currently in the bizarre situation where we are constrained on the
number of flags we can set in an mm_struct based on whether this is a
32-bit or 64-bit kernel. This is because mm->flags is an unsigned long
field, which is 32-bits on a 32-bit system and 64-bits on a 64-bit
system.

In order to keep things functional across both architectures, we do not
permit mm flag bits to be set above flag 31 (i.e. the 32nd bit).

This is a silly situation, especially given how profligate we are in
storing metadata in mm_struct, so let's convert mm->flags into a bitmap
and allow ourselves as many bits as we like.

To keep things manageable, firstly we introduce the bitmap at a system
word size as a new field mm->_flags, in union. This means the new bitmap
mm->_flags is bitwise exactly identical to the existing mm->flags field.

We have an opportunity to also introduce some type safety here, so let's
wrap the mm flags field as a struct and declare it as an mm_flags_t
typedef to keep it consistent with vm_flags_t for VMAs.

We make the internal field privately accessible, in order to force the
use of helper functions so we can enforce that accesses are bitwise as
required. We therefore introduce accessors prefixed with mm_flags_*() for
callers to use. We place the bit parameter first so as to match the
parameter ordering of the *_bit() functions.

Having this temporary union arrangement allows us to incrementally swap
over users of mm->flags patch-by-patch rather than having to do
everything in one fell swoop.

[lorenzo.stoakes@oracle.com: place __private in correct place, const-ify __mm_flags_get_word]
Link: https://lkml.kernel.org/r/d4ba117d-6234-4069-b871-254d152d7d21@lucifer.local
Link: https://lkml.kernel.org/r/cover.1755012943.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/9de8dfd9de8c95cd31622d6e52051ba0d1848f5a.1755012943.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes
Reviewed-by: Liam R. Howlett
Acked-by: David Hildenbrand
Cc: Adrian Hunter
Cc: Alexander Gordeev
Cc: Alexander Shishkin
Cc: Al Viro
Cc: Andreas Larsson
Cc: Andy Lutomirski
Cc: Arnaldo Carvalho de Melo
Cc: Baolin Wang
Cc: Barry Song
Cc: Ben Segall
Cc: Borislav Petkov
Cc: Chengming Zhou
Cc: Christian Borntraeger
Cc: Christian Brauner
Cc: David Rientjes
Cc: David S. Miller
Cc: Dev Jain
Cc: Dietmar Eggemann
Cc: Gerald Schaefer
Cc: Heiko Carstens
Cc: "H.
Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3868ca1a25f9..4ed4a0b9dad6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,6 +34,8 @@ #include #include #include +#include +#include struct mempolicy; struct anon_vma; @@ -720,6 +722,36 @@ static inline void assert_fault_locked(struct vm_fault *vmf) } #endif /* CONFIG_PER_VMA_LOCK */ +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) +{ + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) +{ + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_set(int flag, struct mm_struct *mm) +{ + set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_clear(int flag, struct mm_struct *mm) +{ + clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); +} + +static inline void mm_flags_clear_all(struct mm_struct *mm) +{ + bitmap_zero(ACCESS_PRIVATE(&mm->_flags, __mm_flags), NUM_MM_FLAG_BITS); +} + extern const struct vm_operations_struct vma_dummy_vm_ops; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cf94df4955c7..0e001dbad455 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -927,6 +928,15 @@ struct mm_cid { }; #endif +/* + * Opaque type representing current mm_struct flag state. Must be accessed via + * mm_flags_xxx() helper functions. + */ +#define NUM_MM_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); +} __private mm_flags_t; + struct kioctx_table; struct iommu_mm_data; struct mm_struct { @@ -1109,7 +1119,11 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - unsigned long flags; /* Must use atomic bitops to access */ + /* Temporary union while we convert users to mm_flags_t. */ + union { + unsigned long flags; /* Must use atomic bitops to access */ + mm_flags_t _flags; /* Must use mm_flags_* helpers to access */ + }; #ifdef CONFIG_AIO spinlock_t ioctx_lock; @@ -1219,6 +1233,28 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +/* Set the first system word of mm flags, non-atomically. 
*/ +static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + + bitmap_copy(bitmap, &value, BITS_PER_LONG); +} + +/* Obtain a read-only view of the bitmap. */ +static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) +{ + return (const unsigned long *)ACCESS_PRIVATE(&mm->_flags, __mm_flags); +} + +/* Read the first system word of mm flags, non-atomically. */ +static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) +{ + const unsigned long *bitmap = __mm_flags_get_bitmap(mm); + + return bitmap_read(bitmap, 0, BITS_PER_LONG); +} + #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; From 12e423ba4eaed7b1561b677d32e6599f932d03db Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:11 +0100 Subject: [PATCH 058/372] mm: convert core mm to mm_flags_*() accessors As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. This will result in the debug output being that of a bitmap output, which will result in a minor change here, but since this is for debug only, this should have no bearing. Otherwise, no functional changes intended. [akpm@linux-foundation.org: fix typo in comment]Link: https://lkml.kernel.org/r/1eb2266f4408798a55bda00cb04545a3203aa572.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 +- include/linux/khugepaged.h | 6 ++++-- include/linux/ksm.h | 6 +++--- include/linux/mm.h | 2 +- include/linux/mman.h | 2 +- include/linux/oom.h | 2 +- mm/debug.c | 4 ++-- mm/gup.c | 10 +++++----- mm/huge_memory.c | 8 ++++---- mm/khugepaged.c | 10 +++++----- mm/ksm.c | 32 ++++++++++++++++---------------- mm/mmap.c | 8 ++++---- mm/oom_kill.c | 26 +++++++++++++------------- mm/util.c | 6 +++--- 14 files changed, 63 insertions(+), 61 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 14d424830fa8..84b7eebe0d68 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -327,7 +327,7 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, * example, s390 kvm. 
*/ return (vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags); + mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index ff6120463745..eb1946a70cff 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -2,6 +2,8 @@ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H +#include + extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; @@ -20,13 +22,13 @@ extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { - if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, oldmm)) __khugepaged_enter(mm); } static inline void khugepaged_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + if (mm_flags_test(MMF_VM_HUGEPAGE, mm)) __khugepaged_exit(mm); } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/include/linux/ksm.h b/include/linux/ksm.h index c17b955e7b0b..22e67ca7cba3 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -56,13 +56,13 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ - if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return __ksm_enter(mm); return 0; @@ -70,7 +70,7 @@ static inline int ksm_execve(struct mm_struct *mm) static inline void ksm_exit(struct mm_struct *mm) { - if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGEABLE, mm)) __ksm_exit(mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ed4a0b9dad6..34311ebe62cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1949,7 +1949,7 @@ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); - if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm)) return false; return folio_maybe_dma_pinned(folio); diff --git a/include/linux/mman.h b/include/linux/mman.h index de9e8e6229a4..0ba8a7e8b90a 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -201,7 +201,7 @@ static inline bool arch_memory_deny_write_exec_supported(void) static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ - if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny. 
*/ diff --git a/include/linux/oom.h b/include/linux/oom.h index 1e0fc6931ce9..7b02bc1d0a7e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -91,7 +91,7 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) */ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) { - if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags))) + if (unlikely(mm_flags_test(MMF_UNSTABLE, mm))) return VM_FAULT_SIGBUS; return 0; } diff --git a/mm/debug.c b/mm/debug.c index b4388f4dcd4d..64ddb0c4b4be 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,7 +182,7 @@ void dump_mm(const struct mm_struct *mm) "start_code %lx end_code %lx start_data %lx end_data %lx\n" "start_brk %lx brk %lx start_stack %lx\n" "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" - "binfmt %px flags %lx\n" + "binfmt %px flags %*pb\n" #ifdef CONFIG_AIO "ioctx_table %px\n" #endif @@ -211,7 +211,7 @@ void dump_mm(const struct mm_struct *mm) mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, - mm->binfmt, mm->flags, + mm->binfmt, NUM_MM_FLAG_BITS, __mm_flags_get_bitmap(mm), #ifdef CONFIG_AIO mm->ioctx_table, #endif diff --git a/mm/gup.c b/mm/gup.c index adffe663594d..331d22bf7b2d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -475,10 +475,10 @@ EXPORT_SYMBOL_GPL(unpin_folios); * lifecycle. Avoid setting the bit unless necessary, or it might cause write * cache bouncing on large SMP machines for concurrent pinned gups. */ -static inline void mm_set_has_pinned_flag(unsigned long *mm_flags) +static inline void mm_set_has_pinned_flag(struct mm_struct *mm) { - if (!test_bit(MMF_HAS_PINNED, mm_flags)) - set_bit(MMF_HAS_PINNED, mm_flags); + if (!mm_flags_test(MMF_HAS_PINNED, mm)) + mm_flags_set(MMF_HAS_PINNED, mm); } #ifdef CONFIG_MMU @@ -1693,7 +1693,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, mmap_assert_locked(mm); if (flags & FOLL_PIN) - mm_set_has_pinned_flag(&mm->flags); + mm_set_has_pinned_flag(mm); /* * FOLL_PIN and FOLL_GET are mutually exclusive. 
Traditional behavior @@ -3210,7 +3210,7 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, return -EINVAL; if (gup_flags & FOLL_PIN) - mm_set_has_pinned_flag(&current->mm->flags); + mm_set_has_pinned_flag(current->mm); if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(&current->mm->mmap_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b8bb078a1a34..a2f476e7419a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -251,13 +251,13 @@ struct folio *mm_get_huge_zero_folio(struct mm_struct *mm) if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return huge_zero_folio; - if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) return READ_ONCE(huge_zero_folio); if (!get_huge_zero_folio()) return NULL; - if (test_and_set_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test_and_set(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); return READ_ONCE(huge_zero_folio); @@ -268,7 +268,7 @@ void mm_put_huge_zero_folio(struct mm_struct *mm) if (IS_ENABLED(CONFIG_PERSISTENT_HUGE_ZERO_FOLIO)) return; - if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags)) + if (mm_flags_test(MMF_HUGE_ZERO_FOLIO, mm)) put_huge_zero_folio(); } @@ -1145,7 +1145,7 @@ static unsigned long __thp_get_unmapped_area(struct file *filp, off_sub = (off - ret) & (size - 1); - if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub) + if (mm_flags_test(MMF_TOPDOWN, current->mm) && !off_sub) return ret + size; ret += off_sub; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b40bdfd224c..550eb00116c5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - test_bit(MMF_DISABLE_THP, &mm->flags); + mm_flags_test(MMF_DISABLE_THP, mm); } static bool hugepage_pmd_enabled(void) @@ -445,7 +445,7 @@ void __khugepaged_enter(struct mm_struct *mm) /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); - if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) + if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; mm_slot = mm_slot_alloc(mm_slot_cache); @@ -472,7 +472,7 @@ void __khugepaged_enter(struct mm_struct *mm) void khugepaged_enter_vma(struct vm_area_struct *vma, vm_flags_t vm_flags) { - if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && + if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, PMD_ORDER)) @@ -497,7 +497,7 @@ void __khugepaged_exit(struct mm_struct *mm) spin_unlock(&khugepaged_mm_lock); if (free) { - clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + mm_flags_clear(MMF_VM_HUGEPAGE, mm); mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { @@ -1459,7 +1459,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Not strictly needed because the mm exited already.
* - * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + * mm_flags_clear(MMF_VM_HUGEPAGE, mm); */ /* khugepaged_mm_lock actually not necessary for the below */ diff --git a/mm/ksm.c b/mm/ksm.c index 160787bb121c..2ef29802a49b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1217,8 +1217,8 @@ mm_exiting: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmdrop(mm); } else spin_unlock(&ksm_mmlist_lock); @@ -2620,8 +2620,8 @@ no_vmas: spin_unlock(&ksm_mmlist_lock); mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGEABLE, mm); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); mmap_read_unlock(mm); mmdrop(mm); } else { @@ -2742,7 +2742,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma) vm_flags_t ksm_vma_flags(const struct mm_struct *mm, const struct file *file, vm_flags_t vm_flags) { - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags) && + if (mm_flags_test(MMF_VM_MERGE_ANY, mm) && __ksm_should_add_vma(file, vm_flags)) vm_flags |= VM_MERGEABLE; @@ -2784,16 +2784,16 @@ int ksm_enable_merge_any(struct mm_struct *mm) { int err; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; } - set_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_set(MMF_VM_MERGE_ANY, mm); ksm_add_vmas(mm); return 0; @@ -2815,7 +2815,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) { int err; - if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGE_ANY, mm)) return 0; err = ksm_del_vmas(mm); @@ -2824,7 +2824,7 @@ int ksm_disable_merge_any(struct mm_struct *mm) return err; } - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); return 0; } @@ -2832,9 +2832,9 @@ int ksm_disable(struct mm_struct *mm) { mmap_assert_write_locked(mm); - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) return 0; - if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) + if (mm_flags_test(MMF_VM_MERGE_ANY, mm)) return ksm_disable_merge_any(mm); return ksm_del_vmas(mm); } @@ -2852,7 +2852,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, if (!vma_ksm_compatible(vma)) return 0; - if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + if (!mm_flags_test(MMF_VM_MERGEABLE, mm)) { err = __ksm_enter(mm); if (err) return err; @@ -2912,7 +2912,7 @@ int __ksm_enter(struct mm_struct *mm) list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - set_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_set(MMF_VM_MERGEABLE, mm); mmgrab(mm); if (needs_wakeup) @@ -2954,8 +2954,8 @@ void __ksm_exit(struct mm_struct *mm) if (easy_to_free) { mm_slot_free(mm_slot_cache, mm_slot); - clear_bit(MMF_VM_MERGE_ANY, &mm->flags); - clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mm_flags_clear(MMF_VM_MERGE_ANY, mm); + mm_flags_clear(MMF_VM_MERGEABLE, mm); mmdrop(mm); } else if (mm_slot) { mmap_write_lock(mm); diff --git a/mm/mmap.c b/mm/mmap.c index 7306253cc3b5..7a057e0e8da9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -802,7 +802,7 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { - if (test_bit(MMF_TOPDOWN, 
&mm->flags)) + if (mm_flags_test(MMF_TOPDOWN, mm)) return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags, vm_flags); return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); @@ -1284,7 +1284,7 @@ void exit_mmap(struct mm_struct *mm) * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper * because the memory has been already freed. */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); vma_iter_set(&vmi, vma->vm_end); @@ -1859,14 +1859,14 @@ loop_out: mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); } out: mmap_write_unlock(mm); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 25923cfec9c6..17650f0b516e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/oom_kill.c - * + * * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... @@ -218,7 +218,7 @@ long oom_badness(struct task_struct *p, unsigned long totalpages) */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_SKIP, &p->mm->flags) || + mm_flags_test(MMF_OOM_SKIP, p->mm) || in_vfork(p)) { task_unlock(p); return LONG_MIN; @@ -325,7 +325,7 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) * any memory is quite low. */ if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { - if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, task->signal->oom_mm)) goto next; goto abort; } @@ -524,7 +524,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) * should imply barriers already and the reader would hit a page fault * if it stumbled over a reaped memory. */ - set_bit(MMF_UNSTABLE, &mm->flags); + mm_flags_set(MMF_UNSTABLE, mm); for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) @@ -583,7 +583,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * under mmap_lock for reading because it serializes against the * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap(). */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { trace_skip_task_reaping(tsk->pid); goto out_unlock; } @@ -619,7 +619,7 @@ static void oom_reap_task(struct task_struct *tsk) schedule_timeout_idle(HZ/10); if (attempts <= MAX_OOM_REAP_RETRIES || - test_bit(MMF_OOM_SKIP, &mm->flags)) + mm_flags_test(MMF_OOM_SKIP, mm)) goto done; pr_info("oom_reaper: unable to reap pid:%d (%s)\n", @@ -634,7 +634,7 @@ done: * Hide this mm from OOM killer because it has been either reaped or * somebody can't call mmap_write_unlock(mm). 
*/ - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); /* Drop a reference taken by queue_oom_reaper */ put_task_struct(tsk); @@ -670,7 +670,7 @@ static void wake_oom_reaper(struct timer_list *timer) unsigned long flags; /* The victim managed to terminate on its own - see exit_mmap */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) { + if (mm_flags_test(MMF_OOM_SKIP, mm)) { put_task_struct(tsk); return; } @@ -695,7 +695,7 @@ static void wake_oom_reaper(struct timer_list *timer) static void queue_oom_reaper(struct task_struct *tsk) { /* mm is already queued? */ - if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) + if (mm_flags_test_and_set(MMF_OOM_REAP_QUEUED, tsk->signal->oom_mm)) return; get_task_struct(tsk); @@ -892,7 +892,7 @@ static bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_SKIP, &mm->flags)) + if (mm_flags_test(MMF_OOM_SKIP, mm)) return false; if (atomic_read(&mm->mm_users) <= 1) @@ -977,7 +977,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; if (is_global_init(p)) { can_oom_reap = false; - set_bit(MMF_OOM_SKIP, &mm->flags); + mm_flags_set(MMF_OOM_SKIP, mm); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); @@ -1235,7 +1235,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) reap = true; else { /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + if (!mm_flags_test(MMF_OOM_SKIP, mm)) ret = -EINVAL; } task_unlock(p); @@ -1251,7 +1251,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure * possible change in exit_mmap is seen */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) + if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); diff --git a/mm/util.c b/mm/util.c index f814e6a59ab1..d235b74f7aff 100644 --- a/mm/util.c +++ b/mm/util.c @@ -471,17 +471,17 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } #endif #ifdef CONFIG_MMU From 879d0d99541f6877c4e0f532c589c39869cf7077 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:12 +0100 Subject: [PATCH 059/372] mm: convert prctl to mm_flags_*() accessors As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. No functional change intended. Link: https://lkml.kernel.org/r/b64f07b94822d02beb88d0d21a6a85f9ee45fc69.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R.
Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/sys.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 1e28b40053ce..605f7fe9a143 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2392,9 +2392,9 @@ static inline unsigned long get_current_mdwe(void) { unsigned long ret = 0; - if (test_bit(MMF_HAS_MDWE, &current->mm->flags)) + if (mm_flags_test(MMF_HAS_MDWE, current->mm)) ret |= PR_MDWE_REFUSE_EXEC_GAIN; - if (test_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags)) + if (mm_flags_test(MMF_HAS_MDWE_NO_INHERIT, current->mm)) ret |= PR_MDWE_NO_INHERIT; return ret; @@ -2427,9 +2427,9 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, return -EPERM; /* Cannot unset the flags */ if (bits & PR_MDWE_NO_INHERIT) - set_bit(MMF_HAS_MDWE_NO_INHERIT, &current->mm->flags); + mm_flags_set(MMF_HAS_MDWE_NO_INHERIT, current->mm); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) - set_bit(MMF_HAS_MDWE, &current->mm->flags); + mm_flags_set(MMF_HAS_MDWE, current->mm); return 0; } @@ -2627,7 +2627,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_THP_DISABLE: if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags); + error = !!mm_flags_test(MMF_DISABLE_THP, me->mm); break; case PR_SET_THP_DISABLE: if (arg3 || arg4 || arg5) @@ -2635,9 +2635,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (mmap_write_lock_killable(me->mm)) return -EINTR; if (arg2) - set_bit(MMF_DISABLE_THP, &me->mm->flags); + mm_flags_set(MMF_DISABLE_THP, me->mm); else - clear_bit(MMF_DISABLE_THP, &me->mm->flags); + mm_flags_clear(MMF_DISABLE_THP, me->mm); mmap_write_unlock(me->mm); break; case PR_MPX_ENABLE_MANAGEMENT: @@ -2770,7 +2770,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags); + error = !!mm_flags_test(MMF_VM_MERGE_ANY, me->mm); break; #endif case PR_RISCV_V_SET_CONTROL: From 4141c2dc88f21881e34e0a3678fa48f0303adc5e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:13 +0100 Subject: [PATCH 060/372] mm: convert arch-specific code to mm_flags_*() accessors As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of
accessing mm_struct flags. No functional change intended. [lorenzo.stoakes@oracle.com: fix typo] Link: https://lkml.kernel.org/r/f8ff8fe9-0c89-4742-bf52-d31319d948c1@lucifer.local Link: https://lkml.kernel.org/r/6e0a4563fcade8678d0fc99859b3998d4354e82f.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/s390/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sparc_64.c | 4 ++-- arch/x86/mm/mmap.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 40a526d28184..547104ccc22a 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -182,10 +182,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) */ if (mmap_is_legacy(rlim_stack)) { mm->mmap_base = mmap_base_legacy(random_factor); - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { mm->mmap_base = mmap_base(random_factor, rlim_stack); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index c5a284df7b41..785e9909340f 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -309,7 +309,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) gap == RLIM_INFINITY || sysctl_legacy_va_layout) { mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); } else { /* We know it's 32-bit */ unsigned long task_size = STACK_TOP32; @@ -320,7 +320,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) gap = (task_size / 6 * 5); mm->mmap_base = PAGE_ALIGN(task_size - gap - random_factor); - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); } } diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 5ed2109211da..708f85dc9380 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -122,9 +122,9 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { if (mmap_is_legacy()) - clear_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_clear(MMF_TOPDOWN, mm); else - set_bit(MMF_TOPDOWN, &mm->flags); + mm_flags_set(MMF_TOPDOWN, mm); arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, arch_rnd(mmap64_rnd_bits), task_size_64bit(0), From 
c0951573e0d6e39083ef7f39a2f0983ece8fb1a0 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:14 +0100 Subject: [PATCH 061/372] mm: convert uprobes to mm_flags_*() accessors As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. No functional change intended. Link: https://lkml.kernel.org/r/1d4fe5963904cc0c707da1f53fbfe6471d3eff10.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7ca1940607bd..31a12b60055f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1153,15 +1153,15 @@ static int install_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page(). */ - first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); + first_uprobe = !mm_flags_test(MMF_HAS_UPROBES, mm); if (first_uprobe) - set_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_set(MMF_HAS_UPROBES, mm); ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret) - clear_bit(MMF_RECALC_UPROBES, &mm->flags); + mm_flags_clear(MMF_RECALC_UPROBES, mm); else if (first_uprobe) - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); return ret; } @@ -1171,7 +1171,7 @@ static int remove_breakpoint(struct uprobe *uprobe, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; - set_bit(MMF_RECALC_UPROBES, &mm->flags); + mm_flags_set(MMF_RECALC_UPROBES, mm); return set_orig_insn(&uprobe->arch, vma, vaddr); } @@ -1303,7 +1303,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) /* consult only the "caller", new consumer. 
*/ if (consumer_filter(new, mm)) err = install_breakpoint(uprobe, vma, info->vaddr); - } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) { + } else if (mm_flags_test(MMF_HAS_UPROBES, mm)) { if (!filter_chain(uprobe, mm)) err |= remove_breakpoint(uprobe, vma, info->vaddr); } @@ -1595,7 +1595,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (vma->vm_file && (vma->vm_flags & (VM_WRITE|VM_SHARED)) == VM_WRITE && - test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags)) + mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm)) delayed_ref_ctr_inc(vma); if (!valid_vma(vma, true)) @@ -1655,12 +1655,12 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return; - if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || - test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) + if (!mm_flags_test(MMF_HAS_UPROBES, vma->vm_mm) || + mm_flags_test(MMF_RECALC_UPROBES, vma->vm_mm)) return; if (vma_has_uprobes(vma, start, end)) - set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); + mm_flags_set(MMF_RECALC_UPROBES, vma->vm_mm); } static vm_fault_t xol_fault(const struct vm_special_mapping *sm, @@ -1823,10 +1823,10 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { - set_bit(MMF_HAS_UPROBES, &newmm->flags); + if (mm_flags_test(MMF_HAS_UPROBES, oldmm)) { + mm_flags_set(MMF_HAS_UPROBES, newmm); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ - set_bit(MMF_RECALC_UPROBES, &newmm->flags); + mm_flags_set(MMF_RECALC_UPROBES, newmm); } } @@ -2370,7 +2370,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) return; } - clear_bit(MMF_HAS_UPROBES, &mm->flags); + mm_flags_clear(MMF_HAS_UPROBES, mm); } static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) @@ -2468,7 +2468,7 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb *is_swbp = -EFAULT; } - if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) + if (!uprobe && mm_flags_test_and_clear(MMF_RECALC_UPROBES, mm)) mmf_recalc_uprobes(mm); mmap_read_unlock(mm); @@ -2818,7 +2818,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) if (!current->mm) return 0; - if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) && + if (!mm_flags_test(MMF_HAS_UPROBES, current->mm) && (!current->utask || !current->utask->return_instances)) return 0; From 39f8049cd49f7e88f89a33f97f996c7306e8be0b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:15 +0100 Subject: [PATCH 062/372] mm: update coredump logic to correctly use bitmap mm flags The coredump logic is slightly different from other users in that it both stores mm flags and additionally sets and gets using masks. Since the MMF_DUMPABLE_* flags must remain as they are for uABI reasons, and of course these are within the first 32-bits of the flags, it is reasonable to provide access to these in the same fashion so this logic can all still keep working as it has been. Therefore, introduce coredump-specific helpers __mm_flags_get_dumpable() and __mm_flags_set_mask_dumpable() for this purpose, and update all core dump users of mm flags to use these.
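For illustration, a minimal non-atomic sketch of what such a masked update of the low flags word amounts to; the kernel itself uses set_mask_bits() for the real thing, and the helper name below is hypothetical:

    #define MMF_DUMPABLE_BITS	2
    #define MMF_DUMPABLE_MASK	((1UL << MMF_DUMPABLE_BITS) - 1)

    /* Clear the dumpable bits of the low flags word, then apply the new value. */
    static unsigned long set_dumpable_word(unsigned long word, int value)
    {
    	return (word & ~MMF_DUMPABLE_MASK) |
    	       ((unsigned long)value & MMF_DUMPABLE_MASK);
    }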
[lorenzo.stoakes@oracle.com: abstract set_mask_bits() invocation to mm_types.h to satisfy ARC] Link: https://lkml.kernel.org/r/0e7ad263-1ff7-446d-81fe-97cff9c0e7ed@lucifer.local Link: https://lkml.kernel.org/r/2a5075f7e3c5b367d988178c79a3063d12ee53a9.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Christian Brauner Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/coredump.c | 4 +++- fs/exec.c | 2 +- fs/pidfs.c | 7 +++++-- fs/proc/base.c | 8 +++++--- include/linux/mm_types.h | 12 ++++++++++++ include/linux/sched/coredump.h | 18 +++++++++++++++++- 6 files changed, 43 insertions(+), 8 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 5dce257c67fc..f9d82ffc4b88 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1103,8 +1103,10 @@ void vfs_coredump(const kernel_siginfo_t *siginfo) * We must use the same mm->flags while dumping core to avoid * inconsistency of bit flags, since this flag is not protected * by any locks. + * + * Note that we only care about MMF_DUMP* flags. 
*/ - .mm_flags = mm->flags, + .mm_flags = __mm_flags_get_dumpable(mm), .vma_meta = NULL, .cpu = raw_smp_processor_id(), }; diff --git a/fs/exec.c b/fs/exec.c index 2a1e5e4042a1..dbac0e84cc3e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1999,7 +1999,7 @@ void set_dumpable(struct mm_struct *mm, int value) if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; - set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); + __mm_flags_set_mask_dumpable(mm, value); } SYSCALL_DEFINE3(execve, diff --git a/fs/pidfs.c b/fs/pidfs.c index 108e7527f837..9913c5268fef 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -357,8 +357,11 @@ static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) { task_lock(task); - if (task->mm) - kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags); + if (task->mm) { + unsigned long flags = __mm_flags_get_dumpable(task->mm); + + kinfo.coredump_mask = pidfs_coredump_mask(flags); + } task_unlock(task); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 62d35631ba8c..f0c093c58aaf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2962,8 +2962,10 @@ static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf, ret = 0; mm = get_task_mm(task); if (mm) { + unsigned long flags = __mm_flags_get_dumpable(mm); + len = snprintf(buffer, sizeof(buffer), "%08lx\n", - ((mm->flags & MMF_DUMP_FILTER_MASK) >> + ((flags & MMF_DUMP_FILTER_MASK) >> MMF_DUMP_FILTER_SHIFT)); mmput(mm); ret = simple_read_from_buffer(buf, count, ppos, buffer, len); @@ -3002,9 +3004,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) { if (val & mask) - set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_set(i + MMF_DUMP_FILTER_SHIFT, mm); else - clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags); + mm_flags_clear(i + MMF_DUMP_FILTER_SHIFT, mm); } mmput(mm); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e001dbad455..9d224075d895 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1255,6 +1255,18 @@ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) return bitmap_read(bitmap, 0, BITS_PER_LONG); } +/* + * Update the first system word of mm flags ONLY, applying the specified mask to + * it, then setting all flags specified by bits. + */ +static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, + unsigned long mask, unsigned long bits) +{ + unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + + set_mask_bits(bitmap, mask, bits); +} + #define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \ MT_FLAGS_USE_RCU) extern struct mm_struct init_mm; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 6eb65ceed213..b7fafe999073 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,6 +8,20 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ +static inline unsigned long __mm_flags_get_dumpable(struct mm_struct *mm) +{ + /* + * By convention, dumpable bits are contained in first 32 bits of the + * bitmap, so we can simply access this first unsigned long directly. 
+ */ + return __mm_flags_get_word(mm); +} + +static inline void __mm_flags_set_mask_dumpable(struct mm_struct *mm, int value) +{ + __mm_flags_set_mask_bits_word(mm, MMF_DUMPABLE_MASK, value); +} + extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things @@ -22,7 +36,9 @@ static inline int __get_dumpable(unsigned long mm_flags) static inline int get_dumpable(struct mm_struct *mm) { - return __get_dumpable(mm->flags); + unsigned long flags = __mm_flags_get_dumpable(mm); + + return __get_dumpable(flags); } #endif /* _LINUX_SCHED_COREDUMP_H */ From 01f86753a05a3b971d69147d6d074c1a8d29b57d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:16 +0100 Subject: [PATCH 063/372] mm: correct sign-extension issue in MMF_* flag masks There is an issue with the mask declarations in linux/mm_types.h, which naively do (1 << bit) operations. Unfortunately this results in the 1 being defaulted as a signed (32-bit) integer. When the compiler expands the MMF_INIT_MASK bitmask it comes up with: (((1 << 2) - 1) | (((1 << 9) - 1) << 2) | (1 << 24) | (1 << 28) | (1 << 30) | (1 << 31)) Which overflows the signed integer to -788,527,105. Implicitly casting this to an unsigned long results in sign-extension, and thus this value becomes 0xffffffffd10007ff, rather than the intended 0xd10007ff. While we're limited to a maximum of 32 bits in mm->flags, this isn't an issue, as the remaining bits being masked will always be zero. However, now that we are moving towards having more bits in this flags field, this becomes an issue. Simply resolve this by using the _BITUL() helper to cast the shifted value to an unsigned long. [lorenzo.stoakes@oracle.com: prefer BIT() to _BITUL()] Link: https://lkml.kernel.org/r/a0290c77-cd88-46d6-8d9a-073be7600d88@lucifer.local Link: https://lkml.kernel.org/r/f92194bee8c92a04fd4c9b2c14c7e65229639300.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Hildenbrand Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H.
Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9d224075d895..de09ae2a0de6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1767,7 +1767,7 @@ enum { * the modes are SUID_DUMP_* defined in linux/sched/coredump.h */ #define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +#define MMF_DUMPABLE_MASK (BIT(MMF_DUMPABLE_BITS) - 1) /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -1782,13 +1782,13 @@ enum { #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 9 #define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) + ((BIT(MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + (BIT(MMF_DUMP_ANON_PRIVATE) | BIT(MMF_DUMP_ANON_SHARED) | \ + BIT(MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +# define MMF_DUMP_MASK_DEFAULT_ELF BIT(MMF_DUMP_ELF_HEADERS) #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif @@ -1808,7 +1808,7 @@ enum { #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) +#define MMF_DISABLE_THP_MASK BIT(MMF_DISABLE_THP) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* @@ -1821,16 +1821,15 @@ enum { #define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_HAS_MDWE 28 -#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) - +#define MMF_HAS_MDWE_MASK BIT(MMF_HAS_MDWE) #define MMF_HAS_MDWE_NO_INHERIT 29 #define MMF_VM_MERGE_ANY 30 -#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) +#define MMF_VM_MERGE_ANY_MASK BIT(MMF_VM_MERGE_ANY) #define MMF_TOPDOWN 31 /* mm searches top down by default */ -#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) +#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ From 19148a19da86f1b7d1a1b067c9f656b0f3a60fb1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:17 +0100 Subject: [PATCH 064/372] mm: update fork mm->flags initialisation to use bitmap We now need to account for flag initialisation on fork. We retain the existing logic as much as we can, but dub the existing flag mask legacy. These flags are therefore required to fit in the first 32-bits of the flags field. 
However, further flag propagation upon fork can be implemented in mm_init() on a per-flag basis. We ensure we clear the entire bitmap prior to setting it, and use __mm_flags_get_word() and __mm_flags_set_word() to manipulate these legacy fields efficiently. Link: https://lkml.kernel.org/r/9fb8954a7a0f0184f012a8e66f8565bcbab014ba.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 13 ++++++++++--- kernel/fork.c | 7 +++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index de09ae2a0de6..69ce407b4343 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1831,16 +1831,23 @@ enum { #define MMF_TOPDOWN 31 /* mm searches top down by default */ #define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ +#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) -static inline unsigned long mmf_init_flags(unsigned long flags) +/* Legacy flags must fit within 32 bits. */ +static_assert((u64)MMF_INIT_LEGACY_MASK <= (u64)UINT_MAX); + +/* + * Initialise legacy flags according to masks, propagating selected flags on + * fork. Further flag manipulation can be performed by the caller. 
+ */ +static inline unsigned long mmf_init_legacy_flags(unsigned long flags) { if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) flags &= ~((1UL << MMF_HAS_MDWE) | (1UL << MMF_HAS_MDWE_NO_INHERIT)); - return flags & MMF_INIT_MASK; + return flags & MMF_INIT_LEGACY_MASK; } #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index af673856499d..b04ecba4a709 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1057,11 +1057,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_uprobes_state(mm); hugetlb_count_init(mm); + mm_flags_clear_all(mm); if (current->mm) { - mm->flags = mmf_init_flags(current->mm->flags); + unsigned long flags = __mm_flags_get_word(current->mm); + + __mm_flags_set_word(mm, mmf_init_legacy_flags(flags)); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { - mm->flags = default_dump_filter; + __mm_flags_set_word(mm, default_dump_filter); mm->def_flags = 0; } From d14d3f535e13ff0661b9a74133a8d6b9f9950712 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:18 +0100 Subject: [PATCH 065/372] mm: convert remaining users to mm_flags_*() accessors As part of the effort to move to mm->flags becoming a bitmap field, convert existing users to making use of the mm_flags_*() accessors which will, when the conversion is complete, be the only means of accessing mm_struct flags. No functional change intended. Link: https://lkml.kernel.org/r/cc67a56f9a8746a8ec7d9791853dc892c1c33e0b.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- fs/proc/array.c | 2 +- fs/proc/base.c | 4 ++-- fs/proc/task_mmu.c | 2 +- kernel/fork.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index d6a0369caa93..c286dc12325e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -422,7 +422,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) - thp_enabled = !test_bit(MMF_DISABLE_THP, &mm->flags); + thp_enabled = !mm_flags_test(MMF_DISABLE_THP, mm); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } diff --git a/fs/proc/base.c b/fs/proc/base.c index f0c093c58aaf..b997ceef9135 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1163,7 +1163,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task); if (p) { - if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { + if (mm_flags_test(MMF_MULTIPROCESS, p->mm)) { mm = p->mm; mmgrab(mm); } @@ -3276,7 +3276,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); seq_printf(m, "ksm_merge_any: %s\n", - test_bit(MMF_VM_MERGE_ANY, &mm->flags) ? "yes" : "no"); + mm_flags_test(MMF_VM_MERGE_ANY, mm) ? "yes" : "no"); ret = mmap_read_lock_killable(mm); if (ret) { mmput(mm); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e64cf40ce9c4..e8e7bef34531 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1592,7 +1592,7 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return false; if (!is_cow_mapping(vma->vm_flags)) return false; - if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) + if (likely(!mm_flags_test(MMF_HAS_PINNED, vma->vm_mm))) return false; folio = vm_normal_folio(vma, addr, pte); if (!folio) diff --git a/kernel/fork.c b/kernel/fork.c index b04ecba4a709..5115be549234 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1887,7 +1887,7 @@ static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) /* We need to synchronize with __set_oom_adj */ mutex_lock(&oom_adj_mutex); - set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + mm_flags_set(MMF_MULTIPROCESS, tsk->mm); /* Update the values in case they were changed after copy_signal */ tsk->signal->oom_score_adj = current->signal->oom_score_adj; tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; From 8166353fb8841390bf3ffe1b923a5ddeb348e4f7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 12 Aug 2025 16:44:19 +0100 Subject: [PATCH 066/372] mm: replace mm->flags with bitmap entirely and set to 64 bits Now we have updated all users of mm->flags to use the bitmap accessors, repalce it with the bitmap version entirely. 
We are then able to move to having 64 bits of mm->flags on both 32-bit and 64-bit architectures. We also update the VMA userland tests to ensure that everything remains functional there. No functional changes intended, other than there now being 64 bits of available mm_struct flags. Link: https://lkml.kernel.org/r/e1f6654e016d36c43959764b01355736c5cbcdf8.1755012943.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Reviewed-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Gordeev Cc: Alexander Shishkin Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Baolin Wang Cc: Barry Song Cc: Ben Segall Cc: Borislav Betkov Cc: Chengming Zhou Cc: Christian Borntraeger Cc: Christian Brauner Cc: David Rientjes Cc: David S. Miller Cc: Dev Jain Cc: Dietmar Eggemann Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ian Rogers Cc: Ingo Molnar Cc: Jan Kara Cc: Jann Horn Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Juri Lelli Cc: Kan Liang Cc: Kees Cook Cc: Marc Rutland Cc: Mariano Pache Cc: "Masami Hiramatsu (Google)" Cc: Mateusz Guzik Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Namhyung kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Shakeel Butt Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Valentin Schneider Cc: Vasily Gorbik Cc: Vincent Guittot Cc: Vlastimil Babka Cc: xu xin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++------ include/linux/mm_types.h | 14 +++++--------- tools/testing/vma/vma_internal.h | 19 +++++++++++++++++-- 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 34311ebe62cc..b61e2d4858cf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -724,32 +724,32 @@ static inline void assert_fault_locked(struct vm_fault *vmf) static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { - return test_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_set(int flag, struct mm_struct *mm) { - return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_and_set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline bool mm_flags_test_and_clear(int flag, struct mm_struct *mm) { - return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + return test_and_clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_set(int flag, struct mm_struct *mm) { - set_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + set_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear(int flag, struct mm_struct *mm) { - clear_bit(flag, ACCESS_PRIVATE(&mm->_flags, __mm_flags)); + clear_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } static inline void mm_flags_clear_all(struct mm_struct *mm) { - bitmap_zero(ACCESS_PRIVATE(&mm->_flags, __mm_flags), NUM_MM_FLAG_BITS); + bitmap_zero(ACCESS_PRIVATE(&mm->flags, __mm_flags), NUM_MM_FLAG_BITS); } extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 69ce407b4343..05475b5fd516 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -932,7 +932,7 @@ struct mm_cid { * Opaque type representing current mm_struct flag state. 
Must be accessed via * mm_flags_xxx() helper functions. */ -#define NUM_MM_FLAG_BITS BITS_PER_LONG +#define NUM_MM_FLAG_BITS (64) typedef struct { DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } __private mm_flags_t; @@ -1119,11 +1119,7 @@ struct mm_struct { /* Architecture-specific MM context */ mm_context_t context; - /* Temporary union while we convert users to mm_flags_t. */ - union { - unsigned long flags; /* Must use atomic bitops to access */ - mm_flags_t _flags; /* Must use mm_flags_* helpers to access */ - }; + mm_flags_t flags; /* Must use mm_flags_* helpers to access */ #ifdef CONFIG_AIO spinlock_t ioctx_lock; @@ -1236,7 +1232,7 @@ struct mm_struct { /* Set the first system word of mm flags, non-atomically. */ static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); bitmap_copy(bitmap, &value, BITS_PER_LONG); } @@ -1244,7 +1240,7 @@ static inline void __mm_flags_set_word(struct mm_struct *mm, unsigned long value /* Obtain a read-only view of the bitmap. */ static inline const unsigned long *__mm_flags_get_bitmap(const struct mm_struct *mm) { - return (const unsigned long *)ACCESS_PRIVATE(&mm->_flags, __mm_flags); + return (const unsigned long *)ACCESS_PRIVATE(&mm->flags, __mm_flags); } /* Read the first system word of mm flags, non-atomically. */ @@ -1262,7 +1258,7 @@ static inline unsigned long __mm_flags_get_word(const struct mm_struct *mm) static inline void __mm_flags_set_mask_bits_word(struct mm_struct *mm, unsigned long mask, unsigned long bits) { - unsigned long *bitmap = ACCESS_PRIVATE(&mm->_flags, __mm_flags); + unsigned long *bitmap = ACCESS_PRIVATE(&mm->flags, __mm_flags); set_mask_bits(bitmap, mask, bits); } diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index cb1c2a8afe26..f13354bf0a1e 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -249,6 +249,14 @@ struct mutex {}; #define DEFINE_MUTEX(mutexname) \ struct mutex mutexname = {} +#define DECLARE_BITMAP(name, bits) \ + unsigned long name[BITS_TO_LONGS(bits)] + +#define NUM_MM_FLAG_BITS (64) +typedef struct { + __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); +} mm_flags_t; + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -260,7 +268,7 @@ struct mm_struct { unsigned long def_flags; - unsigned long flags; /* Must use atomic bitops to access */ + mm_flags_t flags; /* Must use mm_flags_* helpers to access */ }; struct vm_area_struct; @@ -1333,6 +1341,13 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } +# define ACCESS_PRIVATE(p, member) ((p)->member) + +static inline bool mm_flags_test(int flag, const struct mm_struct *mm) +{ + return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); +} + /* * Denies creating a writable executable mapping or gaining executable permissions. * @@ -1363,7 +1378,7 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, static inline bool map_deny_write_exec(unsigned long old, unsigned long new) { /* If MDWE is disabled, we have nothing to deny. */ - if (!test_bit(MMF_HAS_MDWE, &current->mm->flags)) + if (!mm_flags_test(MMF_HAS_MDWE, current->mm)) return false; /* If the new VMA is not executable, we have nothing to deny.
*/ From adf085ff0d6fde54015bfca1ce6e4ce392828ba9 Mon Sep 17 00:00:00 2001 From: Qianfeng Rong Date: Tue, 12 Aug 2025 21:52:25 +0800 Subject: [PATCH 067/372] mm: remove redundant __GFP_NOWARN Commit 16f5dfbc851b ("gfp: include __GFP_NOWARN in GFP_NOWAIT") made GFP_NOWAIT implicitly include __GFP_NOWARN. Therefore, explicit __GFP_NOWARN combined with GFP_NOWAIT (e.g., `GFP_NOWAIT | __GFP_NOWARN`) is now redundant. Let's clean up these redundant flags across subsystems. No functional changes. Link: https://lkml.kernel.org/r/20250812135225.274316-1-rongqianfeng@vivo.com Signed-off-by: Qianfeng Rong Reviewed-by: Harry Yoo Reviewed-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Reviewed-by: Vishal Moola (Oracle) Reviewed-by: SeongJae Park Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 2 +- mm/filemap.c | 2 +- mm/mmu_gather.c | 4 ++-- mm/rmap.c | 2 +- mm/vmalloc.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 2e3409a6c8a4..998c5180a603 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -303,7 +303,7 @@ static unsigned int __damon_migrate_folio_list( * instead of migrated. */ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | - __GFP_NOWARN | __GFP_NOMEMALLOC | GFP_NOWAIT, + __GFP_NOMEMALLOC | GFP_NOWAIT, .nid = target_nid, }; diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..d1fb0b12bff2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1961,7 +1961,7 @@ no_page: gfp &= ~__GFP_FS; if (fgp_flags & FGP_NOWAIT) { gfp &= ~GFP_KERNEL; - gfp |= GFP_NOWAIT | __GFP_NOWARN; + gfp |= GFP_NOWAIT; } if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) fgp_flags |= FGP_LOCK; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index b49cc6385f1f..374aa6f021c6 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -32,7 +32,7 @@ static bool tlb_next_batch(struct mmu_gather *tlb) if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; - batch = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + batch = (void *)__get_free_page(GFP_NOWAIT); if (!batch) return false; @@ -364,7 +364,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) struct mmu_table_batch **batch = &tlb->batch; if (*batch == NULL) { - *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); + *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT); if (*batch == NULL) { tlb_table_invalidate(tlb); tlb_remove_table_one(table); diff --git a/mm/rmap.c b/mm/rmap.c index 0e9c4041f868..1c5988dbd1e7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -285,7 +285,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; - avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); + avc = anon_vma_chain_alloc(GFP_NOWAIT); if (unlikely(!avc)) { unlock_anon_vma_root(root); root = NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e299b51bd922..710fe512368f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -5196,7 +5196,7 @@ static void vmap_init_nodes(void) int n = clamp_t(unsigned int, num_possible_cpus(), 1, 128); if (n > 1) { - vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT | __GFP_NOWARN); + vn = kmalloc_array(n, sizeof(*vn), GFP_NOWAIT); if (vn) { /* Node partition is 16 pages. 
*/ vmap_zone_size = (1 << 4) * PAGE_SIZE; From 53c225ffa72a63bbca8537a3357c8baec122b9a6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 13 Aug 2025 09:13:33 +0100 Subject: [PATCH 068/372] selftests/mm: fix spelling mistake "mrmeap" -> "mremap" There are spelling mistakes in perror messages. Fix these. Link: https://lkml.kernel.org/r/20250813081333.1978096-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 5bd52a951cbd..bf2863b102e3 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -846,7 +846,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp, } if (err != EFAULT) { errno = err; - perror("mrmeap() unexpected error"); + perror("mremap() unexpected error"); success = false; goto out_unmap; } @@ -899,7 +899,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp, } if (err != EFAULT) { errno = err; - perror("mrmeap() unexpected error"); + perror("mremap() unexpected error"); success = false; goto out_unmap; } @@ -948,7 +948,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp, } if (err != EFAULT) { errno = err; - perror("mrmeap() unexpected error"); + perror("mremap() unexpected error"); success = false; goto out_unmap; } From dca4437a5861abf197f1ee934d4db251328d0ed6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 19 Aug 2025 12:34:04 -0700 Subject: [PATCH 069/372] mm/zswap: store incompressible pages as-is When a page fails to compress to a size smaller than PAGE_SIZE, store it in zswap as-is; such entries are identified by 'length == PAGE_SIZE'. Because the uncompressed data is saved in zpool, same as the compressed ones, this introduces no change in terms of memory management including movability and migratability of involved pages. This change also does not increase the per-entry zswap metadata overhead. But as the number of incompressible pages increases, total zswap metadata overhead is proportionally increased. The overhead should not be problematic in usual cases, since the zswap metadata for a single zswap entry is much smaller than PAGE_SIZE, and in common zswap use cases there should be a sufficient amount of compressible pages. Also it can be mitigated by the zswap writeback. When the writeback is disabled, the additional overhead could be problematic. For that case, keep the current behavior of just returning failure and letting swap_writeout() put the page back to the active LRU list. Knowing how many incompressible pages are stored at a given moment will be useful for future investigations. Add a new debugfs file called stored_incompressible_pages for the purpose.

Tests
-----
I tested this patch using a simple self-written microbenchmark that is available at GitHub[1]. You can reproduce the test I did by executing run_tests.sh of the repo on your system. Note that the repo's documentation is not good as of this writing, so you may need to read and use the code. The basic test scenario is simple. Run a test program that makes artificial accesses to memory with artificial content under a memory.high-imposed memory limit, and measure how many accesses were made in a given time. The test program repeatedly and randomly accesses three anonymous memory regions. The regions are all 500 MiB in size and are accessed with the same probability.
Two of those are filled with simple content that can easily be compressed, while the remaining one is filled with content read from /dev/urandom, which usually fails to compress to a size smaller than PAGE_SIZE. The program runs for two minutes and prints out the number of accesses made every five seconds. The test script runs the program under the four configurations below.

- 0: memory.high is set to 2 GiB, zswap is disabled.
- 1-1: memory.high is set to 1350 MiB, zswap is disabled.
- 1-2: On 1-1, zswap is enabled without this patch.
- 1-3: On 1-2, this patch is applied.

For all zswap enabled cases, zswap shrinker is enabled. Configuration '0' is for showing the original memory performance. Configurations 1-1, 1-2 and 1-3 are for showing the performance of swap, zswap, and this patch under a level of memory pressure (~10% of working set). Configurations 0 and 1-1 are not the main focus of this patch, but I'm adding those since their results transparently show how far this microbenchmark test is from the real world. Because the per-5-seconds performance is not very reliable, I measured the average of that for the last one-minute period of the test program run. I also measured a few vmstat counters including zswpin, zswpout, zswpwb, pswpin and pswpout during the test runs. The measurement results are as below. To save space, I show performance numbers that are normalized to that of the configuration '0' (no memory pressure). The averaged accesses per 5 seconds of configuration '0' was 36493417.75.

config            0        1-1       1-2       1-3
perf_normalized   1.0000   0.0057    0.0235    0.0367
perf_stdev_ratio  0.0582   0.0652    0.0167    0.0346
zswpin            0        0         3548424   1999335
zswpout           0        0         3588817   2361689
zswpwb            0        0         10214     340270
pswpin            0        485806    772038    340967
pswpout           0        649543    144773    340270

'perf_normalized' is the performance metric, normalized to that of configuration '0' (no pressure). 'perf_stdev_ratio' is the standard deviation of the averaged data points, as a ratio to the averaged metric value. For example, configuration '0' showed about 5.8% stdev. Configurations 1-1 and 1-3 had about 6.5% and 6.1% stdev. Also, the results were highly variable between multiple runs, so these numbers are not very stable and only give ballpark figures. Please keep this in mind when reading these results. Under about 10% of working set memory pressure, the performance dropped to about 0.57% of the no-pressure one when the normal swap is used (1-1). Note that ~10% working set pressure is already extreme, at least on this test setup. No one would desire system setups that can degrade performance to 0.57% of the best case. Turning zswap on (1-2) improved the performance about 4x, to about 2.35% of the no-pressure one. Because of the incompressible pages in the third memory region, a significant amount of (non-zswap) swap I/O was still performed, though. Applying this patch (1-3) brought about a 56% performance improvement, to about 3.67% of the no-pressure one. The reduced pswpin of 1-3 compared to 1-2 shows where this improvement came from.

Tests without Zswap Shrinker
----------------------------
Zswap shrinker is not enabled by default, so I ran the above test after disabling zswap shrinker. The results are as below.
config            0        1-1       1-2       1-3
perf_normalized   1.0000   0.0056    0.0185    0.0260
perf_stdev_ratio  0.0467   0.0348    0.1832    0.3387
zswpin            0        0         2506765   6049078
zswpout           0        0         2534357   6115426
zswpwb            0        0         0         0
pswpin            0        463694    472978    0
pswpout           0        686227    612149    0

The overall normalized performance of the different configs is very similar to that of the zswap-shrinker-enabled case. Adding the memory pressure dropped the performance to 0.56% of the original one. Enabling zswap without the zswap shrinker increased the performance to 1.85% of the original one. Applying this patch on top further increased it to 2.6% of the original one. Even though zswap shrinker is disabled, 1-2 shows high numbers of pswpin and pswpout because the incompressible pages are directly swapped out. In the case of 1-3, it shows zero pswpin and pswpout since it keeps incompressible pages in memory, and shows higher performance. Note that the performance of 1-2 and 1-3 varies quite a lot. The standard deviation of the performance for 1-2 was about 18.32% of the performance, while that for 1-3 was about 33.87%. Because zswap shrinker is disabled and the memory pressure is induced by memory.high, the workload got penalty_jiffies sleeps, and this destabilized the performance.

Related Works
-------------
This is not an entirely new attempt. Nhat Pham and Takero Funaki tried very similar approaches in October 2023[2] and April 2024[3], respectively. The two approaches didn't get merged mainly due to the metadata overhead concern. I described why I think that shouldn't be a problem for this change, which is automatically disabled when writeback is disabled, at the beginning of this changelog. This patch is not particularly different from those, and is actually built upon them. I wrote this from scratch again, though. Hence adding Suggested-by tags for them. Actually Nhat first suggested this to me offlist. Historically, writeback disabling was introduced partially as a way to solve the LRU order issue. Yosry pointed out[4] that this is still suboptimal when the incompressible pages are cold, since the incompressible pages will repeatedly be selected for zswap-out, burning CPU cycles on compression attempts that will fail anyway. One imaginable solution for the problem is to reuse the swapped-out page and its struct page as the storage in the zswap pool. But that's out of the scope of this patch.
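For readers skimming the changelog, the heart of the store-path change boils down to the following shape (a condensed sketch of the zswap_compress() hunk below, not a drop-in replacement; writeback_enabled and the reject label stand in for the patch's mem_cgroup_zswap_writeback_enabled() check and unlock path):

	if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
		/* Compression failed or did not shrink the page. */
		if (!writeback_enabled)
			goto reject;	/* storing as-is would only add overhead */
		comp_ret = 0;
		dlen = PAGE_SIZE;	/* length == PAGE_SIZE marks such entries */
		dst = kmap_local_page(page);	/* store the raw page content */
	}

The load side mirrors this: zswap_decompress() sees entry->length == PAGE_SIZE and copies the stored object into the folio without invoking the decompressor.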
[sj@kernel.org: mark zswap_stored_incompressible_pages as static] Link: https://lkml.kernel.org/r/20250821161750.78192-1-sj@kernel.org [sj@kernel.org: v5] Link: https://lkml.kernel.org/r/20250822190817.49287-1-sj@kernel.org [sj@kernel.org: cleanup incompressible pages handling code] Link: https://lkml.kernel.org/r/20250828163913.57957-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250819193404.46680-1-sj@kernel.org Link: https://github.com/sjp38/eval_zswap/blob/master/run.sh [1] Link: https://lore.kernel.org/20231017003519.1426574-3-nphamcs@gmail.com [2] Link: https://lore.kernel.org/20240706022523.1104080-6-flintglass@gmail.com [3] Link: https://lore.kernel.org/CAJD7tkZXS-UJVAFfvxJ0nNgTzWBiqepPYA4hEozi01_qktkitg@mail.gmail.com [4] Signed-off-by: SeongJae Park Suggested-by: Nhat Pham Suggested-by: Takero Funaki Acked-by: Nhat Pham Acked-by: Chris Li Cc: Chengming Zhou Cc: David Hildenbrand Cc: Johannes Weiner Cc: SeongJae Park Cc: Baoquan He Cc: Barry Song Cc: Kairui Song Cc: Herbert Xu Signed-off-by: Andrew Morton --- mm/zswap.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 3c0fd8a13718..0c8dd8876d8e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -42,8 +42,10 @@ /********************************* * statistics **********************************/ -/* The number of compressed pages currently stored in zswap */ +/* The number of pages currently stored in zswap */ atomic_long_t zswap_stored_pages = ATOMIC_LONG_INIT(0); +/* The number of incompressible pages currently stored in zswap */ +static atomic_long_t zswap_stored_incompressible_pages = ATOMIC_LONG_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -811,6 +813,8 @@ static void zswap_entry_free(struct zswap_entry *entry) obj_cgroup_uncharge_zswap(entry->objcg, entry->length); obj_cgroup_put(entry->objcg); } + if (entry->length == PAGE_SIZE) + atomic_long_dec(&zswap_stored_incompressible_pages); zswap_entry_cache_free(entry); atomic_long_dec(&zswap_stored_pages); } @@ -948,6 +952,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, struct zpool *zpool; gfp_t gfp; u8 *dst; + bool mapped = false; acomp_ctx = acomp_ctx_get_cpu_lock(pool); dst = acomp_ctx->buffer; @@ -976,8 +981,26 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, */ comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (comp_ret) - goto unlock; + + /* + * If a page cannot be compressed into a size smaller than PAGE_SIZE, + * save the content as-is without compression, to keep the LRU order + * of writebacks. If writeback is disabled, reject the page since it + * only adds metadata overhead. swap_writeout() will put the page back + * to the active LRU list in that case. + */ + if (comp_ret || !dlen || dlen >= PAGE_SIZE) { + dlen = PAGE_SIZE; + if (!mem_cgroup_zswap_writeback_enabled( + folio_memcg(page_folio(page)))) { + comp_ret = comp_ret ?
comp_ret : -EINVAL; + goto unlock; + } + comp_ret = 0; + dlen = PAGE_SIZE; + dst = kmap_local_page(page); + mapped = true; + } zpool = pool->zpool; gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; @@ -990,6 +1013,8 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, entry->length = dlen; unlock: + if (mapped) + kunmap_local(dst); if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC) zswap_reject_compress_poor++; else if (comp_ret) @@ -1006,12 +1031,18 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct zpool *zpool = entry->pool->zpool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - int decomp_ret, dlen; + int decomp_ret = 0, dlen = PAGE_SIZE; u8 *src, *obj; acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer); + /* zswap entries of length PAGE_SIZE are not compressed. */ + if (entry->length == PAGE_SIZE) { + memcpy_to_folio(folio, 0, obj, entry->length); + goto read_done; + } + /* * zpool_obj_read_begin() might return a kmap address of highmem when * acomp_ctx->buffer is not used. However, sg_init_one() does not @@ -1032,6 +1063,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) decomp_ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; +read_done: zpool_obj_read_end(zpool, entry->handle, obj); acomp_ctx_put_unlock(acomp_ctx); @@ -1524,6 +1556,8 @@ static bool zswap_store_page(struct page *page, obj_cgroup_charge_zswap(objcg, entry->length); } atomic_long_inc(&zswap_stored_pages); + if (entry->length == PAGE_SIZE) + atomic_long_inc(&zswap_stored_incompressible_pages); /* * We finish initializing the entry while it's already in xarray. @@ -1792,6 +1826,14 @@ static int debugfs_get_stored_pages(void *data, u64 *val) } DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n"); +static int debugfs_get_stored_incompressible_pages(void *data, u64 *val) +{ + *val = atomic_long_read(&zswap_stored_incompressible_pages); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(stored_incompressible_pages_fops, + debugfs_get_stored_incompressible_pages, NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1819,6 +1861,9 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, NULL, &total_size_fops); debugfs_create_file("stored_pages", 0444, zswap_debugfs_root, NULL, &stored_pages_fops); + debugfs_create_file("stored_incompressible_pages", 0444, + zswap_debugfs_root, NULL, + &stored_incompressible_pages_fops); return 0; } From 0f9ab62a6e44ef51ea3e7f1c552b447cf4eb20ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Aug 2025 10:30:08 +0200 Subject: [PATCH 070/372] mempool: rename struct mempool_s to struct mempool Drop the pointless _s prefix and align to the usual struct naming to prepare for actually using the struct instead of the typedef so that random headers don't need to include mempool.h for just having a pointer to the mempool. 
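To make the header-dependency point concrete: after the rename, a header that only needs to carry a pointer can forward-declare the struct instead of pulling in mempool.h, exactly as the blkdev.h hunk below does. A minimal hypothetical example (my_ctx and the header name are illustrative, not from the patch):

	/* some_driver.h -- no #include <linux/mempool.h> needed */
	struct mempool;				/* forward declaration */

	struct my_ctx {
		struct mempool *pool;		/* pointer only; layout not required */
	};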
Link: https://lkml.kernel.org/r/20250812083105.371295-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Harry Yoo Reviewed-by: Vlastimil Babka Cc: Christoph Lameter (Ampere) Cc: David Rientjes Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/blkdev.h | 2 +- include/linux/mempool.h | 2 +- include/linux/netfs.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fe1797bbec42..28ceaeffc0c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -199,7 +199,7 @@ struct gendisk { unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; spinlock_t zone_wplugs_lock; - struct mempool_s *zone_wplugs_pool; + struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7b151441341b..34941a4b9026 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -15,7 +15,7 @@ struct kmem_cache; typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data); typedef void (mempool_free_t)(void *element, void *pool_data); -typedef struct mempool_s { +typedef struct mempool { spinlock_t lock; int min_nr; /* nr of elements at *elements */ int curr_nr; /* Current nr of elements at *elements */ diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 98c96d649bf9..72ee7d210a74 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -21,7 +21,7 @@ #include enum netfs_sreq_ref_trace; -typedef struct mempool_s mempool_t; +typedef struct mempool mempool_t; struct folio_queue; /** From 9d246d7410c9c4187e37cc3bfd1284d06b697566 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Tue, 12 Aug 2025 08:12:11 +0000 Subject: [PATCH 071/372] selftests/damon: fix damon selftests by installing _common.sh _common.sh was recently introduced but is not installed and then triggers an error when trying to run the damon selftests: selftests: damon: sysfs.sh ./sysfs.sh: line 4: _common.sh: No such file or directory Install this file to avoid this error. Link: https://lkml.kernel.org/r/20250812-alex-fixes_manual-v1-1-c4e99b1f80e4@rivosinc.com Fixes: 511914506d19 ("selftests/damon: introduce _common.sh to host shared function") Signed-off-by: Alexandre Ghiti Tested-by: Sang-Heon Jeon Reviewed-by: SeongJae Park Tested-by: Enze Li Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 029de547f31c..2180c328a825 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -5,6 +5,7 @@ TEST_GEN_FILES += access_memory access_memory_even TEST_FILES = _damon_sysfs.py TEST_FILES += drgn_dump_damon_status.py +TEST_FILES += _common.sh # functionality tests TEST_PROGS += sysfs.sh From 348e474f18e12c5a27422d0ae3b4316b13dbe3c9 Mon Sep 17 00:00:00 2001 From: Chris Li Date: Tue, 12 Aug 2025 00:10:58 -0700 Subject: [PATCH 072/372] mm/swapfile.c: introduce function alloc_swap_scan_list() Patch series "mm/swapfile.c and swap.h cleanup", v3. This patch series builds on Kairui's swap cluster scan improvement series: https://lore.kernel.org/linux-mm/20250806161748.76651-1-ryncsn@gmail.com/ It introduces a new function, alloc_swap_scan_list(), for swapfile.c. It also cleans up swap.h by removing comments that reference fields that have been deleted.
There are no functional changes in this two-patch series. This patch (of 2): alloc_swap_scan_list() will scan either the whole list or just the first cluster. This reduces the repeated pattern of isolating a cluster and then scanning it. As a result, cluster_alloc_swap_entry() is shorter and shallower. No functional change. Link: https://lkml.kernel.org/r/20250812-swap-scan-list-v3-0-6d73504d267b@kernel.org Link: https://lkml.kernel.org/r/20250812-swap-scan-list-v3-1-6d73504d267b@kernel.org Signed-off-by: Chris Li Reviewed-by: Kairui Song Acked-by: Nhat Pham Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton --- mm/swapfile.c | 86 ++++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4a0cf4fb348d..a7ffabbe65ef 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -820,6 +820,29 @@ out: return found; } +static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, + struct list_head *list, + unsigned int order, + unsigned char usage, + bool scan_all) +{ + unsigned int found = SWAP_ENTRY_INVALID; + + do { + struct swap_cluster_info *ci = isolate_lock_cluster(si, list); + unsigned long offset; + + if (!ci) + break; + offset = cluster_offset(si, ci); + found = alloc_swap_scan_cluster(si, ci, offset, order, usage); + if (found) + break; + } while (scan_all); + + return found; +} + static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) { long to_scan = 1; @@ -913,32 +936,24 @@ new_cluster: * to spread out the writes. */ if (si->flags & SWP_PAGE_DISCARD) { - ci = isolate_lock_cluster(si, &si->free_clusters); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); + if (found) + goto done; } if (order < PMD_ORDER) { - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->nonfull_clusters[order], + order, usage, true); + if (found) + goto done; } if (!(si->flags & SWP_PAGE_DISCARD)) { - ci = isolate_lock_cluster(si, &si->free_clusters); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->free_clusters, order, usage, + false); + if (found) + goto done; } /* Try reclaim full clusters if free and nonfull lists are drained */ @@ -952,13 +967,10 @@ new_cluster: * failure is not critical. Scanning one cluster still * keeps the list rotated and reclaimed (for HAS_CACHE). */ - ci = isolate_lock_cluster(si, &si->frag_clusters[order]); - if (ci) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - order, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->frag_clusters[order], order, + usage, false); + if (found) + goto done; } /* @@ -977,19 +989,15 @@ new_cluster: * Clusters here have at least one usable slots and can't fail order 0 * allocation, but reclaim may drop si->lock and race with another user.
*/ - while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->frag_clusters[o], + 0, usage, true); + if (found) + goto done; - while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) { - found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci), - 0, usage); - if (found) - goto done; - } + found = alloc_swap_scan_list(si, &si->nonfull_clusters[o], + 0, usage, true); + if (found) + goto done; } done: if (!(si->flags & SWP_SOLIDSTATE)) From 85b8cec15034e07500a6e5b8a5aea8185a3d775a Mon Sep 17 00:00:00 2001 From: Chris Li Date: Tue, 12 Aug 2025 00:10:59 -0700 Subject: [PATCH 073/372] mm: swap.h: Remove deleted field from comments The comment for struct swap_info_struct.lock incorrectly mentions fields that have already been deleted from the structure. Update the comments to accurately reflect the current struct swap_info_struct. There is no functional change. Link: https://lkml.kernel.org/r/20250812-swap-scan-list-v3-2-6d73504d267b@kernel.org Signed-off-by: Chris Li Reviewed-by: Kairui Song Acked-by: Nhat Pham Reviewed-by: Barry Song Cc: Baoquan He Cc: "Huang, Ying" Cc: Kemeng Shi Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index a060d102e0d1..c2da85cb7fe7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -320,11 +320,8 @@ struct swap_info_struct { struct completion comp; /* seldom referenced */ spinlock_t lock; /* * protect map scan related fields like - * swap_map, lowest_bit, highest_bit, - * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc, - * highest_alloc, free/discard cluster - * list. other fields are only changed + * swap_map, inuse_pages and all cluster + * lists. other fields are only changed * at swapon/swapoff, so are protected * by swap_lock. changing flags need * hold this lock and swap_lock. If From 50944692052b1e76fd211f4da0798ab19d7fc276 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Wed, 13 Aug 2025 12:30:24 -0700 Subject: [PATCH 074/372] userfaultfd: opportunistic TLB-flush batching for present pages in MOVE MOVE ioctl's runtime is dominated by TLB-flush cost, which is required for moving present pages. Mitigate this cost by opportunistically batching present contiguous pages for TLB flushing. Without batching, in our testing on an arm64 Android device with UFFD GC, which uses MOVE ioctl for compaction, we observed that out of the total time spent in move_pages_pte(), over 40% is in ptep_clear_flush(), and ~20% in vm_normal_folio(). With batching, the proportion of vm_normal_folio() increases to over 70% of move_pages_pte() without any changes to vm_normal_folio(). Furthermore, time spent within move_pages_pte() is only ~20%, which includes TLB-flush overhead. When the GC intensive benchmark, which was used to gather the above numbers, is run on cuttlefish (qemu android instance on x86_64), the completion time of the benchmark went down from ~45mins to ~20mins. Furthermore, system_server, one of the most performance critical system processes on android, saw over 50% reduction in GC compaction time on an arm64 android device. 
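Reduced to its skeleton, the batching in move_present_ptes() below looks roughly like this (a sketch only; locking, rmap moves and error handling are omitted, and next_pte_is_batchable() is a hypothetical stand-in for the patch's check_ptes_for_batched_move()):

	flush_cache_range(src_vma, src_start, src_end);
	while (src_addr < src_end) {
		/* Clear the source PTE without a per-page TLB flush. */
		orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
		/* ... move the folio's anon rmap and install the dst PTE ... */
		src_addr += PAGE_SIZE;
		src_pte++;
		dst_pte++;
		if (!next_pte_is_batchable())
			break;
	}
	/* One ranged flush pays for the whole batch. */
	flush_tlb_range(src_vma, src_start, src_addr);

This trades one IPI-heavy ptep_clear_flush() per page for a single flush_tlb_range() per contiguous run, which is where the reported wins come from.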
[lokeshgidra@google.com: make calculation of largest extent that can be batched unconditional on length, per Barry] Link: https://lkml.kernel.org/r/20250816191123.3601561-1-lokeshgidra@google.com Link: https://lkml.kernel.org/r/20250813193024.2279805-1-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Acked-by: Peter Xu Reviewed-by: Barry Song Cc: Suren Baghdasaryan Cc: Kalesh Singh Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 224 +++++++++++++++++++++++++++++++---------------- 1 file changed, 150 insertions(+), 74 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index aefdf3a812a1..50aaa8dcd24c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1026,18 +1026,64 @@ static inline bool is_pte_pages_stable(pte_t *dst_pte, pte_t *src_pte, pmd_same(dst_pmdval, pmdp_get_lockless(dst_pmd)); } -static int move_present_pte(struct mm_struct *mm, - struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, - unsigned long dst_addr, unsigned long src_addr, - pte_t *dst_pte, pte_t *src_pte, - pte_t orig_dst_pte, pte_t orig_src_pte, - pmd_t *dst_pmd, pmd_t dst_pmdval, - spinlock_t *dst_ptl, spinlock_t *src_ptl, - struct folio *src_folio) +/* + * Checks if the two ptes and the corresponding folio are eligible for batched + * move. If so, then returns pointer to the locked folio. Otherwise, returns NULL. + * + * NOTE: folio's reference is not required as the whole operation is within + * PTL's critical section. + */ +static struct folio *check_ptes_for_batched_move(struct vm_area_struct *src_vma, + unsigned long src_addr, + pte_t *src_pte, pte_t *dst_pte, + struct anon_vma *src_anon_vma) +{ + pte_t orig_dst_pte, orig_src_pte; + struct folio *folio; + + orig_dst_pte = ptep_get(dst_pte); + if (!pte_none(orig_dst_pte)) + return NULL; + + orig_src_pte = ptep_get(src_pte); + if (!pte_present(orig_src_pte) || is_zero_pfn(pte_pfn(orig_src_pte))) + return NULL; + + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); + if (!folio || !folio_trylock(folio)) + return NULL; + if (!PageAnonExclusive(&folio->page) || folio_test_large(folio) || + folio_anon_vma(folio) != src_anon_vma) { + folio_unlock(folio); + return NULL; + } + return folio; +} + +/* + * Moves src folios to dst in a batch as long as they share the same + * anon_vma as the first folio, are not large, and can successfully + * take the lock via folio_trylock(). + */ +static long move_present_ptes(struct mm_struct *mm, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + pte_t *dst_pte, pte_t *src_pte, + pte_t orig_dst_pte, pte_t orig_src_pte, + pmd_t *dst_pmd, pmd_t dst_pmdval, + spinlock_t *dst_ptl, spinlock_t *src_ptl, + struct folio **first_src_folio, unsigned long len, + struct anon_vma *src_anon_vma) { int err = 0; + struct folio *src_folio = *first_src_folio; + unsigned long src_start = src_addr; + unsigned long src_end; + len = pmd_addr_end(dst_addr, dst_addr + len) - dst_addr; + src_end = pmd_addr_end(src_addr, src_addr + len); + flush_cache_range(src_vma, src_addr, src_end); double_pt_lock(dst_ptl, src_ptl); if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte, @@ -1051,31 +1097,56 @@ static int move_present_pte(struct mm_struct *mm, err = -EBUSY; goto out; } + /* It's safe to drop the reference now as the page-table is holding one. 
*/ + folio_put(*first_src_folio); + *first_src_folio = NULL; + arch_enter_lazy_mmu_mode(); - orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte); - /* Folio got pinned from under us. Put it back and fail the move. */ - if (folio_maybe_dma_pinned(src_folio)) { - set_pte_at(mm, src_addr, src_pte, orig_src_pte); - err = -EBUSY; - goto out; + while (true) { + orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte); + /* Folio got pinned from under us. Put it back and fail the move. */ + if (folio_maybe_dma_pinned(src_folio)) { + set_pte_at(mm, src_addr, src_pte, orig_src_pte); + err = -EBUSY; + break; + } + + folio_move_anon_rmap(src_folio, dst_vma); + src_folio->index = linear_page_index(dst_vma, dst_addr); + + orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); + /* Set soft dirty bit so userspace can notice the pte was moved */ +#ifdef CONFIG_MEM_SOFT_DIRTY + orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); +#endif + if (pte_dirty(orig_src_pte)) + orig_dst_pte = pte_mkdirty(orig_dst_pte); + orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); + set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); + + src_addr += PAGE_SIZE; + if (src_addr == src_end) + break; + dst_addr += PAGE_SIZE; + dst_pte++; + src_pte++; + + folio_unlock(src_folio); + src_folio = check_ptes_for_batched_move(src_vma, src_addr, src_pte, + dst_pte, src_anon_vma); + if (!src_folio) + break; } - folio_move_anon_rmap(src_folio, dst_vma); - src_folio->index = linear_page_index(dst_vma, dst_addr); + arch_leave_lazy_mmu_mode(); + if (src_addr > src_start) + flush_tlb_range(src_vma, src_start, src_addr); - orig_dst_pte = folio_mk_pte(src_folio, dst_vma->vm_page_prot); - /* Set soft dirty bit so userspace can notice the pte was moved */ -#ifdef CONFIG_MEM_SOFT_DIRTY - orig_dst_pte = pte_mksoft_dirty(orig_dst_pte); -#endif - if (pte_dirty(orig_src_pte)) - orig_dst_pte = pte_mkdirty(orig_dst_pte); - orig_dst_pte = pte_mkwrite(orig_dst_pte, dst_vma); - - set_pte_at(mm, dst_addr, dst_pte, orig_dst_pte); + if (src_folio) + folio_unlock(src_folio); out: double_pt_unlock(dst_ptl, src_ptl); - return err; + return src_addr > src_start ? src_addr - src_start : err; } static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, @@ -1140,7 +1211,7 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma, set_pte_at(mm, dst_addr, dst_pte, orig_src_pte); double_pt_unlock(dst_ptl, src_ptl); - return 0; + return PAGE_SIZE; } static int move_zeropage_pte(struct mm_struct *mm, @@ -1167,20 +1238,20 @@ static int move_zeropage_pte(struct mm_struct *mm, set_pte_at(mm, dst_addr, dst_pte, zero_pte); double_pt_unlock(dst_ptl, src_ptl); - return 0; + return PAGE_SIZE; } /* - * The mmap_lock for reading is held by the caller. Just move the page - * from src_pmd to dst_pmd if possible, and return true if succeeded - * in moving the page. + * The mmap_lock for reading is held by the caller. Just move the page(s) + * from src_pmd to dst_pmd if possible, and return number of bytes moved. + * On failure, an error code is returned. 
*/ -static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, - struct vm_area_struct *dst_vma, - struct vm_area_struct *src_vma, - unsigned long dst_addr, unsigned long src_addr, - __u64 mode) +static long move_pages_ptes(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, + struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, + unsigned long dst_addr, unsigned long src_addr, + unsigned long len, __u64 mode) { swp_entry_t entry; struct swap_info_struct *si = NULL; @@ -1194,11 +1265,10 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; - int err = 0; + long ret = 0; - flush_cache_range(src_vma, src_addr, src_addr + PAGE_SIZE); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, - src_addr, src_addr + PAGE_SIZE); + src_addr, src_addr + len); mmu_notifier_invalidate_range_start(&range); retry: /* @@ -1212,7 +1282,7 @@ retry: /* Retry if a huge pmd materialized from under us */ if (unlikely(!dst_pte)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } @@ -1231,14 +1301,14 @@ retry: * transparent huge pages under us. */ if (unlikely(!src_pte)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* Sanity checks before the operation */ if (pmd_none(*dst_pmd) || pmd_none(*src_pmd) || pmd_trans_huge(*dst_pmd) || pmd_trans_huge(*src_pmd)) { - err = -EINVAL; + ret = -EINVAL; goto out; } @@ -1246,7 +1316,7 @@ retry: orig_dst_pte = ptep_get(dst_pte); spin_unlock(dst_ptl); if (!pte_none(orig_dst_pte)) { - err = -EEXIST; + ret = -EEXIST; goto out; } @@ -1255,21 +1325,21 @@ retry: spin_unlock(src_ptl); if (pte_none(orig_src_pte)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) - err = -ENOENT; + ret = -ENOENT; else /* nothing to do to move a hole */ - err = 0; + ret = PAGE_SIZE; goto out; } /* If PTE changed after we locked the folio them start over */ if (src_folio && unlikely(!pte_same(src_folio_pte, orig_src_pte))) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } if (pte_present(orig_src_pte)) { if (is_zero_pfn(pte_pfn(orig_src_pte))) { - err = move_zeropage_pte(mm, dst_vma, src_vma, + ret = move_zeropage_pte(mm, dst_vma, src_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl); @@ -1292,14 +1362,14 @@ retry: spin_lock(src_ptl); if (!pte_same(orig_src_pte, ptep_get(src_pte))) { spin_unlock(src_ptl); - err = -EAGAIN; + ret = -EAGAIN; goto out; } folio = vm_normal_folio(src_vma, src_addr, orig_src_pte); if (!folio || !PageAnonExclusive(&folio->page)) { spin_unlock(src_ptl); - err = -EBUSY; + ret = -EBUSY; goto out; } @@ -1313,7 +1383,7 @@ retry: */ if (!locked && folio_test_large(folio)) { spin_unlock(src_ptl); - err = -EAGAIN; + ret = -EAGAIN; goto out; } @@ -1332,7 +1402,7 @@ retry: } if (WARN_ON_ONCE(!folio_test_anon(src_folio))) { - err = -EBUSY; + ret = -EBUSY; goto out; } } @@ -1343,8 +1413,8 @@ retry: pte_unmap(src_pte); pte_unmap(dst_pte); src_pte = dst_pte = NULL; - err = split_folio(src_folio); - if (err) + ret = split_folio(src_folio); + if (ret) goto out; /* have to reacquire the folio after it got split */ folio_unlock(src_folio); @@ -1362,7 +1432,7 @@ retry: src_anon_vma = folio_get_anon_vma(src_folio); if (!src_anon_vma) { /* page was unmapped from under us */ - err = -EAGAIN; + ret = -EAGAIN; goto out; } if (!anon_vma_trylock_write(src_anon_vma)) { @@ -1375,10 +1445,11 @@ retry: } } - err = move_present_pte(mm, dst_vma, src_vma, - dst_addr, src_addr, dst_pte, 
src_pte, - orig_dst_pte, orig_src_pte, dst_pmd, - dst_pmdval, dst_ptl, src_ptl, src_folio); + ret = move_present_ptes(mm, dst_vma, src_vma, + dst_addr, src_addr, dst_pte, src_pte, + orig_dst_pte, orig_src_pte, dst_pmd, + dst_pmdval, dst_ptl, src_ptl, &src_folio, + len, src_anon_vma); } else { struct folio *folio = NULL; @@ -1389,20 +1460,20 @@ retry: pte_unmap(dst_pte); src_pte = dst_pte = NULL; migration_entry_wait(mm, src_pmd, src_addr); - err = -EAGAIN; + ret = -EAGAIN; } else - err = -EFAULT; + ret = -EFAULT; goto out; } if (!pte_swp_exclusive(orig_src_pte)) { - err = -EBUSY; + ret = -EBUSY; goto out; } si = get_swap_device(entry); if (unlikely(!si)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* @@ -1422,7 +1493,7 @@ retry: swap_cache_index(entry)); if (!IS_ERR_OR_NULL(folio)) { if (folio_test_large(folio)) { - err = -EBUSY; + ret = -EBUSY; folio_put(folio); goto out; } @@ -1439,7 +1510,7 @@ retry: goto retry; } } - err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, + ret = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte, orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval, dst_ptl, src_ptl, src_folio, si, entry); } @@ -1466,7 +1537,7 @@ out: if (si) put_swap_device(si); - return err; + return ret; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -1737,7 +1808,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, { struct mm_struct *mm = ctx->mm; struct vm_area_struct *src_vma, *dst_vma; - unsigned long src_addr, dst_addr; + unsigned long src_addr, dst_addr, src_end; pmd_t *src_pmd, *dst_pmd; long err = -EINVAL; ssize_t moved = 0; @@ -1780,8 +1851,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, if (err) goto out_unlock; - for (src_addr = src_start, dst_addr = dst_start; - src_addr < src_start + len;) { + for (src_addr = src_start, dst_addr = dst_start, src_end = src_start + len; + src_addr < src_end;) { spinlock_t *ptl; pmd_t dst_pmdval; unsigned long step_size; @@ -1849,6 +1920,8 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, dst_addr, src_addr); step_size = HPAGE_PMD_SIZE; } else { + long ret; + if (pmd_none(*src_pmd)) { if (!(mode & UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES)) { err = -ENOENT; @@ -1865,10 +1938,13 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, unsigned long dst_start, break; } - err = move_pages_pte(mm, dst_pmd, src_pmd, - dst_vma, src_vma, - dst_addr, src_addr, mode); - step_size = PAGE_SIZE; + ret = move_pages_ptes(mm, dst_pmd, src_pmd, + dst_vma, src_vma, dst_addr, + src_addr, src_end - src_addr, mode); + if (ret < 0) + err = ret; + else + step_size = ret; } cond_resched(); From 868ade323e9deff67b8be3e93876596e4d2c71d3 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 31 Jul 2025 10:50:05 +0800 Subject: [PATCH 075/372] rust: allocator: add KUnit tests for alignment guarantees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a test module to verify memory alignment guarantees for Rust kernel allocators. The tests cover `Kmalloc`, `Vmalloc` and `KVmalloc` allocators with both standard and large page-aligned allocations. Key features of the tests: 1. Creates alignment-constrained types: - 128-byte aligned `Blob` - 8192-byte (4-page) aligned `LargeAlignBlob` 2. Validates allocators using `TestAlign` helper which: - Checks address alignment masks - Supports uninitialized allocations 3. 
Tests all three allocators with both alignment requirements: - Kmalloc with 128B and 8192B - Vmalloc with 128B and 8192B - KVmalloc with 128B and 8192B Link: https://lkml.kernel.org/r/d2e3d6454c1435713be0fe3c0dc444d2c60bba51.1753929369.git.zhuhui@kylinos.cn Co-developed-by: Geliang Tang Signed-off-by: Geliang Tang Signed-off-by: Hui Zhu Reviewed-by: Kunwu Chan Acked-by: Danilo Krummrich Cc: Alex Gaynor Cc: Alice Ryhl Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Gary Guo Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Miguel Ojeda Cc: Trevor Gross Cc: "Uladzislau Rezki (Sony)" Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/kernel/alloc/allocator.rs | 56 ++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/rust/kernel/alloc/allocator.rs b/rust/kernel/alloc/allocator.rs index f4ae0cf0a594..b561e7a57bb8 100644 --- a/rust/kernel/alloc/allocator.rs +++ b/rust/kernel/alloc/allocator.rs @@ -190,3 +190,59 @@ unsafe impl Allocator for KVmalloc { unsafe { ReallocFunc::KVREALLOC.call(ptr, layout, old_layout, flags, nid) } } } + +#[macros::kunit_tests(rust_allocator)] +mod tests { + use super::*; + use core::mem::MaybeUninit; + use kernel::prelude::*; + + #[test] + fn test_alignment() -> Result { + const TEST_SIZE: usize = 1024; + const TEST_LARGE_ALIGN_SIZE: usize = kernel::page::PAGE_SIZE * 4; + + // These two structs are used to test allocating aligned memory. + // they don't need to be accessed, so they're marked as dead_code. + #[expect(dead_code)] + #[repr(align(128))] + struct Blob([u8; TEST_SIZE]); + #[expect(dead_code)] + #[repr(align(8192))] + struct LargeAlignBlob([u8; TEST_LARGE_ALIGN_SIZE]); + + struct TestAlign<T, A: Allocator>(Box<MaybeUninit<T>, A>); + impl<T, A: Allocator> TestAlign<T, A> { + fn new() -> Result<Self> { + Ok(Self(Box::<_, A>::new_uninit(GFP_KERNEL)?)) + } + + fn is_aligned_to(&self, align: usize) -> bool { + assert!(align.is_power_of_two()); + + let addr = self.0.as_ptr() as usize; + addr & (align - 1) == 0 + } + } + + let ta = TestAlign::<Blob, Kmalloc>::new()?; + assert!(ta.is_aligned_to(128)); + + let ta = TestAlign::<LargeAlignBlob, Kmalloc>::new()?; + assert!(ta.is_aligned_to(8192)); + + let ta = TestAlign::<Blob, Vmalloc>::new()?; + assert!(ta.is_aligned_to(128)); + + let ta = TestAlign::<LargeAlignBlob, Vmalloc>::new()?; + assert!(ta.is_aligned_to(8192)); + + let ta = TestAlign::<Blob, KVmalloc>::new()?; + assert!(ta.is_aligned_to(128)); + + let ta = TestAlign::<LargeAlignBlob, KVmalloc>::new()?; + assert!(ta.is_aligned_to(8192)); + + Ok(()) + } +} From ec45783fce52f358c9e8680d2837bc0d477f16ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Aug 2025 16:57:55 +0200 Subject: [PATCH 076/372] memcg: optimize exit to user space memcg uses TIF_NOTIFY_RESUME to handle reclaiming on exit to user space. TIF_NOTIFY_RESUME is a multiplexing TIF bit, which is utilized by other entities as well. This results in an unconditional mem_cgroup_handle_over_high() call for every invocation of resume_user_mode_work(), which is a pointless exercise as most of the time there is no reclaim work to do. Especially since RSEQ is used by glibc, TIF_NOTIFY_RESUME is raised quite frequently and the empty calls show up in exit path profiling. Optimize this by doing a quick check of the reclaim condition before invoking it.
[akpm@linux-foundation.org: remove now-unneeded test of memcg_nr_pages_over_high==0, per Shakeel] Link: https://lkml.kernel.org/r/87tt2b6zgs.ffs@tglx Signed-off-by: Thomas Gleixner Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Michal Hocko Cc: Muchun Song Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 +++++++- mm/memcontrol.c | 7 ++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 785173aa0739..9fa3afc90dd5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -900,7 +900,13 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } -void mem_cgroup_handle_over_high(gfp_t gfp_mask); +void __mem_cgroup_handle_over_high(gfp_t gfp_mask); + +static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) +{ + if (unlikely(current->memcg_nr_pages_over_high)) + __mem_cgroup_handle_over_high(gfp_mask); +} unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8dd7fbed5a94..9712a751690f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2203,7 +2203,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * try_charge() (context permitting), as well as from the userland * return path where reclaim is always able to block. */ -void mem_cgroup_handle_over_high(gfp_t gfp_mask) +void __mem_cgroup_handle_over_high(gfp_t gfp_mask) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2213,9 +2213,6 @@ void mem_cgroup_handle_over_high(gfp_t gfp_mask) struct mem_cgroup *memcg; bool in_retry = false; - if (likely(!nr_pages)) - return; - memcg = get_mem_cgroup_from_mm(current->mm); current->memcg_nr_pages_over_high = 0; @@ -2486,7 +2483,7 @@ done_restock: if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && gfpflags_allow_blocking(gfp_mask)) - mem_cgroup_handle_over_high(gfp_mask); + __mem_cgroup_handle_over_high(gfp_mask); return 0; } From 0ee82798282aeede8b11553593f7920b15b2240f Mon Sep 17 00:00:00 2001 From: Liao Yuanhong Date: Wed, 13 Aug 2025 17:45:43 +0800 Subject: [PATCH 077/372] lib/test_maple_tree.c: remove redundant semicolons Remove unnecessary semicolons. Link: https://lkml.kernel.org/r/20250813094543.555906-1-liaoyuanhong@vivo.com Signed-off-by: Liao Yuanhong Reviewed-by: Dev Jain Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index cb3936595b0d..1433ecc854cb 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -3562,7 +3562,7 @@ static noinline void __init check_state_handling(struct maple_tree *mt) MT_BUG_ON(mt, mas.last != 0x1500); MT_BUG_ON(mt, !mas_is_active(&mas)); - /* find: start ->active on value */; + /* find: start ->active on value */ mas_set(&mas, 1200); entry = mas_find(&mas, ULONG_MAX); MT_BUG_ON(mt, entry != ptr); From 668208b161a0b679427e7d0f34c0a65fd7d23979 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Thu, 14 Aug 2025 12:06:14 +0000 Subject: [PATCH 078/372] riscv: use an atomic xchg in pudp_huge_get_and_clear() Make sure we return the right pud value and not a value that could have been overwritten in between by a different core. 
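As a hedged userspace analogue of the fix (C11 atomics rather than the kernel's atomic_long_xchg(), but the same semantics): an atomic exchange returns the old value and installs the new one in a single indivisible step, so no other CPU can slip a store in between the read and the clear:

	#include <stdatomic.h>

	/* Analogue of the new pudp_huge_get_and_clear(): return the old
	 * entry and clear it in one shot, instead of a racy load followed
	 * by a separate store. */
	static unsigned long get_and_clear(_Atomic unsigned long *entry)
	{
		return atomic_exchange(entry, 0);
	}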
Link: https://lkml.kernel.org/r/20250814-dev-alex-thp_pud_xchg-v1-1-b4704dfae206@rivosinc.com Fixes: c3cc2a4a3a23 ("riscv: Add support for PUD THP") Signed-off-by: Alexandre Ghiti Cc: Andrew Donnellan Cc: Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgtable.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 91697fbf1f90..e69346307e78 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -942,6 +942,17 @@ static inline int pudp_test_and_clear_young(struct vm_area_struct *vma, return ptep_test_and_clear_young(vma, address, (pte_t *)pudp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long address, pud_t *pudp) +{ + pud_t pud = __pud(atomic_long_xchg((atomic_long_t *)pudp, 0)); + + page_table_check_pud_clear(mm, pud); + + return pud; +} + static inline int pud_young(pud_t pud) { return pte_young(pud_pte(pud)); From 6a204d4b14c99232e05d35305c27ebce1c009840 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 14 Aug 2025 14:22:45 -0300 Subject: [PATCH 079/372] mm/page_alloc: only set ALLOC_HIGHATOMIC for __GFP_HIGH allocations Commit 524c48072e56 ("mm/page_alloc: rename ALLOC_HIGH to ALLOC_MIN_RESERVE") is the start of a series that explains how __GFP_HIGH, which implies ALLOC_MIN_RESERVE, is going to be used instead of __GFP_ATOMIC for high atomic reserves. Commit eb2e2b425c69 ("mm/page_alloc: explicitly record high-order atomic allocations in alloc_flags") introduced ALLOC_HIGHATOMIC for such allocations of order higher than 0. It still used __GFP_ATOMIC, though. Then, commit 1ebbb21811b7 ("mm/page_alloc: explicitly define how __GFP_HIGH non-blocking allocations accesses reserves") just turned that into a check for !__GFP_DIRECT_RECLAIM, ignoring that high atomic reserves were expected to test for __GFP_HIGH. This leads to high atomic reserves being added for high-order GFP_NOWAIT allocations and others that clear __GFP_DIRECT_RECLAIM, which is unexpected. Later, those reserves lead to 0-order allocations going to the slow path and starting reclaim.
From /proc/pagetypeinfo, without the patch:

Node 0, zone    DMA, type HighAtomic      0      0      0      0      0      0      0      0      0      0      0
Node 0, zone  DMA32, type HighAtomic      1      8     10      9      7      3      0      0      0      0      0
Node 0, zone Normal, type HighAtomic     64     20     12      5      0      0      0      0      0      0      0

With the patch:

Node 0, zone    DMA, type HighAtomic      0      0      0      0      0      0      0      0      0      0      0
Node 0, zone  DMA32, type HighAtomic      0      0      0      0      0      0      0      0      0      0      0
Node 0, zone Normal, type HighAtomic      0      0      0      0      0      0      0      0      0      0      0

Link: https://lkml.kernel.org/r/20250814172245.1259625-1-cascardo@igalia.com Fixes: 1ebbb21811b7 ("mm/page_alloc: explicitly define how __GFP_HIGH non-blocking allocations accesses reserves") Signed-off-by: Thadeu Lima de Souza Cascardo Tested-by: Helen Koike Reviewed-by: Vlastimil Babka Tested-by: Sergey Senozhatsky Acked-by: Michal Hocko Cc: Mel Gorman Cc: Matthew Wilcox Cc: NeilBrown Cc: Thierry Reding Cc: Brendan Jackman Cc: Johannes Weiner Cc: Suren Baghdasaryan Cc: Zi Yan Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d1d037f97c5f..09241bb7663e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4408,7 +4408,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) if (!(gfp_mask & __GFP_NOMEMALLOC)) { alloc_flags |= ALLOC_NON_BLOCK; - if (order > 0) + if (order > 0 && (alloc_flags & ALLOC_MIN_RESERVE)) alloc_flags |= ALLOC_HIGHATOMIC; } From a3f451ad33791d56455c5434c5c30b427e147be8 Mon Sep 17 00:00:00 2001 From: Enze Li Date: Thu, 14 Aug 2025 20:54:16 +0800 Subject: [PATCH 080/372] selftests/damon/access_memory_even: remove unused header file Since the time.h header file is not actually needed in this code, we can safely remove its inclusion. Link: https://lkml.kernel.org/r/20250814125417.659937-1-lienze@kylinos.cn Signed-off-by: Enze Li Reviewed-by: SeongJae Park Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/access_memory_even.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/damon/access_memory_even.c b/tools/testing/selftests/damon/access_memory_even.c index a9f4e9aaf3a9..93f3a71bcfd4 100644 --- a/tools/testing/selftests/damon/access_memory_even.c +++ b/tools/testing/selftests/damon/access_memory_even.c @@ -9,7 +9,6 @@ #include #include #include -#include int main(int argc, char *argv[]) { From 0c04015d45e6f102e14bde69b05a0a0591923248 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Thu, 14 Aug 2025 17:00:52 +0800 Subject: [PATCH 081/372] mm/page_alloc: simplify lowmem_reserve max calculation Use max() to find the maximum lowmem_reserve value and min_t() to cap it to managed_pages in calculate_totalreserve_pages(), instead of open-coding the comparisons. No functional change.
[liuye@kylinos.cn: fix layout, use min_t] Link: https://lkml.kernel.org/r/20250815024509.37900-1-ye.liu@linux.dev Link: https://lkml.kernel.org/r/20250814090053.22241-1-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: Johannes Weiner Acked-by: Zi Yan Reviewed-by: Wei Yang Cc: Brendan Jackman Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 09241bb7663e..fd55ca824c47 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6236,16 +6236,13 @@ static void calculate_totalreserve_pages(void) unsigned long managed_pages = zone_managed_pages(zone); /* Find valid and maximum lowmem_reserve in the zone */ - for (j = i; j < MAX_NR_ZONES; j++) { - if (zone->lowmem_reserve[j] > max) - max = zone->lowmem_reserve[j]; - } + for (j = i; j < MAX_NR_ZONES; j++) + max = max(max, zone->lowmem_reserve[j]); /* we treat the high watermark as reserved pages. */ max += high_wmark_pages(zone); - if (max > managed_pages) - max = managed_pages; + max = min_t(unsigned long, max, managed_pages); pgdat->totalreserve_pages += max; From e4fe1388dfbdd9641331071cd9b4a9efa16da34d Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Thu, 14 Aug 2025 15:37:59 +0800 Subject: [PATCH 082/372] mm: fix typos in VMA comments Fix the following typos in VMA-related files: 1. "operationr" -> "operation" in mm/vma.h 2. "initialisaing" -> "initializing" in mm/vma_init.c Link: https://lkml.kernel.org/r/20250814073800.13617-1-ye.liu@linux.dev Signed-off-by: Ye Liu Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/vma.h | 2 +- mm/vma_init.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vma.h b/mm/vma.h index b123a9cdedb0..bcdc261c5b15 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -145,7 +145,7 @@ struct vma_merge_struct { */ bool __remove_middle :1; /* - * Internal flag used during the merge operationr to indicate we will + * Internal flag used during the merge operation to indicate we will * remove vmg->next. */ bool __remove_next :1; diff --git a/mm/vma_init.c b/mm/vma_init.c index 8e53c7943561..d847c6557261 100644 --- a/mm/vma_init.c +++ b/mm/vma_init.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared + * Functions for initializing, allocating, freeing and duplicating VMAs. Shared * between CONFIG_MMU and non-CONFIG_MMU kernel configurations. */ From 5922deb3ecc0e3c6027e4779e00d2505ebeba9df Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Thu, 14 Aug 2025 15:18:28 +0800 Subject: [PATCH 083/372] mm/page_alloc: remove redundant pcp->free_count initialization in per_cpu_pages_init() In per_cpu_pages_init(), pcp->free_count is explicitly initialized to 0, but this is redundant because the entire struct is already zeroed by memset(pcp, 0, sizeof(*pcp)). 
Link: https://lkml.kernel.org/r/20250814071828.12036-1-ye.liu@linux.dev Signed-off-by: Ye Liu Reviewed-by: Brendan Jackman Acked-by: Johannes Weiner Reviewed-by: Zi Yan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd55ca824c47..2ee21e46f0fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5946,7 +5946,6 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta pcp->high_min = BOOT_PAGESET_HIGH; pcp->high_max = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; - pcp->free_count = 0; } static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, From eda0bf339b41a948af745e0b17db56037813c04f Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Sat, 16 Aug 2025 09:31:07 +0530 Subject: [PATCH 084/372] mm/selftests: fix incorrect pointer being passed to mark_range() Patch series "selftests/mm: Fix false positives and skip unsupported tests", v4. This patch series addresses false positives in the generic mm selftests and skips tests that cannot run correctly due to missing features or system limitations. This patch (of 7): In main(), the high address is stored in hptr, but for mark_range(), the address passed is ptr, not hptr. Fix this by changing ptr[i] to hptr[i] in the mark_range() call. Link: https://lkml.kernel.org/r/20250816040113.760010-1-aboorvad@linux.ibm.com Link: https://lkml.kernel.org/r/20250816040113.760010-2-aboorvad@linux.ibm.com Fixes: b2a79f62133a ("selftests/mm: virtual_address_range: unmap chunks after validation") Co-developed-by: Aboorva Devarajan Signed-off-by: Aboorva Devarajan Signed-off-by: Donet Tom Reviewed-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Barry Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/virtual_address_range.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 169dbd692bf5..e24c36a39f22 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -227,7 +227,7 @@ int main(int argc, char *argv[]) if (hptr[i] == MAP_FAILED) break; - mark_range(ptr[i], MAP_CHUNK_SIZE); + mark_range(hptr[i], MAP_CHUNK_SIZE); validate_addr(hptr[i], 1); } hchunks = i; From 0ef3783d7558de874570696b681fcbb173f42e81 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Sat, 16 Aug 2025 09:31:08 +0530 Subject: [PATCH 085/372] selftests/mm: add support to test 4PB VA on PPC64 PowerPC64 supports a 4PB virtual address space, but this test was previously limited to 512TB. This patch extends the coverage up to the full 4PB VA range on PowerPC64. Memory from 0 to 128TB is allocated without an address hint, while allocations from 128TB to 4PB use a hint address.
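A hedged userspace sketch of the hint behavior the extended test relies on (function name and size are illustrative): with a zero hint, ppc64 mmap() stays below 128TB; passing a hint at or above 1UL << 47 (128TB, the ADDR_MARK_128TB value in the diff below) makes the larger range available:

	#include <sys/mman.h>
	#include <stddef.h>

	static void *map_chunk(unsigned long hint, size_t size)
	{
		/* hint == 0: the kernel picks an address below 128TB.
		 * hint >= 1UL << 47: the mapping may land in the
		 * 128TB-4PB range. */
		return mmap((void *)hint, size, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	}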
Link: https://lkml.kernel.org/r/20250816040113.760010-3-aboorvad@linux.ibm.com Co-developed-by: Aboorva Devarajan Signed-off-by: Aboorva Devarajan Signed-off-by: Donet Tom Reviewed-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Barry Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/virtual_address_range.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index e24c36a39f22..81b33d8f78f4 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -44,12 +44,18 @@ * On Arm64 the address space is 256TB and support for * high mappings up to 4PB virtual address space has * been added. + * + * On PowerPC64, the address space up to 128TB can be + * mapped without a hint. Addresses beyond 128TB, up to + * 4PB, can be mapped with a hint. + * */ #define NR_CHUNKS_128TB ((128 * SZ_1TB) / MAP_CHUNK_SIZE) /* Number of chunks for 128TB */ #define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) #define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) #define NR_CHUNKS_3840TB (NR_CHUNKS_128TB * 30UL) +#define NR_CHUNKS_3968TB (NR_CHUNKS_128TB * 31UL) #define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ #define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ @@ -59,6 +65,11 @@ #define HIGH_ADDR_SHIFT 49 #define NR_CHUNKS_LOW NR_CHUNKS_256TB #define NR_CHUNKS_HIGH NR_CHUNKS_3840TB +#elif defined(__PPC64__) +#define HIGH_ADDR_MARK ADDR_MARK_128TB +#define HIGH_ADDR_SHIFT 48 +#define NR_CHUNKS_LOW NR_CHUNKS_128TB +#define NR_CHUNKS_HIGH NR_CHUNKS_3968TB #else #define HIGH_ADDR_MARK ADDR_MARK_128TB #define HIGH_ADDR_SHIFT 48 From 08c907c5bc0ae8c9f0741c9622366ed30a1f6232 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Sat, 16 Aug 2025 09:31:09 +0530 Subject: [PATCH 086/372] selftest/mm: fix ksm_functional_test failures This patch fixes 2 issues. 1) After fork() in test_prctl_fork, the child process uses the file descriptors from the parent process to read ksm_stat and ksm_merging_pages. This results in incorrect values being read (parent process ksm_stat and ksm_merging_pages will be read in child), causing the test to fail. This patch calls init_global_file_handles() in the child process to ensure that the current process's file descriptors are used to read ksm_stat and ksm_merging_pages. 2) All tests currently call ksm_merge to trigger page merging. To ensure the system remains in a consistent state for subsequent tests, it is better to call ksm_unmerge during the test cleanup phase. In the test_prctl_fork test, after a fork(), reading ksm_merging_pages in the child process returns a non-zero value because a previous test performed a merge, and the child's memory state is inherited from the parent. Although the child process calls ksm_unmerge, the ksm_merging_pages counter in the parent is reset to zero, while the child's counter remains unchanged. This discrepancy causes the test to fail. To avoid this issue, each test should call ksm_unmerge during cleanup to ensure the counter is reset and the system is in a clean state for subsequent tests. The execv() argv argument is an array of pointers to null-terminated strings and must itself be terminated by a NULL pointer. In this patch we also add the missing NULL terminator to the execv() argument array.
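On the execv() point: the argv array carries no length, so execv() finds its end by scanning for a NULL pointer, and a missing terminator makes it read past the array. A minimal correct call (argument strings are illustrative):

	#include <unistd.h>

	static void exec_child(void)
	{
		/* The trailing NULL is mandatory. */
		char *argv[] = { "./ksm_functional_tests", "child", NULL };

		execv(argv[0], argv);
		/* Only reached if execv() fails. */
	}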
Link: https://lkml.kernel.org/r/20250816040113.760010-4-aboorvad@linux.ibm.com Fixes: 6c47de3be3a0 ("selftest/mm: ksm_functional_tests: extend test case for ksm fork/exec") Co-developed-by: Aboorva Devarajan Signed-off-by: Aboorva Devarajan Signed-off-by: Donet Tom Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Shuah Khan Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/ksm_functional_tests.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index d8bd1911dfc0..996dc6645570 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -46,6 +46,8 @@ static int ksm_use_zero_pages_fd; static int pagemap_fd; static size_t pagesize; +static void init_global_file_handles(void); + static bool range_maps_duplicates(char *addr, unsigned long size) { unsigned long offs_a, offs_b, pfn_a, pfn_b; @@ -274,6 +276,7 @@ static void test_unmerge(void) ksft_test_result(!range_maps_duplicates(map, size), "Pages were unmerged\n"); unmap: + ksm_unmerge(); munmap(map, size); } @@ -338,6 +341,7 @@ static void test_unmerge_zero_pages(void) ksft_test_result(!range_maps_duplicates(map, size), "KSM zero pages were unmerged\n"); unmap: + ksm_unmerge(); munmap(map, size); } @@ -366,6 +370,7 @@ static void test_unmerge_discarded(void) ksft_test_result(!range_maps_duplicates(map, size), "Pages were unmerged\n"); unmap: + ksm_unmerge(); munmap(map, size); } @@ -452,6 +457,7 @@ static void test_unmerge_uffd_wp(void) close_uffd: close(uffd); unmap: + ksm_unmerge(); munmap(map, size); } #endif @@ -515,6 +521,7 @@ static int test_child_ksm(void) else if (map == MAP_MERGE_SKIP) return -3; + ksm_unmerge(); munmap(map, size); return 0; } @@ -548,6 +555,7 @@ static void test_prctl_fork(void) child_pid = fork(); if (!child_pid) { + init_global_file_handles(); exit(test_child_ksm()); } else if (child_pid < 0) { ksft_test_result_fail("fork() failed\n"); @@ -595,7 +603,7 @@ static void test_prctl_fork_exec(void) return; } else if (child_pid == 0) { char *prg_name = "./ksm_functional_tests"; - char *argv_for_program[] = { prg_name, FORK_EXEC_CHILD_PRG_NAME }; + char *argv_for_program[] = { prg_name, FORK_EXEC_CHILD_PRG_NAME, NULL }; execv(prg_name, argv_for_program); return; @@ -644,6 +652,7 @@ static void test_prctl_unmerge(void) ksft_test_result(!range_maps_duplicates(map, size), "Pages were unmerged\n"); unmap: + ksm_unmerge(); munmap(map, size); } @@ -677,6 +686,7 @@ static void test_prot_none(void) ksft_test_result(!range_maps_duplicates(map, size), "Pages were unmerged\n"); unmap: + ksm_unmerge(); munmap(map, size); } From 7bc857ddeeaa35a059f6e645365526e10e1f3511 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Sat, 16 Aug 2025 09:31:10 +0530 Subject: [PATCH 087/372] mm/selftests: fix split_huge_page_test failure on systems with 64KB page size The split_huge_page_test fails on systems with a 64KB base page size. This is because the order of a 2MB huge page is different: On 64KB systems, the order is 5. On 4KB systems, it's 9. The test currently assumes a maximum huge page order of 9, which is only valid for 4KB base page systems. On systems with 64KB pages, attempting to split huge pages beyond their actual order (5) causes the test to fail. 
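Concretely, the PMD-sized THP order is log2(pmd_size / base_page_size): with a 4KB base page, 2MB / 4KB = 512 pages, hence order 9; with a 64KB base page, 2MB / 64KB = 32 pages, hence order 5.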
In this patch, we calculate the huge page order based on the system's base page size. With this change, the tests now run successfully on both 64KB and 4KB page size systems. Link: https://lkml.kernel.org/r/20250816040113.760010-5-aboorvad@linux.ibm.com Fixes: fa6c02315f74 ("mm: huge_memory: a new debugfs interface for splitting THP tests") Co-developed-by: Aboorva Devarajan Signed-off-by: Aboorva Devarajan Signed-off-by: Donet Tom Reviewed-by: Dev Jain Reviewed-by: Zi Yan Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 5 ----- .../selftests/mm/split_huge_page_test.c | 22 ++++++++++++------- tools/testing/selftests/mm/uffd-wp-mremap.c | 5 ----- tools/testing/selftests/mm/vm_util.h | 5 +++++ 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index c744c603d688..b51fbeb93751 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -41,11 +41,6 @@ static size_t hugetlbsizes[10]; static int gup_fd; static bool has_huge_zeropage; -static int sz2ord(size_t size) -{ - return __builtin_ctzll(size / pagesize); -} - static int detect_thp_sizes(size_t sizes[], int max) { int count = 0; diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index bf40e6b121ab..505b5bb1829e 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -525,6 +525,9 @@ int main(int argc, char **argv) const char *fs_loc; bool created_tmp; int offset; + unsigned int max_order; + unsigned int nr_pages; + unsigned int tests; ksft_print_header(); @@ -536,35 +539,38 @@ int main(int argc, char **argv) if (argc > 1) optional_xfs_path = argv[1]; - ksft_set_plan(1+8+1+9+9+8*4+2); - pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; pmd_pagesize = read_pmd_pagesize(); if (!pmd_pagesize) ksft_exit_fail_msg("Reading PMD pagesize failed\n"); + nr_pages = pmd_pagesize / pagesize; + max_order = sz2ord(pmd_pagesize); + tests = 2 + (max_order - 1) + (2 * max_order) + (max_order - 1) * 4 + 2; + ksft_set_plan(tests); + fd_size = 2 * pmd_pagesize; split_pmd_zero_pages(); - for (i = 0; i < 9; i++) + for (i = 0; i < max_order; i++) if (i != 1) split_pmd_thp_to_order(i); split_pte_mapped_thp(); - for (i = 0; i < 9; i++) + for (i = 0; i < max_order; i++) split_file_backed_thp(i); created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template, &fs_loc); - for (i = 8; i >= 0; i--) + for (i = max_order - 1; i >= 0; i--) split_thp_in_pagecache_to_order_at(fd_size, fs_loc, i, -1); - for (i = 0; i < 9; i++) + for (i = 0; i < max_order; i++) for (offset = 0; - offset < pmd_pagesize / pagesize; - offset += MAX(pmd_pagesize / pagesize / 4, 1 << i)) + offset < nr_pages; + offset += MAX(nr_pages / 4, 1 << i)) split_thp_in_pagecache_to_order_at(fd_size, fs_loc, i, offset); cleanup_thp_fs(fs_loc, created_tmp); diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c index c2ba7d46c7b4..e1193550e717 100644 --- a/tools/testing/selftests/mm/uffd-wp-mremap.c +++ b/tools/testing/selftests/mm/uffd-wp-mremap.c @@ -19,11 +19,6 @@ static size_t thpsizes[20]; static int nr_hugetlbsizes; static size_t hugetlbsizes[10]; -static int sz2ord(size_t size) -{ - return __builtin_ctzll(size 
/ pagesize); -} - static int detect_thp_sizes(size_t sizes[], int max) { int count = 0; diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 1843ad48d32b..85f7dae9a0c1 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -127,6 +127,11 @@ static inline void log_test_result(int result) ksft_test_result_report(result, "%s\n", test_name); } +static inline int sz2ord(size_t size) +{ + return __builtin_ctzll(size / getpagesize()); +} + void *sys_mremap(void *old_address, unsigned long old_size, unsigned long new_size, int flags, void *new_address); From 2d941088f4570f50df8a079b3ca25c392226fb38 Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Sat, 16 Aug 2025 09:31:11 +0530 Subject: [PATCH 088/372] selftests/mm: fix child process exit codes in ksm_functional_tests In ksm_functional_tests, test_child_ksm() returned negative values to indicate errors. However, when passed to exit(), these were interpreted as large unsigned values (e.g, -2 became 254), leading to incorrect handling in the parent process. As a result, some tests appeared to be skipped or silently failed. This patch changes test_child_ksm() to return positive error codes (1, 2, 3) and updates test_child_ksm_err() to interpret them correctly. Additionally, test_prctl_fork_exec() now uses exit(4) after a failed execv() to clearly signal exec failures. This ensures the parent accurately detects and reports child process failures. -------------- Before patch: -------------- - [RUN] test_unmerge ok 1 Pages were unmerged ... - [RUN] test_prctl_fork - No pages got merged - [RUN] test_prctl_fork_exec ok 7 PR_SET_MEMORY_MERGE value is inherited ... Bail out! 1 out of 8 tests failed - Planned tests != run tests (9 != 8) - Totals: pass:7 fail:1 xfail:0 xpass:0 skip:0 error:0 -------------- After patch: -------------- - [RUN] test_unmerge ok 1 Pages were unmerged ... - [RUN] test_prctl_fork - No pages got merged not ok 7 Merge in child failed - [RUN] test_prctl_fork_exec ok 8 PR_SET_MEMORY_MERGE value is inherited ... Bail out! 2 out of 9 tests failed - Totals: pass:7 fail:2 xfail:0 xpass:0 skip:0 error:0 Link: https://lkml.kernel.org/r/20250816040113.760010-6-aboorvad@linux.ibm.com Fixes: 6c47de3be3a0 ("selftest/mm: ksm_functional_tests: extend test case for ksm fork/exec") Co-developed-by: Donet Tom Signed-off-by: Donet Tom Signed-off-by: Aboorva Devarajan Acked-by: David Hildenbrand Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Shuah Khan Cc: Zi Yan Signed-off-by: Andrew Morton --- .../testing/selftests/mm/ksm_functional_tests.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 996dc6645570..534aa405cac7 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -512,14 +512,14 @@ static int test_child_ksm(void) /* Test if KSM is enabled for the process. */ if (prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) != 1) - return -1; + return 1; /* Test if merge could really happen. 
*/ map = __mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_NONE); if (map == MAP_MERGE_FAIL) - return -2; + return 2; else if (map == MAP_MERGE_SKIP) - return -3; + return 3; ksm_unmerge(); munmap(map, size); @@ -528,12 +528,14 @@ static int test_child_ksm(void) static void test_child_ksm_err(int status) { - if (status == -1) + if (status == 1) ksft_test_result_fail("unexpected PR_GET_MEMORY_MERGE result in child\n"); - else if (status == -2) + else if (status == 2) ksft_test_result_fail("Merge in child failed\n"); - else if (status == -3) + else if (status == 3) ksft_test_result_skip("Merge in child skipped\n"); + else if (status == 4) + ksft_test_result_fail("Binary not found\n"); } /* Verify that prctl ksm flag is inherited. */ @@ -606,7 +608,7 @@ static void test_prctl_fork_exec(void) char *argv_for_program[] = { prg_name, FORK_EXEC_CHILD_PRG_NAME, NULL }; execv(prg_name, argv_for_program); - return; + exit(4); } if (waitpid(child_pid, &status, 0) > 0) { From e36215431ce2e381bd3ff0bf2d16d1fe5792ddba Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Sat, 16 Aug 2025 09:31:12 +0530 Subject: [PATCH 089/372] selftests/mm: skip thuge-gen test if system is not setup properly Make thuge-gen skip instead of fail when it can't run due to system settings. If shmmax is too small or no 1G huge pages are available, the test now prints a warning and is marked as skipped. ------------------- Before Patch: ------------------- ~ running ./thuge-gen ~ Bail out! Please do echo 262144 > /proc/sys/kernel/shmmax ~ Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0 ~ [FAIL] not ok 28 thuge-gen ~ exit=1 ------------------- After Patch: ------------------- ~ running ./thuge-gen ~ ~ WARNING: shmmax is too small to run this test. ~ ~ Please run the following command to increase shmmax: ~ ~ echo 262144 > /proc/sys/kernel/shmmax ~ 1..0 ~ SKIP Test skipped due to insufficient shmmax value. 
~ [SKIP] ok 29 thuge-gen ~ SKIP Link: https://lkml.kernel.org/r/20250816040113.760010-7-aboorvad@linux.ibm.com Co-developed-by: Donet Tom Signed-off-by: Donet Tom Signed-off-by: Aboorva Devarajan Reviewed-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Barry Song Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/thuge-gen.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 8e2b08dc5762..4f5e290ff1a6 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -177,13 +177,16 @@ void find_pagesizes(void) globfree(&g); read_sysfs("/proc/sys/kernel/shmmax", &shmmax_val); - if (shmmax_val < NUM_PAGES * largest) - ksft_exit_fail_msg("Please do echo %lu > /proc/sys/kernel/shmmax", - largest * NUM_PAGES); + if (shmmax_val < NUM_PAGES * largest) { + ksft_print_msg("WARNING: shmmax is too small to run this test.\n"); + ksft_print_msg("Please run the following command to increase shmmax:\n"); + ksft_print_msg("echo %lu > /proc/sys/kernel/shmmax\n", largest * NUM_PAGES); + ksft_exit_skip("Test skipped due to insufficient shmmax value.\n"); + } #if defined(__x86_64__) if (largest != 1U<<30) { - ksft_exit_fail_msg("No GB pages available on x86-64\n" + ksft_exit_skip("No GB pages available on x86-64\n" "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES); } #endif From 19de1e5d11d142d50c80cad1aa9916f25a36ab0d Mon Sep 17 00:00:00 2001 From: Aboorva Devarajan Date: Sat, 16 Aug 2025 09:31:13 +0530 Subject: [PATCH 090/372] selftests/mm: skip hugepage-mremap test if userfaultfd unavailable Gracefully skip test if userfaultfd is not supported (ENOSYS) or not permitted (EPERM), instead of failing. This avoids misleading failures with clear skip messages. -------------- Before Patch -------------- ~ running ./hugepage-mremap ... ~ Bail out! userfaultfd: Function not implemented ~ Planned tests != run tests (1 != 0) ~ Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0 ~ [FAIL] not ok 4 hugepage-mremap # exit=1 -------------- After Patch -------------- ~ running ./hugepage-mremap ... ~ ok 2 # SKIP userfaultfd is not supported/not enabled. ~ 1 skipped test(s) detected. ~ Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0 ~ [SKIP] ok 4 hugepage-mremap # SKIP Link: https://lkml.kernel.org/r/20250816040113.760010-8-aboorvad@linux.ibm.com Co-developed-by: Donet Tom Signed-off-by: Donet Tom Signed-off-by: Aboorva Devarajan Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: "Ritesh Harjani (IBM)" Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugepage-mremap.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/hugepage-mremap.c b/tools/testing/selftests/mm/hugepage-mremap.c index c463d1c09c9b..2bd1dac75c3f 100644 --- a/tools/testing/selftests/mm/hugepage-mremap.c +++ b/tools/testing/selftests/mm/hugepage-mremap.c @@ -65,10 +65,20 @@ static void register_region_with_uffd(char *addr, size_t len) struct uffdio_api uffdio_api; /* Create and enable userfaultfd object. 
*/ - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd == -1) - ksft_exit_fail_msg("userfaultfd: %s\n", strerror(errno)); + if (uffd == -1) { + switch (errno) { + case EPERM: + ksft_exit_skip("Insufficient permissions, try running as root.\n"); + break; + case ENOSYS: + ksft_exit_skip("userfaultfd is not supported/not enabled.\n"); + break; + default: + ksft_exit_fail_msg("userfaultfd failed with %s\n", strerror(errno)); + break; + } + } uffdio_api.api = UFFD_API; uffdio_api.features = 0; From e338d83531540c6cda4bdc8de52aaae186d8a97c Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 15 Aug 2025 11:32:24 -0700 Subject: [PATCH 091/372] mm: readahead: improve mmap_miss heuristic for concurrent faults If two or more threads of an application fault on the same folio, the mmap_miss counter can be decreased multiple times. It breaks the mmap_miss heuristic and keeps readahead enabled even under extreme levels of memory pressure. It happens often if file folios backing a multi-threaded application are getting evicted and re-faulted. Fix it by skipping the mmap_miss decrease when the folio is locked. This change was evaluated on several hundred thousand hosts in Google's production over a couple of weeks. The number of containers stuck in a vicious reclaim cycle for a long time was reduced several fold (~10-20x), and the overall fleet-wide CPU time spent in direct memory reclaim was meaningfully reduced. No regressions were observed. Link: https://lkml.kernel.org/r/20250815183224.62007-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reviewed-by: Jan Kara Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index d1fb0b12bff2..1a388b11cfa9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3323,9 +3323,17 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf, if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages) return fpin; - mmap_miss = READ_ONCE(ra->mmap_miss); - if (mmap_miss) - WRITE_ONCE(ra->mmap_miss, --mmap_miss); + /* + * If the folio is locked, we're likely racing against another fault. + * Don't touch the mmap_miss counter to avoid decreasing it multiple + * times for a single folio and break the balance with mmap_miss + * increase in do_sync_mmap_readahead(). + */ + if (likely(!folio_test_locked(folio))) { + mmap_miss = READ_ONCE(ra->mmap_miss); + if (mmap_miss) + WRITE_ONCE(ra->mmap_miss, --mmap_miss); + } if (folio_test_readahead(folio)) { fpin = maybe_unlock_mmap_for_io(vmf, fpin); From 9dc21bbd62edeae6f63e6f25e1edb7167452457b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:53 +0100 Subject: [PATCH 092/372] prctl: extend PR_SET_THP_DISABLE to optionally exclude VM_HUGEPAGE Patch series "prctl: extend PR_SET_THP_DISABLE to only provide THPs when advised", v5. This will allow individual processes to opt out of THP = "always" into THP = "madvise", without affecting other workloads on the system. This has been extensively discussed on the mailing list and has been summarized very well by David in the first patch, which also includes the links to alternatives; please refer to the first patch's commit message for the motivation for this series. Patch 1 adds the PR_THP_DISABLE_EXCEPT_ADVISED flag to implement this, along with the MMF changes.
Patch 2 is a cleanup patch for tva_flags that will allow the forced collapse case to be transmitted to vma_thp_disabled (which is done in patch 3). Patch 4 adds documentation for PR_SET_THP_DISABLE/PR_GET_THP_DISABLE. Patches 6-7 implement the selftests for PR_SET_THP_DISABLE, covering both completely disabling THPs (the old behaviour) and enabling THPs only when advised (PR_THP_DISABLE_EXCEPT_ADVISED). This patch (of 7): People want to make use of more THPs, for example, moving from the "never" system policy to "madvise", or from "madvise" to "always". While this is great news for every THP desperately waiting to get allocated out there, apparently there are some workloads that require a bit of care during that transition: individual processes may need to opt out of this behavior for various reasons, and this should be permitted without needing to make all other workloads on the system similarly opt out. The following scenarios are imaginable: (1) Switch from "none" system policy to "madvise"/"always", but keep THPs disabled for selected workloads. (2) Stay at "none" system policy, but enable THPs for selected workloads, making only these workloads use the "madvise" or "always" policy. (3) Switch from "madvise" system policy to "always", but keep the "madvise" policy for selected workloads: allocate THPs only when advised. (4) Stay at "madvise" system policy, but enable THPs even when not advised for selected workloads -- "always" policy. One can emulate (2) through (1), by setting the system policy to "madvise"/"always" while disabling THPs for all processes that don't want THPs. It requires configuring all workloads, but that is a user-space problem to sort out. (4) can be emulated through (3) in a similar way. Back when (1) became relevant, as people started enabling THPs, we added PR_SET_THP_DISABLE, so workloads that were not ready yet (e.g., Redis) were able to just disable THPs completely. Redis still implements the option to use this interface to disable THPs completely. With PR_SET_THP_DISABLE, we added a way to force-disable THPs for a workload -- a process, including its fork+exec'ed process hierarchy. That essentially made us support (1): simply disable THPs for all workloads that are not ready for THPs yet, while still enabling THPs system-wide. The quest for handling (3) and (4) started, but current approaches (a completely new prctl, options to set other policies per process, alternatives to prctl -- mctrl, cgroup handling) don't look particularly promising. Likely, the future will use bpf or something similar to implement better policies, in particular to also make better decisions about THP sizes to use, but this will certainly take a while as that work just started. Long story short: a simple enable/disable is not really suitable for the future, so we're not willing to add completely new toggles. While we could emulate (3)+(4) through (1)+(2) by simply disabling THPs completely for these processes, this is a step backwards, because these processes can no longer allocate THPs in regions where THPs were explicitly advised: regions flagged as VM_HUGEPAGE. Apparently, that poses a problem for relevant workloads, because "no THPs" is certainly worse than "THPs only when advised". Could we simply relax PR_SET_THP_DISABLE, to "disable THPs unless explicitly advised by the app through MADV_HUGEPAGE"?
*Maybe*, but this would change the documented semantics quite a bit, and the versatility to use it for debugging purposes, so I am not 100% sure that is what we want -- although it would certainly be much easier. So instead, as an easy way forward for (3) and (4), add an option to make PR_SET_THP_DISABLE disable *fewer* THPs for a process. In essence, this patch: (A) Adds PR_THP_DISABLE_EXCEPT_ADVISED, to be used as a flag in arg3 of prctl(PR_SET_THP_DISABLE) when disabling THPs (arg2 != 0): prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED). (B) Makes prctl(PR_GET_THP_DISABLE) return 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set while disabling. Previously, it would return 1 if THPs were disabled completely. Now it returns the set flags as well: 3 if PR_THP_DISABLE_EXCEPT_ADVISED was set. (C) Renames MMF_DISABLE_THP to MMF_DISABLE_THP_COMPLETELY, to express the semantics clearly. Fortunately, there are only two instances outside of prctl() code. (D) Adds MMF_DISABLE_THP_EXCEPT_ADVISED to express "no THP except for VMAs with VM_HUGEPAGE" -- essentially "thp=madvise" behavior. Fortunately, we only have to extend vma_thp_disabled(). (E) Indicates "THP_enabled: 0" in /proc/pid/status only if THPs are disabled completely. We only indicate that THPs are disabled when they are really disabled completely, not just partially. For now, we don't add another interface to obtain whether THPs are disabled partially (PR_THP_DISABLE_EXCEPT_ADVISED was set). If ever required, we could add a new entry. The semantics documented in the man page for PR_SET_THP_DISABLE, "is inherited by a child created via fork(2) and is preserved across execve(2)", are maintained. This behavior, for example, allows for disabling THPs for a workload through the launching process (e.g., systemd, where we fork() a helper process to then exec()). For now, MADV_COLLAPSE will *fail* in regions with neither VM_HUGEPAGE nor VM_NOHUGEPAGE. As MADV_COLLAPSE is clear advice that user space thinks a THP is a good idea, we'll enable that separately next (requiring a bit of cleanup first). There is currently no way to prevent a process from issuing PR_SET_THP_DISABLE itself to re-enable THP. There are no known users of re-enabling it, and it's against the purpose of the original interface. So if ever required, we could investigate just forbidding re-enabling, or make this somehow configurable.
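A minimal usage sketch of the resulting interface (error handling omitted; the constants are those introduced by this patch):

  #include <sys/prctl.h>

  /* Disable THPs except where explicitly advised. */
  prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0);

  /* Returns 0 (not disabled), 1 (disabled completely), or
   * 3 == 1 | PR_THP_DISABLE_EXCEPT_ADVISED (disabled except when advised). */
  int mode = prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0);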
Link: https://lkml.kernel.org/r/20250815135549.130506-1-usamaarif642@gmail.com Link: https://lkml.kernel.org/r/20250815135549.130506-2-usamaarif642@gmail.com Acked-by: Zi Yan Acked-by: Usama Arif Tested-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Usama Arif Cc: Arnd Bergmann Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 5 ++- fs/proc/array.c | 2 +- include/linux/huge_mm.h | 20 +++++++--- include/linux/mm_types.h | 13 +++---- include/uapi/linux/prctl.h | 10 +++++ kernel/sys.c | 59 ++++++++++++++++++++++++------ mm/khugepaged.c | 2 +- 7 files changed, 82 insertions(+), 29 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 2971551b7235..915a3e44bc12 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -291,8 +291,9 @@ It's slow but very precise. HugetlbPages size of hugetlb memory portions CoreDumping process's memory is currently being dumped (killing the process may lead to a corrupted core) - THP_enabled process is allowed to use THP (returns 0 when - PR_SET_THP_DISABLE is set on the process + THP_enabled process is allowed to use THP (returns 0 when + PR_SET_THP_DISABLE is set on the process to disable + THP completely, not just partially) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/array.c b/fs/proc/array.c index c286dc12325e..d84b291dd1ed 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -422,7 +422,7 @@ static inline void task_thp_status(struct seq_file *m, struct mm_struct *mm) bool thp_enabled = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE); if (thp_enabled) - thp_enabled = !mm_flags_test(MMF_DISABLE_THP, mm); + thp_enabled = !mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); seq_printf(m, "THP_enabled:\t%d\n", thp_enabled); } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 84b7eebe0d68..22b8b067b295 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -318,16 +318,26 @@ struct thpsize { (transparent_hugepage_flags & \ (1<vm_mm)) + return true; /* - * Explicitly disabled through madvise or prctl, or some - * architectures may disable THP for some mappings, for - * example, s390 kvm. + * Are THPs disabled only for VMAs where we didn't get an explicit + * advise to use them? 
*/ - return (vm_flags & VM_NOHUGEPAGE) || - mm_flags_test(MMF_DISABLE_THP, vma->vm_mm); + if (vm_flags & VM_HUGEPAGE) + return false; + return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } static inline bool thp_disabled_by_hw(void) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 05475b5fd516..d247da2fdb52 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1792,19 +1792,16 @@ enum { #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ +#define MMF_HUGE_ZERO_FOLIO 18 /* mm has ever used the global huge zero folio */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_FOLIO 23 /* mm has ever used the global huge zero folio */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK BIT(MMF_DISABLE_THP) +#define MMF_DISABLE_THP_EXCEPT_ADVISED 23 /* no THP except when advised (e.g., VM_HUGEPAGE) */ +#define MMF_DISABLE_THP_COMPLETELY 24 /* no THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (BIT(MMF_DISABLE_THP_COMPLETELY) | \ + BIT(MMF_DISABLE_THP_EXCEPT_ADVISED)) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index ed3aed264aeb..150b6deebfb1 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -177,7 +177,17 @@ struct prctl_mm_map { #define PR_GET_TID_ADDRESS 40 +/* + * Flags for PR_SET_THP_DISABLE are only applicable when disabling. Bit 0 + * is reserved, so PR_GET_THP_DISABLE can return "1 | flags", to effectively + * return "1" when no flags were specified for PR_SET_THP_DISABLE. + */ #define PR_SET_THP_DISABLE 41 +/* + * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / + * VM_HUGEPAGE). + */ +# define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 /* diff --git a/kernel/sys.c b/kernel/sys.c index 605f7fe9a143..a46d9b75880b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2452,6 +2452,51 @@ static int prctl_get_auxv(void __user *addr, unsigned long len) return sizeof(mm->saved_auxv); } +static int prctl_get_thp_disable(unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + + /* If disabled, we return "1 | flags", otherwise 0. */ + if (mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm)) + return 1; + else if (mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, mm)) + return 1 | PR_THP_DISABLE_EXCEPT_ADVISED; + return 0; +} + +static int prctl_set_thp_disable(bool thp_disable, unsigned long flags, + unsigned long arg4, unsigned long arg5) +{ + struct mm_struct *mm = current->mm; + + if (arg4 || arg5) + return -EINVAL; + + /* Flags are only allowed when disabling. 
*/ + if ((!thp_disable && flags) || (flags & ~PR_THP_DISABLE_EXCEPT_ADVISED)) + return -EINVAL; + if (mmap_write_lock_killable(current->mm)) + return -EINTR; + if (thp_disable) { + if (flags & PR_THP_DISABLE_EXCEPT_ADVISED) { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_set(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } else { + mm_flags_set(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + } else { + mm_flags_clear(MMF_DISABLE_THP_COMPLETELY, mm); + mm_flags_clear(MMF_DISABLE_THP_EXCEPT_ADVISED, mm); + } + mmap_write_unlock(current->mm); + return 0; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2625,20 +2670,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, return -EINVAL; return task_no_new_privs(current) ? 1 : 0; case PR_GET_THP_DISABLE: - if (arg2 || arg3 || arg4 || arg5) - return -EINVAL; - error = !!mm_flags_test(MMF_DISABLE_THP, me->mm); + error = prctl_get_thp_disable(arg2, arg3, arg4, arg5); break; case PR_SET_THP_DISABLE: - if (arg3 || arg4 || arg5) - return -EINVAL; - if (mmap_write_lock_killable(me->mm)) - return -EINTR; - if (arg2) - mm_flags_set(MMF_DISABLE_THP, me->mm); - else - mm_flags_clear(MMF_DISABLE_THP, me->mm); - mmap_write_unlock(me->mm); + error = prctl_set_thp_disable(arg2, arg3, arg4, arg5); break; case PR_MPX_ENABLE_MANAGEMENT: case PR_MPX_DISABLE_MANAGEMENT: diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 550eb00116c5..1a416b865997 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -410,7 +410,7 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) { return hpage_collapse_test_exit(mm) || - mm_flags_test(MMF_DISABLE_THP, mm); + mm_flags_test(MMF_DISABLE_THP_COMPLETELY, mm); } static bool hugepage_pmd_enabled(void) From 1f1c061089dcd274befa0c76fb9f6e253a8368c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:54 +0100 Subject: [PATCH 093/372] mm/huge_memory: convert "tva_flags" to "enum tva_type" When determining which THP orders are eligible for a VMA mapping, we previously specified tva_flags; however, it turns out it is really not necessary to treat these as flags. Rather, we distinguish between distinct modes. The only case where we previously combined flags was with TVA_ENFORCE_SYSFS, but we can avoid this by observing that this is the default, except for MADV_COLLAPSE or edge cases in collapse_pte_mapped_thp() and hugepage_vma_revalidate(), and adding a mode specifically for this case - TVA_FORCED_COLLAPSE. We have: * smaps handling for showing "THPeligible" * Pagefault handling * khugepaged handling * Forced collapse handling: primarily MADV_COLLAPSE, but also for an edge case in collapse_pte_mapped_thp() Disregarding the edge cases, we want to ignore sysfs settings only when we are forcing a collapse through MADV_COLLAPSE; otherwise we want to enforce them. Hence this patch does the following flag-to-enum conversions: * TVA_SMAPS | TVA_ENFORCE_SYSFS -> TVA_SMAPS * TVA_IN_PF | TVA_ENFORCE_SYSFS -> TVA_PAGEFAULT * TVA_ENFORCE_SYSFS -> TVA_KHUGEPAGED * 0 -> TVA_FORCED_COLLAPSE With this change, we immediately know if we are in the forced collapse case, which will be valuable next.
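For example, the page fault call sites in mm/memory.c convert as follows (taken from the hunks below):

  orders = thp_vma_allowable_orders(vma, vma->vm_flags,
                                    TVA_IN_PF | TVA_ENFORCE_SYSFS,
                                    BIT(PMD_ORDER) - 1);   /* before */
  orders = thp_vma_allowable_orders(vma, vma->vm_flags,
                                    TVA_PAGEFAULT,
                                    BIT(PMD_ORDER) - 1);   /* after */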
Link: https://lkml.kernel.org/r/20250815135549.130506-3-usamaarif642@gmail.com Signed-off-by: David Hildenbrand Signed-off-by: Usama Arif Acked-by: Usama Arif Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 4 ++-- include/linux/huge_mm.h | 30 ++++++++++++++++++------------ mm/huge_memory.c | 8 ++++---- mm/khugepaged.c | 17 ++++++++--------- mm/memory.c | 14 ++++++-------- 5 files changed, 38 insertions(+), 35 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e8e7bef34531..ced01cf3c5ab 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1369,8 +1369,8 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %8u\n", - !!thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_SMAPS | TVA_ENFORCE_SYSFS, THP_ORDERS_ALL)); + !!thp_vma_allowable_orders(vma, vma->vm_flags, TVA_SMAPS, + THP_ORDERS_ALL)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 22b8b067b295..92ea0b9771fa 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -94,12 +94,15 @@ extern struct kobj_attribute thpsize_shmem_enabled_attr; #define THP_ORDERS_ALL \ (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT) -#define TVA_SMAPS (1 << 0) /* Will be used for procfs */ -#define TVA_IN_PF (1 << 1) /* Page fault handler */ -#define TVA_ENFORCE_SYSFS (1 << 2) /* Obey sysfs configuration */ +enum tva_type { + TVA_SMAPS, /* Exposing "THPeligible:" in smaps. */ + TVA_PAGEFAULT, /* Serving a page fault. */ + TVA_KHUGEPAGED, /* Khugepaged collapse. */ + TVA_FORCED_COLLAPSE, /* Forced collapse (e.g. MADV_COLLAPSE). */ +}; -#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ - (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) +#define thp_vma_allowable_order(vma, vm_flags, type, order) \ + (!!thp_vma_allowable_orders(vma, vm_flags, type, BIT(order))) #define split_folio(f) split_folio_to_list(f, NULL) @@ -264,14 +267,14 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders); /** * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma * @vma: the vm area to check * @vm_flags: use these vm_flags instead of vma->vm_flags - * @tva_flags: Which TVA flags to honour + * @type: TVA type * @orders: bitfield of all orders to consider * * Calculates the intersection of the requested hugepage orders and the allowed @@ -285,11 +288,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - /* Optimization to check if required orders are enabled early. */ - if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) { + /* + * Optimization to check if required orders are enabled early. Only + * forced collapse ignores sysfs configs. 
+ */ + if (type != TVA_FORCED_COLLAPSE && vma_is_anonymous(vma)) { unsigned long mask = READ_ONCE(huge_anon_orders_always); if (vm_flags & VM_HUGEPAGE) @@ -303,7 +309,7 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return 0; } - return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); + return __thp_vma_allowable_orders(vma, vm_flags, type, orders); } struct thpsize { @@ -547,7 +553,7 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a2f476e7419a..899d9ac86ecd 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -99,12 +99,12 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, vm_flags_t vm_flags, - unsigned long tva_flags, + enum tva_type type, unsigned long orders) { - bool smaps = tva_flags & TVA_SMAPS; - bool in_pf = tva_flags & TVA_IN_PF; - bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS; + const bool smaps = type == TVA_SMAPS; + const bool in_pf = type == TVA_PAGEFAULT; + const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1a416b865997..d3d4f116e14b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -474,8 +474,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!mm_flags_test(MMF_VM_HUGEPAGE, vma->vm_mm) && hugepage_pmd_enabled()) { - if (thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS, - PMD_ORDER)) + if (thp_vma_allowable_order(vma, vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) __khugepaged_enter(vma->vm_mm); } } @@ -921,7 +920,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, struct collapse_control *cc) { struct vm_area_struct *vma; - unsigned long tva_flags = cc->is_khugepaged ? TVA_ENFORCE_SYSFS : 0; + enum tva_type type = cc->is_khugepaged ? TVA_KHUGEPAGED : + TVA_FORCED_COLLAPSE; if (unlikely(hpage_collapse_test_exit_or_disable(mm))) return SCAN_ANY_PROCESS; @@ -932,7 +932,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!thp_vma_suitable_order(vma, address, PMD_ORDER)) return SCAN_ADDRESS_RANGE; - if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, type, PMD_ORDER)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -1533,9 +1533,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * in the page cache with a single hugepage. If a mm were to fault-in * this memory (mapped by a suitably aligned VMA), we'd get the hugepage * and map it by a PMD, regardless of sysfs THP settings. As such, let's - * analogously elide sysfs THP settings here. + * analogously elide sysfs THP settings here and force collapse. 
*/ - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2432,8 +2432,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!thp_vma_allowable_order(vma, vma->vm_flags, - TVA_ENFORCE_SYSFS, PMD_ORDER)) { + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_KHUGEPAGED, PMD_ORDER)) { skip: progress++; continue; @@ -2767,7 +2766,7 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned long start, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER)) + if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER)) return -EINVAL; cc = kmalloc(sizeof(*cc), GFP_KERNEL); diff --git a/mm/memory.c b/mm/memory.c index 002c28795d8b..7b1e8f137fa3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4515,8 +4515,8 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) * Get a list of all the (large) orders below PMD_ORDER that are enabled * and suitable for swapping THP. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); orders = thp_swap_suitable_orders(swp_offset(entry), vmf->address, orders); @@ -5063,8 +5063,8 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) * for this vma. Then filter out the orders that can't be allocated over * the faulting address and still be fully contained in the vma. */ - orders = thp_vma_allowable_orders(vma, vma->vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1); + orders = thp_vma_allowable_orders(vma, vma->vm_flags, TVA_PAGEFAULT, + BIT(PMD_ORDER) - 1); orders = thp_vma_suitable_orders(vma, vmf->address, orders); if (!orders) @@ -6254,8 +6254,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PUD_ORDER)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -6289,8 +6288,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - thp_vma_allowable_order(vma, vm_flags, - TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) { + thp_vma_allowable_order(vma, vm_flags, TVA_PAGEFAULT, PMD_ORDER)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; From 8cdc4d27019356b0304308eb799484c899b62a87 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Aug 2025 14:54:55 +0100 Subject: [PATCH 094/372] mm/huge_memory: respect MADV_COLLAPSE with PR_THP_DISABLE_EXCEPT_ADVISED Let's allow for making MADV_COLLAPSE succeed on areas that neither have VM_HUGEPAGE nor VM_NOHUGEPAGE when we have THP disabled unless explicitly advised (PR_THP_DISABLE_EXCEPT_ADVISED). MADV_COLLAPSE is a clear advice that we want to collapse. Note that we still respect the VM_NOHUGEPAGE flag, just like MADV_COLLAPSE always does. So consequently, MADV_COLLAPSE is now only refused on VM_NOHUGEPAGE with PR_THP_DISABLE_EXCEPT_ADVISED, including for shmem. 
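A minimal sketch of the sequence this enables (illustrative only; assumes a kernel with this series and MADV_COLLAPSE support):

  #include <string.h>
  #include <sys/mman.h>
  #include <sys/prctl.h>
  #include <linux/mman.h>

  int main(void)
  {
      size_t len = 2 * 1024 * 1024;

      prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0);

      char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      memset(buf, 1, len);                /* faults in small pages */
      /* Clear advice to use THPs: may now succeed on this plain VMA. */
      madvise(buf, len, MADV_COLLAPSE);
      /* Still refused once VM_NOHUGEPAGE is set on the area. */
      madvise(buf, len, MADV_NOHUGEPAGE);
      madvise(buf, len, MADV_COLLAPSE);
      return 0;
  }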
Link: https://lkml.kernel.org/r/20250815135549.130506-4-usamaarif642@gmail.com Co-developed-by: Usama Arif Signed-off-by: Usama Arif Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 8 +++++++- include/uapi/linux/prctl.h | 2 +- mm/huge_memory.c | 5 +++-- mm/memory.c | 6 ++++-- mm/shmem.c | 2 +- 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 92ea0b9771fa..1ac0d06fb3c1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -329,7 +329,7 @@ struct thpsize { * through madvise or prctl. */ static inline bool vma_thp_disabled(struct vm_area_struct *vma, - vm_flags_t vm_flags) + vm_flags_t vm_flags, bool forced_collapse) { /* Are THPs disabled for this VMA? */ if (vm_flags & VM_NOHUGEPAGE) @@ -343,6 +343,12 @@ static inline bool vma_thp_disabled(struct vm_area_struct *vma, */ if (vm_flags & VM_HUGEPAGE) return false; + /* + * Forcing a collapse (e.g., madv_collapse), is a clear advice to + * use THPs. + */ + if (forced_collapse) + return false; return mm_flags_test(MMF_DISABLE_THP_EXCEPT_ADVISED, vma->vm_mm); } diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 150b6deebfb1..51c4e8c82b1e 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -185,7 +185,7 @@ struct prctl_mm_map { #define PR_SET_THP_DISABLE 41 /* * Don't disable THPs when explicitly advised (e.g., MADV_HUGEPAGE / - * VM_HUGEPAGE). + * VM_HUGEPAGE, MADV_COLLAPSE). */ # define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) #define PR_GET_THP_DISABLE 42 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 899d9ac86ecd..d89992b65acc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -104,7 +104,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, { const bool smaps = type == TVA_SMAPS; const bool in_pf = type == TVA_PAGEFAULT; - const bool enforce_sysfs = type != TVA_FORCED_COLLAPSE; + const bool forced_collapse = type == TVA_FORCED_COLLAPSE; + const bool enforce_sysfs = !forced_collapse; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ @@ -122,7 +123,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!vma->vm_mm) /* vdso */ return 0; - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags, forced_collapse)) return 0; /* khugepaged doesn't collapse DAX vma, but page fault is fine. */ diff --git a/mm/memory.c b/mm/memory.c index 7b1e8f137fa3..d9de6c056179 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5332,9 +5332,11 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct folio *folio, struct page *pa * It is too late to allocate a small folio, we already have a large * folio in the pagecache: especially s390 KVM cannot tolerate any * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any - * PMD mappings if THPs are disabled. + * PMD mappings if THPs are disabled. As we already have a THP, + * behave as if we are forcing a collapse. 
*/ - if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags)) + if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags, + /* forced_collapse=*/ true)) return ret; if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER)) diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..d945de3a7f0e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1817,7 +1817,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, vm_flags_t vm_flags = vma ? vma->vm_flags : 0; unsigned int global_orders; - if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) + if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags, shmem_huge_force))) return 0; global_orders = shmem_huge_global_enabled(inode, index, write_end, From 7de854910bcb7aca917223caf9feec23035d4c26 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 15 Aug 2025 14:54:56 +0100 Subject: [PATCH 095/372] docs: transhuge: document process level THP controls This includes the PR_SET_THP_DISABLE/PR_GET_THP_DISABLE pair of prctl calls as well as the newly introduced PR_THP_DISABLE_EXCEPT_ADVISED flag for the PR_SET_THP_DISABLE prctl call. Link: https://lkml.kernel.org/r/20250815135549.130506-5-usamaarif642@gmail.com Signed-off-by: Usama Arif Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 370fba113460..a16a04841b96 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -225,6 +225,42 @@ to "always" or "madvise"), and it'll be automatically shutdown when PMD-sized THP is disabled (when both the per-size anon control and the top-level control are "never") +process THP controls +-------------------- + +A process can control its own THP behaviour using the ``PR_SET_THP_DISABLE`` +and ``PR_GET_THP_DISABLE`` pair of prctl(2) calls. The THP behaviour set using +``PR_SET_THP_DISABLE`` is inherited across fork(2) and execve(2). These calls +support the following arguments:: + + prctl(PR_SET_THP_DISABLE, 1, 0, 0, 0): + This will disable THPs completely for the process, irrespective + of global THP controls or madvise(..., MADV_COLLAPSE) being used. + + prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, 0, 0): + This will disable THPs for the process except when the usage of THPs is + advised. Consequently, THPs will only be used when: + - Global THP controls are set to "always" or "madvise" and + madvise(..., MADV_HUGEPAGE) or madvise(..., MADV_COLLAPSE) is used. + - Global THP controls are set to "never" and madvise(..., MADV_COLLAPSE) + is used. This is the same behavior as if THPs would not be disabled on + a process level. + Note that MADV_COLLAPSE is currently always rejected if + madvise(..., MADV_NOHUGEPAGE) is set on an area. + + prctl(PR_SET_THP_DISABLE, 0, 0, 0, 0): + This will re-enable THPs for the process, as if they were never disabled. + Whether THPs will actually be used depends on global THP controls and + madvise() calls.
+ + prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0): + This returns a value whose bits indicate how THP-disable is configured: + Bits + 1 0 Value Description + |0|0| 0 No THP-disable behaviour specified. + |0|1| 1 THP is entirely disabled for this process. + |1|1| 3 THP-except-advised mode is set for this process. + Khugepaged controls ------------------- From 49850bd02658181f235e3f3fd0a9884f812f0914 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 15 Aug 2025 14:54:57 +0100 Subject: [PATCH 096/372] selftest/mm: extract sz2ord function into vm_util.h The function already has 2 uses and will have a 3rd one in prctl selftests. The pagesize argument is added into the function, as it's not a global variable anymore. No functional change intended with this patch. Link: https://lkml.kernel.org/r/20250815135549.130506-6-usamaarif642@gmail.com Suggested-by: David Hildenbrand Signed-off-by: Usama Arif Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Arnd Bergmann Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 8 ++++---- tools/testing/selftests/mm/split_huge_page_test.c | 2 +- tools/testing/selftests/mm/uffd-wp-mremap.c | 4 ++-- tools/testing/selftests/mm/vm_util.h | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index b51fbeb93751..6560c26f47d1 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -52,7 +52,7 @@ static int detect_thp_sizes(size_t sizes[], int max) if (!pmdsize) return 0; - orders = 1UL << sz2ord(pmdsize); + orders = 1UL << sz2ord(pmdsize, pagesize); orders |= thp_supported_orders(); for (i = 0; orders && count < max; i++) { @@ -1211,8 +1211,8 @@ static void run_anon_test_case(struct test_case const *test_case) size_t size = thpsizes[i]; struct thp_settings settings = *thp_current_settings(); - settings.hugepages[sz2ord(pmdsize)].enabled = THP_NEVER; - settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS; + settings.hugepages[sz2ord(pmdsize, pagesize)].enabled = THP_NEVER; + settings.hugepages[sz2ord(size, pagesize)].enabled = THP_ALWAYS; thp_push_settings(&settings); if (size == pmdsize) { @@ -1863,7 +1863,7 @@ int main(int argc, char **argv) if (pmdsize) { /* Only if THP is supported. 
*/ thp_read_settings(&default_settings); - default_settings.hugepages[sz2ord(pmdsize)].enabled = THP_INHERIT; + default_settings.hugepages[sz2ord(pmdsize, pagesize)].enabled = THP_INHERIT; thp_save_settings(); thp_push_settings(&default_settings); diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 505b5bb1829e..4e3408263a77 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -546,7 +546,7 @@ int main(int argc, char **argv) ksft_exit_fail_msg("Reading PMD pagesize failed\n"); nr_pages = pmd_pagesize / pagesize; - max_order = sz2ord(pmd_pagesize); + max_order = sz2ord(pmd_pagesize, pagesize); tests = 2 + (max_order - 1) + (2 * max_order) + (max_order - 1) * 4 + 2; ksft_set_plan(tests); diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c index e1193550e717..78038c40aaaf 100644 --- a/tools/testing/selftests/mm/uffd-wp-mremap.c +++ b/tools/testing/selftests/mm/uffd-wp-mremap.c @@ -82,9 +82,9 @@ static void *alloc_one_folio(size_t size, bool private, bool hugetlb) struct thp_settings settings = *thp_current_settings(); if (private) - settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS; + settings.hugepages[sz2ord(size, pagesize)].enabled = THP_ALWAYS; else - settings.shmem_hugepages[sz2ord(size)].enabled = SHMEM_ALWAYS; + settings.shmem_hugepages[sz2ord(size, pagesize)].enabled = SHMEM_ALWAYS; thp_push_settings(&settings); diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 85f7dae9a0c1..3da56feeb944 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -127,9 +127,9 @@ static inline void log_test_result(int result) ksft_test_result_report(result, "%s\n", test_name); } -static inline int sz2ord(size_t size) +static inline int sz2ord(size_t size, size_t pagesize) { - return __builtin_ctzll(size / getpagesize()); + return __builtin_ctzll(size / pagesize); } void *sys_mremap(void *old_address, unsigned long old_size, From 681f45deca1c7f517299d032783f655e5f2c36b4 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 15 Aug 2025 14:54:58 +0100 Subject: [PATCH 097/372] selftests: prctl: introduce tests for disabling THPs completely The test will set the global system THP setting to never, madvise or always depending on the fixture variant and the 2M setting to inherit before it starts (and reset to original at teardown). The fixture setup will also test if PR_SET_THP_DISABLE prctl call can be made to disable all THPs and skip if it fails. This tests if the process can: - successfully get the policy to disable THPs completely. - never get a hugepage when the THPs are completely disabled with the prctl, including with MADV_HUGE and MADV_COLLAPSE. - successfully reset the policy of the process. - after reset, only get hugepages with: - MADV_COLLAPSE when policy is set to never. - MADV_HUGE and MADV_COLLAPSE when policy is set to madvise. - always when policy is set to "always". - never get a THP with MADV_NOHUGEPAGE. - repeat the above tests in a forked process to make sure the policy is carried across forks. 
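To build and run just this test through the usual kselftest flow (typical invocation; paths may differ per tree):

  $ make -C tools/testing/selftests TARGETS=mm
  $ ./tools/testing/selftests/mm/prctl_thp_disable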
[usamaarif642@gmail.com: return after executing test in child process] Link: https://lkml.kernel.org/r/2d0ea708-ecba-4021-b6ca-e93f1413d60a@gmail.com [usamaarif642@gmail.com: include linux/mman.h for prctl_thp_disable] Link: https://lkml.kernel.org/r/20250910204609.1720498-1-usamaarif642@gmail.com Link: https://lore.kernel.org/all/c8249725-e91d-4c51-b9bb-40305e61e20d@sirena.org.uk/ Link: https://lkml.kernel.org/r/20250815135549.130506-7-usamaarif642@gmail.com Signed-off-by: Usama Arif Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Arnd Bergmann Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Cc: Zi Yan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + .../testing/selftests/mm/prctl_thp_disable.c | 178 ++++++++++++++++++ tools/testing/selftests/mm/thp_settings.c | 9 +- tools/testing/selftests/mm/thp_settings.h | 1 + 5 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/mm/prctl_thp_disable.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index e7b23a8a05fe..eb023ea857b3 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -58,3 +58,4 @@ pkey_sighandler_tests_32 pkey_sighandler_tests_64 guard-regions merge +prctl_thp_disable diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index d13b3cef2a2b..2bb8d3ebc17c 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -86,6 +86,7 @@ TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += pagemap_ioctl TEST_GEN_FILES += pfnmap TEST_GEN_FILES += process_madv +TEST_GEN_FILES += prctl_thp_disable TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress diff --git a/tools/testing/selftests/mm/prctl_thp_disable.c b/tools/testing/selftests/mm/prctl_thp_disable.c new file mode 100644 index 000000000000..fbbce7c9b2f8 --- /dev/null +++ b/tools/testing/selftests/mm/prctl_thp_disable.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Basic tests for PR_GET/SET_THP_DISABLE prctl calls + * + * Author(s): Usama Arif + */ +#include <errno.h> +#include <linux/mman.h> +#include <stdint.h> +#include <string.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "../kselftest_harness.h" +#include "thp_settings.h" +#include "vm_util.h" + +enum thp_collapse_type { + THP_COLLAPSE_NONE, + THP_COLLAPSE_MADV_NOHUGEPAGE, + THP_COLLAPSE_MADV_HUGEPAGE, /* MADV_HUGEPAGE before access */ + THP_COLLAPSE_MADV_COLLAPSE, /* MADV_COLLAPSE after access */ +}; + +/* + * Function to mmap a buffer, fault it in, madvise it appropriately (before + * page fault for MADV_HUGE, and after for MADV_COLLAPSE), and check if the + * mmap region is huge. + * Returns: + * 0 if test doesn't give hugepage + * 1 if test gives a hugepage + * -errno if mmap fails + */ +static int test_mmap_thp(enum thp_collapse_type madvise_buf, size_t pmdsize) +{ + char *mem, *mmap_mem; + size_t mmap_size; + int ret; + + /* For alignment purposes, we need twice the THP size. */ + mmap_size = 2 * pmdsize; + mmap_mem = (char *)mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_mem == MAP_FAILED) + return -errno; + + /* We need a THP-aligned memory area.
*/ + mem = (char *)(((uintptr_t)mmap_mem + pmdsize) & ~(pmdsize - 1)); + + if (madvise_buf == THP_COLLAPSE_MADV_HUGEPAGE) + madvise(mem, pmdsize, MADV_HUGEPAGE); + else if (madvise_buf == THP_COLLAPSE_MADV_NOHUGEPAGE) + madvise(mem, pmdsize, MADV_NOHUGEPAGE); + + /* Ensure memory is allocated */ + memset(mem, 1, pmdsize); + + if (madvise_buf == THP_COLLAPSE_MADV_COLLAPSE) + madvise(mem, pmdsize, MADV_COLLAPSE); + + /* HACK: make sure we have a separate VMA that we can check reliably. */ + mprotect(mem, pmdsize, PROT_READ); + + ret = check_huge_anon(mem, 1, pmdsize); + munmap(mmap_mem, mmap_size); + return ret; +} + +static void prctl_thp_disable_completely_test(struct __test_metadata *const _metadata, + size_t pmdsize, + enum thp_enabled thp_policy) +{ + ASSERT_EQ(prctl(PR_GET_THP_DISABLE, NULL, NULL, NULL, NULL), 1); + + /* tests after prctl overrides global policy */ + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_NONE, pmdsize), 0); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_NOHUGEPAGE, pmdsize), 0); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_HUGEPAGE, pmdsize), 0); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_COLLAPSE, pmdsize), 0); + + /* Reset to global policy */ + ASSERT_EQ(prctl(PR_SET_THP_DISABLE, 0, NULL, NULL, NULL), 0); + + /* tests after prctl is cleared, and only global policy is effective */ + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_NONE, pmdsize), + thp_policy == THP_ALWAYS ? 1 : 0); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_NOHUGEPAGE, pmdsize), 0); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_HUGEPAGE, pmdsize), + thp_policy == THP_NEVER ? 0 : 1); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_COLLAPSE, pmdsize), 1); +} + +FIXTURE(prctl_thp_disable_completely) +{ + struct thp_settings settings; + size_t pmdsize; +}; + +FIXTURE_VARIANT(prctl_thp_disable_completely) +{ + enum thp_enabled thp_policy; +}; + +FIXTURE_VARIANT_ADD(prctl_thp_disable_completely, never) +{ + .thp_policy = THP_NEVER, +}; + +FIXTURE_VARIANT_ADD(prctl_thp_disable_completely, madvise) +{ + .thp_policy = THP_MADVISE, +}; + +FIXTURE_VARIANT_ADD(prctl_thp_disable_completely, always) +{ + .thp_policy = THP_ALWAYS, +}; + +FIXTURE_SETUP(prctl_thp_disable_completely) +{ + if (!thp_available()) + SKIP(return, "Transparent Hugepages not available\n"); + + self->pmdsize = read_pmd_pagesize(); + if (!self->pmdsize) + SKIP(return, "Unable to read PMD size\n"); + + if (prctl(PR_SET_THP_DISABLE, 1, NULL, NULL, NULL)) + SKIP(return, "Unable to disable THPs completely for the process\n"); + + thp_save_settings(); + thp_read_settings(&self->settings); + self->settings.thp_enabled = variant->thp_policy; + self->settings.hugepages[sz2ord(self->pmdsize, getpagesize())].enabled = THP_INHERIT; + thp_write_settings(&self->settings); +} + +FIXTURE_TEARDOWN(prctl_thp_disable_completely) +{ + thp_restore_settings(); +} + +TEST_F(prctl_thp_disable_completely, nofork) +{ + prctl_thp_disable_completely_test(_metadata, self->pmdsize, variant->thp_policy); +} + +TEST_F(prctl_thp_disable_completely, fork) +{ + int ret = 0; + pid_t pid; + + /* Make sure prctl changes are carried across fork */ + pid = fork(); + ASSERT_GE(pid, 0); + + if (!pid) { + prctl_thp_disable_completely_test(_metadata, self->pmdsize, variant->thp_policy); + return; + } + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + ASSERT_EQ(ret, 0); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c index bad60ac52874..574bd0f8ae48 100644 --- 
a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -382,10 +382,17 @@ unsigned long thp_shmem_supported_orders(void) return __thp_supported_orders(true); } -bool thp_is_enabled(void) +bool thp_available(void) { if (access(THP_SYSFS, F_OK) != 0) return false; + return true; +} + +bool thp_is_enabled(void) +{ + if (!thp_available()) + return false; int mode = thp_read_string("enabled", thp_enabled_strings); diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index 6c07f70beee9..76eeb712e5f1 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -84,6 +84,7 @@ void thp_set_read_ahead_path(char *path); unsigned long thp_supported_orders(void); unsigned long thp_shmem_supported_orders(void); +bool thp_available(void); bool thp_is_enabled(void); #endif /* __THP_SETTINGS_H__ */ From 6bb961448418f3b7c2b2f22b0cc2d766c4f17d95 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 15 Aug 2025 14:54:59 +0100 Subject: [PATCH 098/372] selftests: prctl: introduce tests for disabling THPs except for madvise The test will set the global system THP setting to never, madvise or always depending on the fixture variant and the 2M setting to inherit before it starts (and reset to original at teardown). The fixture setup will also test if PR_SET_THP_DISABLE prctl call can be made with PR_THP_DISABLE_EXCEPT_ADVISED and skip if it fails. This tests if the process can: - successfully get the policy to disable THPs except for madvise. - get hugepages only on MADV_HUGE and MADV_COLLAPSE if the global policy is madvise/always and only with MADV_COLLAPSE if the global policy is never. - successfully reset the policy of the process. - after reset, only get hugepages with: - MADV_COLLAPSE when policy is set to never. - MADV_HUGE and MADV_COLLAPSE when policy is set to madvise. - always when policy is set to "always". - never get a THP with MADV_NOHUGEPAGE. - repeat the above tests in a forked process to make sure the policy is carried across forks.
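As a sketch of the new mode (again an illustrative fragment added for this document, not part of the patch; the fallback define mirrors the one the test itself adds for older UAPI headers):

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_THP_DISABLE_EXCEPT_ADVISED
	#define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1)
	#endif

	int main(void)
	{
		/* Disable THPs, except for regions that explicitly advise them. */
		if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED,
			  NULL, NULL)) {
			/* e.g. EINVAL on kernels that predate this flag */
			perror("PR_SET_THP_DISABLE");
			return 1;
		}
		/* Bits 0 and 1 set: reports 3, as the GET assertions below expect. */
		printf("THP-disable state: %d\n",
		       prctl(PR_GET_THP_DISABLE, NULL, NULL, NULL, NULL));
		return 0;
	}

In this mode, MADV_HUGEPAGE regions can still get THPs (unless the global policy is never) and MADV_COLLAPSE still works, which is precisely what the assertions below check.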
Test results: ./prctl_thp_disable TAP version 13 1..12 ok 1 prctl_thp_disable_completely.never.nofork ok 2 prctl_thp_disable_completely.never.fork ok 3 prctl_thp_disable_completely.madvise.nofork ok 4 prctl_thp_disable_completely.madvise.fork ok 5 prctl_thp_disable_completely.always.nofork ok 6 prctl_thp_disable_completely.always.fork ok 7 prctl_thp_disable_except_madvise.never.nofork ok 8 prctl_thp_disable_except_madvise.never.fork ok 9 prctl_thp_disable_except_madvise.madvise.nofork ok 10 prctl_thp_disable_except_madvise.madvise.fork ok 11 prctl_thp_disable_except_madvise.always.nofork ok 12 prctl_thp_disable_except_madvise.always.fork [usamaarif642@gmail.com: return after executing test in child process] Link: https://lkml.kernel.org/r/3dca2de4-9a6a-4efe-a86c-83f9509831fc@gmail.com Link: https://lkml.kernel.org/r/20250815135549.130506-8-usamaarif642@gmail.com Signed-off-by: Usama Arif Acked-by: David Hildenbrand Cc: Arnd Bergmann Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Rik van Riel Cc: Ryan Roberts Cc: SeongJae Park Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yafang Cc: Zi Yan Signed-off-by: Andrew Morton --- .../testing/selftests/mm/prctl_thp_disable.c | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/tools/testing/selftests/mm/prctl_thp_disable.c b/tools/testing/selftests/mm/prctl_thp_disable.c index fbbce7c9b2f8..84b4a4b345af 100644 --- a/tools/testing/selftests/mm/prctl_thp_disable.c +++ b/tools/testing/selftests/mm/prctl_thp_disable.c @@ -17,6 +17,10 @@ #include "thp_settings.h" #include "vm_util.h" +#ifndef PR_THP_DISABLE_EXCEPT_ADVISED +#define PR_THP_DISABLE_EXCEPT_ADVISED (1 << 1) +#endif + enum thp_collapse_type { THP_COLLAPSE_NONE, THP_COLLAPSE_MADV_NOHUGEPAGE, @@ -175,4 +179,113 @@ TEST_F(prctl_thp_disable_completely, fork) ASSERT_EQ(ret, 0); } +static void prctl_thp_disable_except_madvise_test(struct __test_metadata *const _metadata, + size_t pmdsize, + enum thp_enabled thp_policy) +{ + ASSERT_EQ(prctl(PR_GET_THP_DISABLE, NULL, NULL, NULL, NULL), 3); + + /* tests after prctl overrides global policy */ + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_NONE, pmdsize), 0); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_NOHUGEPAGE, pmdsize), 0); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_HUGEPAGE, pmdsize), + thp_policy == THP_NEVER ? 0 : 1); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_COLLAPSE, pmdsize), 1); + + /* Reset to global policy */ + ASSERT_EQ(prctl(PR_SET_THP_DISABLE, 0, NULL, NULL, NULL), 0); + + /* tests after prctl is cleared, and only global policy is effective */ + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_NONE, pmdsize), + thp_policy == THP_ALWAYS ? 1 : 0); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_NOHUGEPAGE, pmdsize), 0); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_HUGEPAGE, pmdsize), + thp_policy == THP_NEVER ? 
0 : 1); + + ASSERT_EQ(test_mmap_thp(THP_COLLAPSE_MADV_COLLAPSE, pmdsize), 1); +} + +FIXTURE(prctl_thp_disable_except_madvise) +{ + struct thp_settings settings; + size_t pmdsize; +}; + +FIXTURE_VARIANT(prctl_thp_disable_except_madvise) +{ + enum thp_enabled thp_policy; +}; + +FIXTURE_VARIANT_ADD(prctl_thp_disable_except_madvise, never) +{ + .thp_policy = THP_NEVER, +}; + +FIXTURE_VARIANT_ADD(prctl_thp_disable_except_madvise, madvise) +{ + .thp_policy = THP_MADVISE, +}; + +FIXTURE_VARIANT_ADD(prctl_thp_disable_except_madvise, always) +{ + .thp_policy = THP_ALWAYS, +}; + +FIXTURE_SETUP(prctl_thp_disable_except_madvise) +{ + if (!thp_available()) + SKIP(return, "Transparent Hugepages not available\n"); + + self->pmdsize = read_pmd_pagesize(); + if (!self->pmdsize) + SKIP(return, "Unable to read PMD size\n"); + + if (prctl(PR_SET_THP_DISABLE, 1, PR_THP_DISABLE_EXCEPT_ADVISED, NULL, NULL)) + SKIP(return, "Unable to set PR_THP_DISABLE_EXCEPT_ADVISED\n"); + + thp_save_settings(); + thp_read_settings(&self->settings); + self->settings.thp_enabled = variant->thp_policy; + self->settings.hugepages[sz2ord(self->pmdsize, getpagesize())].enabled = THP_INHERIT; + thp_write_settings(&self->settings); +} + +FIXTURE_TEARDOWN(prctl_thp_disable_except_madvise) +{ + thp_restore_settings(); +} + +TEST_F(prctl_thp_disable_except_madvise, nofork) +{ + prctl_thp_disable_except_madvise_test(_metadata, self->pmdsize, variant->thp_policy); +} + +TEST_F(prctl_thp_disable_except_madvise, fork) +{ + int ret = 0; + pid_t pid; + + /* Make sure prctl changes are carried across fork */ + pid = fork(); + ASSERT_GE(pid, 0); + + if (!pid) { + prctl_thp_disable_except_madvise_test(_metadata, self->pmdsize, + variant->thp_policy); + return; + } + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + ASSERT_EQ(ret, 0); +} + TEST_HARNESS_MAIN From 4e915656a38afe8aeebb283493f49c22d675a9fc Mon Sep 17 00:00:00 2001 From: Enze Li Date: Fri, 15 Aug 2025 17:21:10 +0800 Subject: [PATCH 099/372] mm/damon/Kconfig: make DAMON_STAT_ENABLED_DEFAULT depend on DAMON_STAT The DAMON_STAT_ENABLED_DEFAULT option is strongly tied to DAMON_STAT option -- enabling it alone is meaningless. This patch makes DAMON_STAT_ENABLED_DEFAULT depend on DAMON_STAT, ensuring functional consistency. Link: https://lkml.kernel.org/r/20250815092110.811757-1-lienze@kylinos.cn Fixes: 369c415e6073 ("mm/damon: introduce DAMON_STAT module") Signed-off-by: Enze Li Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index b3171f9406c1..8c868f7035fc 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -104,7 +104,7 @@ config DAMON_STAT config DAMON_STAT_ENABLED_DEFAULT bool "Enable DAMON_STAT by default" - depends on DAMON_PADDR + depends on DAMON_STAT default DAMON_STAT help Whether to enable DAMON_STAT by default. Users can disable it in From 53fbef56e07df822ea3029109ffca25328c2e5ac Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:51 +0100 Subject: [PATCH 100/372] mm: introduce memdesc_flags_t Patch series "Add and use memdesc_flags_t". At some point struct page will be separated from struct slab and struct folio. This is a step towards that by introducing a type for the 'flags' word of all three structures. 
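To make the type-safety argument concrete, here is a reduced illustration of the pattern this series introduces (an editorial sketch: the typedef matches the patch below, but the helper and the bit position are invented for the example):

	/* A one-member struct stops implicit conversions: a bare unsigned long
	 * can no longer be passed where a flags word is expected. */
	typedef struct {
		unsigned long f;
	} memdesc_flags_t;

	struct page_like {
		memdesc_flags_t flags;
	};

	/* Hypothetical accessor; the shift is illustrative, not the kernel's. */
	static inline int flags_to_nid(memdesc_flags_t flags)
	{
		return (int)(flags.f >> 56);
	}

	/* flags_to_nid(page->flags) compiles; flags_to_nid(0UL) no longer does. */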
This gives us a certain amount of type safety by establishing that some of these unsigned longs are different from other unsigned longs in that they contain things like node ID, section number and zone number in the upper bits. That lets us have functions that can be easily called by anyone who has a slab, folio or page (but not easily by anyone else) to get the node or zone. There are going to be some unusual merge problems with this as some odd bits of the kernel decide they want to print out the flags value or something similar by writing page->flags and now they'll need to write page->flags.f instead. That's most of the churn here. Maybe we should be removing these things from the debug output? This patch (of 11): Wrap the unsigned long flags in a typedef. In upcoming patches, this will provide a strong hint that you can't just pass a random unsigned long to functions which take this as an argument. [willy@infradead.org: s/flags/flags.f/ in several architectures] Link: https://lkml.kernel.org/r/aKMgPRLD-WnkPxYm@casper.infradead.org [nicola.vetrini@gmail.com: mips: fix compilation error] Link: https://lore.kernel.org/lkml/CA+G9fYvkpmqGr6wjBNHY=dRp71PLCoi2341JxOudi60yqaeUdg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20250825214245.1838158-1-nicola.vetrini@gmail.com Link: https://lkml.kernel.org/r/20250805172307.1302730-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250805172307.1302730-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- arch/arc/mm/cache.c | 8 +++--- arch/arc/mm/tlb.c | 2 +- arch/arm/include/asm/hugetlb.h | 2 +- arch/arm/mm/copypage-v4mc.c | 2 +- arch/arm/mm/copypage-v6.c | 2 +- arch/arm/mm/copypage-xscale.c | 2 +- arch/arm/mm/dma-mapping.c | 2 +- arch/arm/mm/fault-armv.c | 2 +- arch/arm/mm/flush.c | 10 +++---- arch/arm64/include/asm/hugetlb.h | 6 ++-- arch/arm64/include/asm/mte.h | 16 +++++------ arch/arm64/mm/flush.c | 8 +++--- arch/csky/abiv1/cacheflush.c | 6 ++-- arch/mips/include/asm/cacheflush.h | 6 ++-- arch/nios2/mm/cacheflush.c | 6 ++-- arch/openrisc/include/asm/cacheflush.h | 2 +- arch/openrisc/mm/cache.c | 2 +- arch/parisc/kernel/cache.c | 6 ++-- arch/powerpc/include/asm/cacheflush.h | 4 +-- arch/powerpc/include/asm/kvm_ppc.h | 4 +-- arch/powerpc/mm/book3s64/hash_utils.c | 4 +-- arch/powerpc/mm/pgtable.c | 12 ++++---- arch/riscv/include/asm/cacheflush.h | 4 +-- arch/riscv/include/asm/hugetlb.h | 2 +- arch/riscv/mm/cacheflush.c | 4 +-- arch/s390/include/asm/hugetlb.h | 2 +- arch/s390/kernel/uv.c | 12 ++++---- arch/s390/mm/gmap.c | 2 +- arch/s390/mm/hugetlbpage.c | 2 +- arch/sh/include/asm/hugetlb.h | 2 +- arch/sh/mm/cache-sh4.c | 2 +- arch/sh/mm/cache-sh7705.c | 2 +- arch/sh/mm/cache.c | 14 ++++----- arch/sh/mm/kmap.c | 2 +- arch/sparc/mm/init_64.c | 10 +++---- arch/x86/mm/pat/memtype.c | 6 ++-- arch/xtensa/mm/cache.c | 12 ++++---- fs/fuse/dev.c | 2 +- fs/gfs2/glops.c | 2 +- fs/jffs2/file.c | 4 +-- fs/nilfs2/page.c | 2 +- fs/proc/page.c | 4 +-- fs/ubifs/file.c | 6 ++-- include/linux/mm.h | 32 ++++++++++----------- include/linux/mm_inline.h | 12 ++++---- include/linux/mm_types.h | 8 ++++-- include/linux/mmzone.h | 2 +- include/linux/page-flags.h | 40 +++++++++++++------------- include/linux/pgalloc_tag.h | 7 +++-- include/trace/events/page_ref.h | 4 +-- mm/filemap.c | 8 +++--- mm/huge_memory.c | 4 +-- mm/memory-failure.c | 12 ++++---- mm/mmzone.c | 4 +-- mm/page_alloc.c | 12 ++++---- mm/swap.c | 8 +++--- mm/vmscan.c | 18 ++++++------
mm/workingset.c | 2 +- 58 files changed, 195 insertions(+), 190 deletions(-) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 9106ceac323c..7d2f93dc1e91 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -704,7 +704,7 @@ static inline void arc_slc_enable(void) void flush_dcache_folio(struct folio *folio) { - clear_bit(PG_dc_clean, &folio->flags); + clear_bit(PG_dc_clean, &folio->flags.f); return; } EXPORT_SYMBOL(flush_dcache_folio); @@ -889,8 +889,8 @@ void copy_user_highpage(struct page *to, struct page *from, copy_page(kto, kfrom); - clear_bit(PG_dc_clean, &dst->flags); - clear_bit(PG_dc_clean, &src->flags); + clear_bit(PG_dc_clean, &dst->flags.f); + clear_bit(PG_dc_clean, &src->flags.f); kunmap_atomic(kto); kunmap_atomic(kfrom); @@ -900,7 +900,7 @@ void clear_user_page(void *to, unsigned long u_vaddr, struct page *page) { struct folio *folio = page_folio(page); clear_page(to); - clear_bit(PG_dc_clean, &folio->flags); + clear_bit(PG_dc_clean, &folio->flags.f); } EXPORT_SYMBOL(clear_user_page); diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index cae4a7aae0ed..ed6915ba76ec 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -488,7 +488,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, */ if (vma->vm_flags & VM_EXEC) { struct folio *folio = page_folio(page); - int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags.f); if (dirty) { unsigned long offset = offset_in_folio(folio, paddr); nr = folio_nr_pages(folio); diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index b766c4b373f6..700055b1ccb3 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -17,7 +17,7 @@ static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index 7ddd82b9fe8b..ed843bb22020 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -67,7 +67,7 @@ void v4_mc_copy_user_highpage(struct page *to, struct page *from, struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + if (!test_and_set_bit(PG_dcache_clean, &src->flags.f)) __flush_dcache_folio(folio_flush_mapping(src), src); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c index a1a71f36d850..0710dba5c0bf 100644 --- a/arch/arm/mm/copypage-v6.c +++ b/arch/arm/mm/copypage-v6.c @@ -73,7 +73,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to, unsigned int offset = CACHE_COLOUR(vaddr); unsigned long kfrom, kto; - if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + if (!test_and_set_bit(PG_dcache_clean, &src->flags.f)) __flush_dcache_folio(folio_flush_mapping(src), src); /* FIXME: not highmem safe */ diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index f1e29d3e8193..e16af68d709f 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -87,7 +87,7 @@ void xscale_mc_copy_user_highpage(struct page *to, struct page *from, struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + if (!test_and_set_bit(PG_dcache_clean, &src->flags.f)) __flush_dcache_folio(folio_flush_mapping(src), src); 
raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 88c2d68a69c9..08641a936394 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -718,7 +718,7 @@ static void __dma_page_dev_to_cpu(struct page *page, unsigned long off, if (size < sz) break; if (!offset) - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); offset = 0; size -= sz; if (!size) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 39fd5df73317..91e488767783 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -203,7 +203,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, folio = page_folio(pfn_to_page(pfn)); mapping = folio_flush_mapping(folio); - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) __flush_dcache_folio(mapping, folio); if (mapping) { if (cache_is_vivt()) diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 5219158d54cf..19470d938b23 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -304,7 +304,7 @@ void __sync_icache_dcache(pte_t pteval) else mapping = NULL; - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) __flush_dcache_folio(mapping, folio); if (pte_exec(pteval)) @@ -343,8 +343,8 @@ void flush_dcache_folio(struct folio *folio) return; if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); return; } @@ -352,14 +352,14 @@ void flush_dcache_folio(struct folio *folio) if (!cache_ops_need_broadcast() && mapping && !folio_mapped(folio)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else { __flush_dcache_folio(mapping, folio); if (mapping && cache_is_vivt()) __flush_dcache_aliases(mapping, folio); else if (mapping) __flush_icache_all(); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL(flush_dcache_folio); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 2a8155c4a882..44c1f757bfcf 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -21,12 +21,12 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); #ifdef CONFIG_ARM64_MTE if (system_supports_mte()) { - clear_bit(PG_mte_tagged, &folio->flags); - clear_bit(PG_mte_lock, &folio->flags); + clear_bit(PG_mte_tagged, &folio->flags.f); + clear_bit(PG_mte_lock, &folio->flags.f); } #endif } diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index 6567df8ec8ca..3b5069f4683d 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -48,12 +48,12 @@ static inline void set_page_mte_tagged(struct page *page) * before the page flags update. 
*/ smp_wmb(); - set_bit(PG_mte_tagged, &page->flags); + set_bit(PG_mte_tagged, &page->flags.f); } static inline bool page_mte_tagged(struct page *page) { - bool ret = test_bit(PG_mte_tagged, &page->flags); + bool ret = test_bit(PG_mte_tagged, &page->flags.f); VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); @@ -82,7 +82,7 @@ static inline bool try_page_mte_tagging(struct page *page) { VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page))); - if (!test_and_set_bit(PG_mte_lock, &page->flags)) + if (!test_and_set_bit(PG_mte_lock, &page->flags.f)) return true; /* @@ -90,7 +90,7 @@ static inline bool try_page_mte_tagging(struct page *page) * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ - smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged)); + smp_cond_load_acquire(&page->flags.f, VAL & (1UL << PG_mte_tagged)); return false; } @@ -173,13 +173,13 @@ static inline void folio_set_hugetlb_mte_tagged(struct folio *folio) * before the folio flags update. */ smp_wmb(); - set_bit(PG_mte_tagged, &folio->flags); + set_bit(PG_mte_tagged, &folio->flags.f); } static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio) { - bool ret = test_bit(PG_mte_tagged, &folio->flags); + bool ret = test_bit(PG_mte_tagged, &folio->flags.f); VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); @@ -196,7 +196,7 @@ static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) { VM_WARN_ON_ONCE(!folio_test_hugetlb(folio)); - if (!test_and_set_bit(PG_mte_lock, &folio->flags)) + if (!test_and_set_bit(PG_mte_lock, &folio->flags.f)) return true; /* @@ -204,7 +204,7 @@ static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio) * already. Check if the PG_mte_tagged flag has been set or wait * otherwise. */ - smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged)); + smp_cond_load_acquire(&folio->flags.f, VAL & (1UL << PG_mte_tagged)); return false; } diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 013eead9b695..fbf08b543c3f 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -53,11 +53,11 @@ void __sync_icache_dcache(pte_t pte) { struct folio *folio = page_folio(pte_page(pte)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { sync_icache_aliases((unsigned long)folio_address(folio), (unsigned long)folio_address(folio) + folio_size(folio)); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL_GPL(__sync_icache_dcache); @@ -69,8 +69,8 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); */ void flush_dcache_folio(struct folio *folio) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } EXPORT_SYMBOL(flush_dcache_folio); diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 171e8fb32285..4bc0aad3cf8a 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -25,12 +25,12 @@ void flush_dcache_folio(struct folio *folio) mapping = folio_flush_mapping(folio); if (mapping && !folio_mapped(folio)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else { dcache_wbinv_all(); if (mapping) icache_inv_all(); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL(flush_dcache_folio); @@ -56,7 +56,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct 
vm_area_struct *vma, return; folio = page_folio(pfn_to_page(pfn)); - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) dcache_wbinv_all(); if (folio_flush_mapping(folio)) { diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index 1f14132b3fc9..5d283ef89d90 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -37,11 +37,11 @@ #define PG_dcache_dirty PG_arch_1 #define folio_test_dcache_dirty(folio) \ - test_bit(PG_dcache_dirty, &(folio)->flags) + test_bit(PG_dcache_dirty, &(folio)->flags.f) #define folio_set_dcache_dirty(folio) \ - set_bit(PG_dcache_dirty, &(folio)->flags) + set_bit(PG_dcache_dirty, &(folio)->flags.f) #define folio_clear_dcache_dirty(folio) \ - clear_bit(PG_dcache_dirty, &(folio)->flags) + clear_bit(PG_dcache_dirty, &(folio)->flags.f) extern void (*flush_cache_all)(void); extern void (*__flush_cache_all)(void); diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c index 0ee9c5f02e08..8321182eb927 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -187,7 +187,7 @@ void flush_dcache_folio(struct folio *folio) /* Flush this page if there are aliases. */ if (mapping && !mapping_mapped(mapping)) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } else { __flush_dcache_folio(folio); if (mapping) { @@ -195,7 +195,7 @@ void flush_dcache_folio(struct folio *folio) flush_aliases(mapping, folio); flush_icache_range(start, start + folio_size(folio)); } - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } EXPORT_SYMBOL(flush_dcache_folio); @@ -227,7 +227,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, return; folio = page_folio(pfn_to_page(pfn)); - if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + if (!test_and_set_bit(PG_dcache_clean, &folio->flags.f)) __flush_dcache_folio(folio); mapping = folio_flush_mapping(folio); diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 0e60af486ec1..cd8f971c0fec 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -75,7 +75,7 @@ static inline void sync_icache_dcache(struct page *page) static inline void flush_dcache_folio(struct folio *folio) { - clear_bit(PG_dc_clean, &folio->flags); + clear_bit(PG_dc_clean, &folio->flags.f); } #define flush_dcache_folio flush_dcache_folio diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c index 0f265b8e73ec..f33df46dae4e 100644 --- a/arch/openrisc/mm/cache.c +++ b/arch/openrisc/mm/cache.c @@ -83,7 +83,7 @@ void update_cache(struct vm_area_struct *vma, unsigned long address, { unsigned long pfn = pte_val(*pte) >> PAGE_SHIFT; struct folio *folio = page_folio(pfn_to_page(pfn)); - int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags.f); /* * Since icaches do not snoop for updated data on OpenRISC, we diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 37ca484cc495..4c5240d3a3c7 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -122,10 +122,10 @@ void __update_cache(pte_t pte) pfn = folio_pfn(folio); nr = folio_nr_pages(folio); if (folio_flush_mapping(folio) && - test_bit(PG_dcache_dirty, &folio->flags)) { + test_bit(PG_dcache_dirty, &folio->flags.f)) { while (nr--) flush_kernel_dcache_page_addr(pfn_va(pfn + 
nr)); - clear_bit(PG_dcache_dirty, &folio->flags); + clear_bit(PG_dcache_dirty, &folio->flags.f); } else if (parisc_requires_coherency()) while (nr--) flush_kernel_dcache_page_addr(pfn_va(pfn + nr)); @@ -481,7 +481,7 @@ void flush_dcache_folio(struct folio *folio) pgoff_t pgoff; if (mapping && !mapping_mapped(mapping)) { - set_bit(PG_dcache_dirty, &folio->flags); + set_bit(PG_dcache_dirty, &folio->flags.f); return; } diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index f2656774aaa9..1fea42928f64 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -40,8 +40,8 @@ static inline void flush_dcache_folio(struct folio *folio) if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) return; /* avoid an atomic op if possible */ - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } #define flush_dcache_folio flush_dcache_folio diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index ca3829d47ab7..0953f2daa466 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -939,9 +939,9 @@ static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) /* Clear i-cache for new pages */ folio = page_folio(pfn_to_page(pfn)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 4693c464fc5a..3aee3af614af 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1562,11 +1562,11 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) folio = page_folio(pte_page(pte)); /* page is dirty */ - if (!test_bit(PG_dcache_clean, &folio->flags) && + if (!test_bit(PG_dcache_clean, &folio->flags.f) && !folio_test_reserved(folio)) { if (trap == INTERRUPT_INST_STORAGE) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } else pp |= HPTE_R_N; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index dfaa9fd86f7e..56d7e8960e77 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -87,9 +87,9 @@ static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) struct folio *folio = maybe_pte_to_folio(pte); if (!folio) return pte; - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } return pte; @@ -127,13 +127,13 @@ static inline pte_t set_pte_filter(pte_t pte, unsigned long addr) return pte; /* If the page clean, we move on */ - if (test_bit(PG_dcache_clean, &folio->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) return pte; /* If it's an exec fault, we flush the cache and make it clean */ if (is_exec_fault()) { flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); return pte; } @@ -175,12 +175,12 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, goto bail; /* If the page is already clean, we move on */ - if (test_bit(PG_dcache_clean, 
&folio->flags)) + if (test_bit(PG_dcache_clean, &folio->flags.f)) goto bail; /* Clean the page and set PG_dcache_clean */ flush_dcache_icache_folio(folio); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); bail: return pte_mkexec(pte); diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 6086b38d5427..0092513c3376 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -23,8 +23,8 @@ static inline void local_flush_icache_range(unsigned long start, static inline void flush_dcache_folio(struct folio *folio) { - if (test_bit(PG_dcache_clean, &folio->flags)) - clear_bit(PG_dcache_clean, &folio->flags); + if (test_bit(PG_dcache_clean, &folio->flags.f)) + clear_bit(PG_dcache_clean, &folio->flags.f); } #define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 446126497768..0872d43fc0c0 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -7,7 +7,7 @@ static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index 4ca5aafce22e..d83a612464f6 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -101,9 +101,9 @@ void flush_icache_pte(struct mm_struct *mm, pte_t pte) { struct folio *folio = page_folio(pte_page(pte)); - if (!test_bit(PG_dcache_clean, &folio->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags.f)) { flush_icache_mm(mm, false); - set_bit(PG_dcache_clean, &folio->flags); + set_bit(PG_dcache_clean, &folio->flags.f); } } #endif /* CONFIG_MMU */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 931fcc413598..69131736daaa 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -39,7 +39,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 47f574cd1728..93b2a01bae40 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -144,7 +144,7 @@ int uv_destroy_folio(struct folio *folio) folio_get(folio); rc = uv_destroy(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); return rc; } @@ -193,7 +193,7 @@ int uv_convert_from_secure_folio(struct folio *folio) folio_get(folio); rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); folio_put(folio); return rc; } @@ -289,7 +289,7 @@ static int __make_folio_secure(struct folio *folio, struct uv_cb_header *uvcb) expected = expected_folio_refs(folio) + 1; if (!folio_ref_freeze(folio, expected)) return -EBUSY; - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); /* * If the UVC does not succeed or fail immediately, we don't want to * loop for long, or we might get stall notifications. @@ -483,18 +483,18 @@ int arch_make_folio_accessible(struct folio *folio) * convert_to_secure. 
* As secure pages are never large folios, both variants can co-exists. */ - if (!test_bit(PG_arch_1, &folio->flags)) + if (!test_bit(PG_arch_1, &folio->flags.f)) return 0; rc = uv_pin_shared(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } rc = uv_convert_from_secure(folio_to_phys(folio)); if (!rc) { - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); return 0; } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index c7defe4ed1f6..8ff6bba107e8 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2272,7 +2272,7 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, start = pmd_val(*pmd) & HPAGE_MASK; end = start + HPAGE_SIZE; __storage_key_init_range(start, end); - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); cond_resched(); return 0; } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index e88c02c9e642..72e8fa136af5 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -155,7 +155,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) paddr = rste & PMD_MASK; } - if (!test_and_set_bit(PG_arch_1, &folio->flags)) + if (!test_and_set_bit(PG_arch_1, &folio->flags.f)) __storage_key_init_range(paddr, paddr + size); } diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 4a92e6e4d627..974512f359f0 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -14,7 +14,7 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, static inline void arch_clear_hugetlb_flags(struct folio *folio) { - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 46393b00137e..83fb34b39ca7 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -114,7 +114,7 @@ static void sh4_flush_dcache_folio(void *arg) struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else #endif { diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c index b509a407588f..71f8be9fc8e0 100644 --- a/arch/sh/mm/cache-sh7705.c +++ b/arch/sh/mm/cache-sh7705.c @@ -138,7 +138,7 @@ static void sh7705_flush_dcache_folio(void *arg) struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); else { unsigned long pfn = folio_pfn(folio); unsigned int i, nr = folio_nr_pages(folio); diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index 6ebdeaff3021..c3f028bed049 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c @@ -64,14 +64,14 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, struct folio *folio = page_folio(page); if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && - test_bit(PG_dcache_clean, &folio->flags)) { + test_bit(PG_dcache_clean, &folio->flags.f)) { void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(vto, src, len); kunmap_coherent(vto); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } if (vma->vm_flags & VM_EXEC) @@ -85,14 
+85,14 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, struct folio *folio = page_folio(page); if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && - test_bit(PG_dcache_clean, &folio->flags)) { + test_bit(PG_dcache_clean, &folio->flags.f)) { void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(dst, vfrom, len); kunmap_coherent(vfrom); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &folio->flags); + clear_bit(PG_dcache_clean, &folio->flags.f); } } @@ -105,7 +105,7 @@ void copy_user_highpage(struct page *to, struct page *from, vto = kmap_atomic(to); if (boot_cpu_data.dcache.n_aliases && folio_mapped(src) && - test_bit(PG_dcache_clean, &src->flags)) { + test_bit(PG_dcache_clean, &src->flags.f)) { vfrom = kmap_coherent(from, vaddr); copy_page(vto, vfrom); kunmap_coherent(vfrom); @@ -148,7 +148,7 @@ void __update_cache(struct vm_area_struct *vma, if (pfn_valid(pfn)) { struct folio *folio = page_folio(pfn_to_page(pfn)); - int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags); + int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags.f); if (dirty) __flush_purge_region(folio_address(folio), folio_size(folio)); @@ -162,7 +162,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr) if (pages_do_alias(addr, vmaddr)) { if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && - test_bit(PG_dcache_clean, &folio->flags)) { + test_bit(PG_dcache_clean, &folio->flags.f)) { void *kaddr; kaddr = kmap_coherent(page, vmaddr); diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c index fa50e8f6e7a9..c9f32d5a54b8 100644 --- a/arch/sh/mm/kmap.c +++ b/arch/sh/mm/kmap.c @@ -31,7 +31,7 @@ void *kmap_coherent(struct page *page, unsigned long addr) enum fixed_addresses idx; unsigned long vaddr; - BUG_ON(!test_bit(PG_dcache_clean, &folio->flags)); + BUG_ON(!test_bit(PG_dcache_clean, &folio->flags.f)); preempt_disable(); pagefault_disable(); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 7ed58bf3aaca..df9f7c444c39 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -224,7 +224,7 @@ inline void flush_dcache_folio_impl(struct folio *folio) ((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL) #define dcache_dirty_cpu(folio) \ - (((folio)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) + (((folio)->flags.f >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) static inline void set_dcache_dirty(struct folio *folio, int this_cpu) { @@ -243,7 +243,7 @@ static inline void set_dcache_dirty(struct folio *folio, int this_cpu) "bne,pn %%xcc, 1b\n\t" " nop" : /* no outputs */ - : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags) + : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags.f) : "g1", "g7"); } @@ -265,7 +265,7 @@ static inline void clear_dcache_dirty_cpu(struct folio *folio, unsigned long cpu " nop\n" "2:" : /* no outputs */ - : "r" (cpu), "r" (mask), "r" (&folio->flags), + : "r" (cpu), "r" (mask), "r" (&folio->flags.f), "i" (PG_dcache_cpu_mask), "i" (PG_dcache_cpu_shift) : "g1", "g7"); @@ -292,7 +292,7 @@ static void flush_dcache(unsigned long pfn) struct folio *folio = page_folio(page); unsigned long pg_flags; - pg_flags = folio->flags; + pg_flags = folio->flags.f; if (pg_flags & (1UL << PG_dcache_dirty)) { int cpu = ((pg_flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask); @@ -480,7 +480,7 @@ void flush_dcache_folio(struct folio *folio) mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) { - bool dirty = test_bit(PG_dcache_dirty, &folio->flags); + bool dirty = test_bit(PG_dcache_dirty, &folio->flags.f); if (dirty) {
int dirty_cpu = dcache_dirty_cpu(folio); diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index c09284302dd3..b68200a0e0c6 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -126,7 +126,7 @@ __setup("debugpat", pat_debug_setup); static inline enum page_cache_mode get_page_memtype(struct page *pg) { - unsigned long pg_flags = pg->flags & _PGMT_MASK; + unsigned long pg_flags = pg->flags.f & _PGMT_MASK; if (pg_flags == _PGMT_WB) return _PAGE_CACHE_MODE_WB; @@ -161,10 +161,10 @@ static inline void set_page_memtype(struct page *pg, break; } - old_flags = READ_ONCE(pg->flags); + old_flags = READ_ONCE(pg->flags.f); do { new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; - } while (!try_cmpxchg(&pg->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&pg->flags.f, &old_flags, new_flags)); } #else static inline enum page_cache_mode get_page_memtype(struct page *pg) diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 23be0e7516ce..5354df52d61f 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -134,8 +134,8 @@ void flush_dcache_folio(struct folio *folio) */ if (mapping && !mapping_mapped(mapping)) { - if (!test_bit(PG_arch_1, &folio->flags)) - set_bit(PG_arch_1, &folio->flags); + if (!test_bit(PG_arch_1, &folio->flags.f)) + set_bit(PG_arch_1, &folio->flags.f); return; } else { @@ -232,7 +232,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, #if (DCACHE_WAY_SIZE > PAGE_SIZE) - if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags)) { + if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags.f)) { unsigned long phys = folio_pfn(folio) * PAGE_SIZE; unsigned long tmp; @@ -247,10 +247,10 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, } preempt_enable(); - clear_bit(PG_arch_1, &folio->flags); + clear_bit(PG_arch_1, &folio->flags.f); } #else - if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags) + if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags.f) && (vma->vm_flags & VM_EXEC) != 0) { for (i = 0; i < nr; i++) { void *paddr = kmap_local_folio(folio, i * PAGE_SIZE); @@ -258,7 +258,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, __invalidate_icache_page((unsigned long)paddr); kunmap_local(paddr); } - set_bit(PG_arch_1, &folio->flags); + set_bit(PG_arch_1, &folio->flags.f); } #endif } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index e80cd8f2c049..8a89f0aa1d4d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -935,7 +935,7 @@ static int fuse_check_folio(struct folio *folio) { if (folio_mapped(folio) || folio->mapping != NULL || - (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & + (folio->flags.f & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | 1 << PG_lru | diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index fe0faad4892f..0c0a80b3baca 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -40,7 +40,7 @@ static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page " "state 0x%lx\n", bh, (unsigned long long)bh->b_blocknr, bh->b_state, - bh->b_folio->mapping, bh->b_folio->flags); + bh->b_folio->mapping, bh->b_folio->flags.f); fs_err(sdp, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index dd3dff95cb24..b697f3c259ef 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ 
-230,7 +230,7 @@ static int jffs2_write_begin(const struct kiocb *iocb, goto release_sem; } } - jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags); + jffs2_dbg(1, "end write_begin(). folio->flags %lx\n", folio->flags.f); release_sem: mutex_unlock(&c->alloc_sem); @@ -259,7 +259,7 @@ static int jffs2_write_end(const struct kiocb *iocb, jffs2_dbg(1, "%s(): ino #%lu, page at 0x%llx, range %d-%d, flags %lx\n", __func__, inode->i_ino, folio_pos(folio), - start, end, folio->flags); + start, end, folio->flags.f); /* We need to avoid deadlock with page_cache_read() in jffs2_garbage_collect_pass(). So the folio must be diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 806b056d2260..56c4da417b6a 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -167,7 +167,7 @@ void nilfs_folio_bug(struct folio *folio) printk(KERN_CRIT "NILFS_FOLIO_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", folio, folio_ref_count(folio), - (unsigned long long)folio->index, folio->flags, m, ino); + (unsigned long long)folio->index, folio->flags.f, m, ino); head = folio_buffers(folio); if (head) { diff --git a/fs/proc/page.c b/fs/proc/page.c index ba3568e97fd1..771e0b6bc630 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -163,7 +163,7 @@ u64 stable_page_flags(const struct page *page) snapshot_page(&ps, page); folio = &ps.folio_snapshot; - k = folio->flags; + k = folio->flags.f; mapping = (unsigned long)folio->mapping; is_anon = mapping & FOLIO_MAPPING_ANON; @@ -238,7 +238,7 @@ u64 stable_page_flags(const struct page *page) if (u & (1 << KPF_HUGE)) u |= kpf_copy_bit(k, KPF_HWPOISON, PG_hwpoison); else - u |= kpf_copy_bit(ps.page_snapshot.flags, KPF_HWPOISON, PG_hwpoison); + u |= kpf_copy_bit(ps.page_snapshot.flags.f, KPF_HWPOISON, PG_hwpoison); #endif u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e75a6cec67be..ca41ce8208c4 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -107,7 +107,7 @@ static int do_readpage(struct folio *folio) size_t offset = 0; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, folio->index, i_size, folio->flags); + inode->i_ino, folio->index, i_size, folio->flags.f); ubifs_assert(c, !folio_test_checked(folio)); ubifs_assert(c, !folio->private); @@ -600,7 +600,7 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, pgoff_t end_index; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", - inode->i_ino, folio->index, i_size, folio->flags); + inode->i_ino, folio->index, i_size, folio->flags.f); end_index = (i_size - 1) >> PAGE_SHIFT; if (!i_size || folio->index > end_index) { @@ -988,7 +988,7 @@ static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc) int err, len = folio_size(folio); dbg_gen("ino %lu, pg %lu, pg flags %#lx", - inode->i_ino, folio->index, folio->flags); + inode->i_ino, folio->index, folio->flags.f); ubifs_assert(c, folio->private != NULL); /* Is the folio fully outside @i_size? 
(truncate in progress) */ diff --git a/include/linux/mm.h b/include/linux/mm.h index b61e2d4858cf..da562f23f50c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1024,7 +1024,7 @@ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; - if (!test_bit(PG_head, &folio->flags)) + if (!test_bit(PG_head, &folio->flags.f)) return 0; return folio_large_order(folio); } @@ -1554,7 +1554,7 @@ static inline bool is_nommu_shared_mapping(vm_flags_t flags) */ static inline int page_zone_id(struct page *page) { - return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; + return (page->flags.f >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS @@ -1562,7 +1562,7 @@ int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { - return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; + return (PF_POISONED_CHECK(page)->flags.f >> NODES_PGSHIFT) & NODES_MASK; } #endif @@ -1637,14 +1637,14 @@ static inline void page_cpupid_reset_last(struct page *page) #else static inline int folio_last_cpupid(struct folio *folio) { - return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; + return (folio->flags.f >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { - page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; + page->flags.f |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ @@ -1740,7 +1740,7 @@ static inline u8 page_kasan_tag(const struct page *page) u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { - tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; + tag = (page->flags.f >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } @@ -1755,12 +1755,12 @@ static inline void page_kasan_tag_set(struct page *page, u8 tag) return; tag ^= 0xff; - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(page->flags.f); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) @@ -1804,13 +1804,13 @@ static inline pg_data_t *folio_pgdat(const struct folio *folio) #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { - page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); - page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; + page->flags.f &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); + page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long page_to_section(const struct page *page) { - return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; + return (page->flags.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif @@ -2015,14 +2015,14 @@ static inline bool folio_is_longterm_pinnable(struct folio *folio) static inline void set_page_zone(struct page *page, enum zone_type zone) { - page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); - page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; + page->flags.f &= ~(ZONES_MASK << ZONES_PGSHIFT); + page->flags.f |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { - page->flags &= ~(NODES_MASK << NODES_PGSHIFT); - page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; 
+ page->flags.f &= ~(NODES_MASK << NODES_PGSHIFT); + page->flags.f |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, @@ -2064,7 +2064,7 @@ static inline long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; - if (!test_bit(PG_head, &folio->flags)) + if (!test_bit(PG_head, &folio->flags.f)) return 1; return folio_large_nr_pages(folio); } diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 89b518ff097e..150302b4a905 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -143,7 +143,7 @@ static inline int lru_tier_from_refs(int refs, bool workingset) static inline int folio_lru_refs(struct folio *folio) { - unsigned long flags = READ_ONCE(folio->flags); + unsigned long flags = READ_ONCE(folio->flags.f); if (!(flags & BIT(PG_referenced))) return 0; @@ -156,7 +156,7 @@ static inline int folio_lru_refs(struct folio *folio) static inline int folio_lru_gen(struct folio *folio) { - unsigned long flags = READ_ONCE(folio->flags); + unsigned long flags = READ_ONCE(folio->flags.f); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } @@ -268,7 +268,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + set_mask_bits(&folio->flags.f, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ @@ -293,7 +293,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; - flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); + flags = set_mask_bits(&folio->flags.f, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); @@ -304,9 +304,9 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, static inline void folio_migrate_refs(struct folio *new, struct folio *old) { - unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; + unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK; - set_mask_bits(&new->flags, LRU_REFS_MASK, refs); + set_mask_bits(&new->flags.f, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d247da2fdb52..d934a3a5b443 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -34,6 +34,10 @@ struct address_space; struct futex_private_hash; struct mem_cgroup; +typedef struct { + unsigned long f; +} memdesc_flags_t; + /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -72,7 +76,7 @@ struct mem_cgroup; #endif struct page { - unsigned long flags; /* Atomic flags, some possibly + memdesc_flags_t flags; /* Atomic flags, some possibly * updated asynchronously */ /* * Five words (20/40 bytes) are available in this union. 
@@ -383,7 +387,7 @@ struct folio { union { struct { /* public: */ - unsigned long flags; + memdesc_flags_t flags; union { struct list_head lru; /* private: avoid cluttering the output */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9d3ea9085556..990560cd99ee 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1186,7 +1186,7 @@ static inline bool zone_is_empty(struct zone *zone) static inline enum zone_type page_zonenum(const struct page *page) { ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; + return (page->flags.f >> ZONES_PGSHIFT) & ZONES_MASK; } static inline enum zone_type folio_zonenum(const struct folio *folio) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8d3fa3a91ce4..d53a86e68c89 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -217,7 +217,7 @@ static __always_inline const struct page *page_fixed_fake_head(const struct page * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && - test_bit(PG_head, &page->flags)) { + test_bit(PG_head, &page->flags.f)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least @@ -325,14 +325,14 @@ static __always_inline int PageTail(const struct page *page) static __always_inline int PageCompound(const struct page *page) { - return test_bit(PG_head, &page->flags) || + return test_bit(PG_head, &page->flags.f) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { - return READ_ONCE(page->flags) == PAGE_POISON_PATTERN; + return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM @@ -349,8 +349,8 @@ static const unsigned long *const_folio_flags(const struct folio *folio, const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); - VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); - return &page[n].flags; + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); + return &page[n].flags.f; } static unsigned long *folio_flags(struct folio *folio, unsigned n) @@ -358,8 +358,8 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n) struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); - VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); - return &page[n].flags; + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); + return &page[n].flags.f; } /* @@ -449,37 +449,37 @@ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ -{ return test_bit(PG_##lname, &policy(page, 0)->flags); } +{ return test_bit(PG_##lname, &policy(page, 0)->flags.f); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ -{ set_bit(PG_##lname, &policy(page, 1)->flags); } +{ set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ -{ clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static 
__always_inline void __SetPage##uname(struct page *page) \ -{ __set_bit(PG_##lname, &policy(page, 1)->flags); } +{ __set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ -{ __clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ __clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ -{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ -{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } +{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ @@ -846,7 +846,7 @@ static __always_inline bool folio_test_head(const struct folio *folio) static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); - return test_bit(PG_head, &page->flags) && !page_is_fake_head(page); + return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) @@ -1170,28 +1170,28 @@ static __always_inline int PageAnonExclusive(const struct page *page) */ if (PageHuge(page)) page = compound_head(page); - return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); - __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); + __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } #ifdef CONFIG_MMU @@ -1241,7 +1241,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ static inline int folio_has_private(const struct folio *folio) { - return !!(folio->flags & PAGE_FLAGS_PRIVATE); + return !!(folio->flags.f & PAGE_FLAGS_PRIVATE); } #undef PF_ANY diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 8a7f4f802c57..38a82d65e58e 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -107,7 +107,8 @@ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, if (static_key_enabled(&mem_profiling_compressed)) { pgalloc_tag_idx idx; - idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask; + idx = (page->flags.f >> alloc_tag_ref_offs) & + alloc_tag_ref_mask; idx_to_ref(idx, ref); handle->page = page; } else { @@ -149,11 +150,11 @@ static inline void update_page_tag_ref(union pgtag_ref_handle 
handle, union code idx = (unsigned long)ref_to_idx(ref); idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs; do { - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(page->flags.f); flags = old_flags; flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs); flags |= idx; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&page->flags.f, &old_flags, flags))); } else { if (WARN_ON(!handle.ref || !ref)) return; diff --git a/include/trace/events/page_ref.h b/include/trace/events/page_ref.h index fe33a255b7d0..ea6b5c4baf3d 100644 --- a/include/trace/events/page_ref.h +++ b/include/trace/events/page_ref.h @@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_template, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->flags = page->flags; + __entry->flags = page->flags.f; __entry->count = page_ref_count(page); __entry->mapcount = atomic_read(&page->_mapcount); __entry->mapping = page->mapping; @@ -77,7 +77,7 @@ DECLARE_EVENT_CLASS(page_ref_mod_and_test_template, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->flags = page->flags; + __entry->flags = page->flags.f; __entry->count = page_ref_count(page); __entry->mapcount = atomic_read(&page->_mapcount); __entry->mapping = page->mapping; diff --git a/mm/filemap.c b/mm/filemap.c index 1a388b11cfa9..f3a6c24897f4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1140,10 +1140,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, */ flags = wait->flags; if (flags & WQ_FLAG_EXCLUSIVE) { - if (test_bit(key->bit_nr, &key->folio->flags)) + if (test_bit(key->bit_nr, &key->folio->flags.f)) return -1; if (flags & WQ_FLAG_CUSTOM) { - if (test_and_set_bit(key->bit_nr, &key->folio->flags)) + if (test_and_set_bit(key->bit_nr, &key->folio->flags.f)) return -1; flags |= WQ_FLAG_DONE; } @@ -1226,9 +1226,9 @@ static inline bool folio_trylock_flag(struct folio *folio, int bit_nr, struct wait_queue_entry *wait) { if (wait->flags & WQ_FLAG_EXCLUSIVE) { - if (test_and_set_bit(bit_nr, &folio->flags)) + if (test_and_set_bit(bit_nr, &folio->flags.f)) return false; - } else if (test_bit(bit_nr, &folio->flags)) + } else if (test_bit(bit_nr, &folio->flags.f)) return false; wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d89992b65acc..aac5f0a2cb54 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3303,8 +3303,8 @@ static void __split_folio_to_order(struct folio *folio, int old_order, * unreferenced sub-pages of an anonymous THP: we can simply drop * PG_anon_exclusive (-> PG_mappedtodisk) for these here. */ - new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - new_folio->flags |= (folio->flags & + new_folio->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; + new_folio->flags.f |= (folio->flags.f & ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_swapcache) | diff --git a/mm/memory-failure.c b/mm/memory-failure.c index fc30ca4804bf..c15ffee7d32b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1707,10 +1707,10 @@ static int identify_page_state(unsigned long pfn, struct page *p, * carried out only if the first check can't determine the page status. 
*/ for (ps = error_states;; ps++) - if ((p->flags & ps->mask) == ps->res) + if ((p->flags.f & ps->mask) == ps->res) break; - page_flags |= (p->flags & (1UL << PG_dirty)); + page_flags |= (p->flags.f & (1UL << PG_dirty)); if (!ps->mask) for (ps = error_states;; ps++) @@ -2137,7 +2137,7 @@ retry: return action_result(pfn, MF_MSG_FREE_HUGE, res); } - page_flags = folio->flags; + page_flags = folio->flags.f; if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); @@ -2398,7 +2398,7 @@ try_again: * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page * status correctly, we save a copy of the page flags at this time. */ - page_flags = folio->flags; + page_flags = folio->flags.f; /* * __munlock_folio() may clear a writeback folio's LRU flag without @@ -2744,13 +2744,13 @@ static int soft_offline_in_use_page(struct page *page) putback_movable_pages(&pagelist); pr_info("%#lx: %s migration failed %ld, type %pGp\n", - pfn, msg_page[huge], ret, &page->flags); + pfn, msg_page[huge], ret, &page->flags.f); if (ret > 0) ret = -EBUSY; } } else { pr_info("%#lx: %s isolation failed, page count %d, type %pGp\n", - pfn, msg_page[huge], page_count(page), &page->flags); + pfn, msg_page[huge], page_count(page), &page->flags.f); ret = -EBUSY; } return ret; diff --git a/mm/mmzone.c b/mm/mmzone.c index f9baa8882fbf..0c8f181d9d50 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -99,14 +99,14 @@ int folio_xchg_last_cpupid(struct folio *folio, int cpupid) unsigned long old_flags, flags; int last_cpupid; - old_flags = READ_ONCE(folio->flags); + old_flags = READ_ONCE(folio->flags.f); do { flags = old_flags; last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&folio->flags.f, &old_flags, flags))); return last_cpupid; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2ee21e46f0fb..ca9e6b9633f7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -950,7 +950,7 @@ static inline void __free_one_page(struct page *page, bool to_tail; VM_BUG_ON(!zone_is_initialized(zone)); - VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); + VM_BUG_ON_PAGE(page->flags.f & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); @@ -1043,7 +1043,7 @@ static inline bool page_expected_state(struct page *page, page->memcg_data | #endif page_pool_page_is_pp(page) | - (page->flags & check_flags))) + (page->flags.f & check_flags))) return false; return true; @@ -1059,7 +1059,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _refcount"; - if (unlikely(page->flags & flags)) { + if (unlikely(page->flags.f & flags)) { if (flags == PAGE_FLAGS_CHECK_AT_PREP) bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; else @@ -1358,7 +1358,7 @@ __always_inline bool free_pages_prepare(struct page *page, int i; if (compound) { - page[1].flags &= ~PAGE_FLAGS_SECOND; + page[1].flags.f &= ~PAGE_FLAGS_SECOND; #ifdef NR_PAGES_IN_LARGE_FOLIO folio->_nr_pages = 0; #endif @@ -1372,7 +1372,7 @@ __always_inline bool free_pages_prepare(struct page *page, continue; } } - (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + (page + i)->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; } } if (folio_test_anon(folio)) { @@ -1391,7 +1391,7 @@ 
__always_inline bool free_pages_prepare(struct page *page, } page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page->flags.f &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); page_table_check_free(page, order); pgalloc_tag_sub(page, 1 << order); diff --git a/mm/swap.c b/mm/swap.c index cb164f9ef9e3..6dd22a904b37 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -387,14 +387,14 @@ static void __lru_cache_activate_folio(struct folio *folio) static void lru_gen_inc_refs(struct folio *folio) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return; } @@ -406,7 +406,7 @@ static void lru_gen_inc_refs(struct folio *folio) } new_flags = old_flags + BIT(LRU_REFS_PGOFF); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) @@ -418,7 +418,7 @@ static bool lru_gen_clear_refs(struct folio *folio) if (gen < 0) return true; - set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ diff --git a/mm/vmscan.c b/mm/vmscan.c index b9a1cfeb2ddf..e336577c4454 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -888,11 +888,11 @@ static bool lru_gen_set_refs(struct folio *folio) { /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return false; } - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_workingset)); return true; } #else @@ -3257,13 +3257,13 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) /* promote pages accessed through page tables */ static int folio_update_gen(struct folio *folio, int gen) { - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { - set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, BIT(PG_referenced)); return -1; } @@ -3274,7 +3274,7 @@ static int folio_update_gen(struct folio *folio, int gen) new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } @@ -3285,7 +3285,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + unsigned long new_flags, old_flags = 
READ_ONCE(folio->flags.f); VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); @@ -3302,7 +3302,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai /* for folio_end_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); - } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + } while (!try_cmpxchg(&folio->flags.f, &old_flags, new_flags)); lru_gen_update_size(lruvec, folio, old_gen, new_gen); @@ -4553,7 +4553,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); @@ -4766,7 +4766,7 @@ retry: /* don't add rejected folios to the oldest generation */ if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) - set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); + set_mask_bits(&folio->flags.f, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); diff --git a/mm/workingset.c b/mm/workingset.c index 6e7f4cb1b9a7..68a76a91111f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -318,7 +318,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); } else - set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); + set_mask_bits(&folio->flags.f, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } From 56d578c1300f7efe9605b75714173dd3fda16fe2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:52 +0100 Subject: [PATCH 101/372] mm: convert page_to_section() to memdesc_section() Pass in the memdesc_flags_t instead of a pointer to the page. This will allow us to remove a few conversions to struct page in upcoming patches. 
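As an illustrative sketch of the conversion pattern (the caller below is
hypothetical, not part of this patch), code that used to hand over a
struct page now hands over just the flags word:

	unsigned long sec;

	sec = page_to_section(page);		/* before: needs the struct page */
	sec = memdesc_section(page->flags);	/* after: the flags word suffices */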
Link: https://lkml.kernel.org/r/20250805172307.1302730-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/asm-generic/memory_model.h | 2 +- include/linux/mm.h | 4 ++-- mm/sparse.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 74d0077cc5fa..efa6610acbc7 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -53,7 +53,7 @@ static inline int pfn_valid(unsigned long pfn) */ #define __page_to_pfn(pg) \ ({ const struct page *__pg = (pg); \ - int __sec = page_to_section(__pg); \ + int __sec = memdesc_section(__pg->flags); \ (unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \ }) diff --git a/include/linux/mm.h b/include/linux/mm.h index da562f23f50c..82617c4cfa24 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1808,9 +1808,9 @@ static inline void set_page_section(struct page *page, unsigned long section) page->flags.f |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } -static inline unsigned long page_to_section(const struct page *page) +static inline unsigned long memdesc_section(memdesc_flags_t mdf) { - return (page->flags.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; + return (mdf.f >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif diff --git a/mm/sparse.c b/mm/sparse.c index e6075b622407..7cb42cbfc7f9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -45,7 +45,7 @@ static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; int page_to_nid(const struct page *page) { - return section_to_node_table[page_to_section(page)]; + return section_to_node_table[memdesc_section(page->flags)]; } EXPORT_SYMBOL(page_to_nid); From eb00fdd84ddabd6948d26595bb5e8c1302220d37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:53 +0100 Subject: [PATCH 102/372] mm: introduce memdesc_nid() Remove a conversion from folio to page by passing the folio->flags (which are a copy of the page->flags) to the new memdesc_nid() function. 
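For illustration (a hypothetical caller, mirroring the folio_nid()
change below), resolving the node no longer needs a struct page at all:

	/* before: page_to_nid(&folio->page); after: */
	int nid = memdesc_nid(folio->flags);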
Link: https://lkml.kernel.org/r/20250805172307.1302730-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mm.h | 25 +++++++++++++++---------- mm/sparse.c | 6 +++--- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 82617c4cfa24..00c8a54127d3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1558,17 +1558,22 @@ static inline int page_zone_id(struct page *page) } #ifdef NODE_NOT_IN_PAGE_FLAGS -int page_to_nid(const struct page *page); +int memdesc_nid(memdesc_flags_t mdf); #else -static inline int page_to_nid(const struct page *page) +static inline int memdesc_nid(memdesc_flags_t mdf) { - return (PF_POISONED_CHECK(page)->flags.f >> NODES_PGSHIFT) & NODES_MASK; + return (mdf.f >> NODES_PGSHIFT) & NODES_MASK; } #endif +static inline int page_to_nid(const struct page *page) +{ + return memdesc_nid(PF_POISONED_CHECK(page)->flags); +} + static inline int folio_nid(const struct folio *folio) { - return page_to_nid(&folio->page); + return memdesc_nid(folio->flags); } #ifdef CONFIG_NUMA_BALANCING @@ -1791,14 +1796,14 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } -static inline struct zone *folio_zone(const struct folio *folio) -{ - return page_zone(&folio->page); -} - static inline pg_data_t *folio_pgdat(const struct folio *folio) { - return page_pgdat(&folio->page); + return NODE_DATA(folio_nid(folio)); +} + +static inline struct zone *folio_zone(const struct folio *folio) +{ + return &folio_pgdat(folio)->node_zones[folio_zonenum(folio)]; } #ifdef SECTION_IN_PAGE_FLAGS diff --git a/mm/sparse.c b/mm/sparse.c index 7cb42cbfc7f9..17c50a6415c2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -43,11 +43,11 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -int page_to_nid(const struct page *page) +int memdesc_nid(memdesc_flags_t mdf) { - return section_to_node_table[memdesc_section(page->flags)]; + return section_to_node_table[memdesc_section(mdf)]; } -EXPORT_SYMBOL(page_to_nid); +EXPORT_SYMBOL(memdesc_nid); static void set_section_nid(unsigned long section_nr, int nid) { From 4aff03fbe508780394039053bebfc4f4800b286e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:54 +0100 Subject: [PATCH 103/372] mm: introduce memdesc_zonenum() Remove a conversion from folio to page by passing the folio->flags (which are a copy of the page->flags) to the new memdesc_zonenum() function. 
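A sketch of the resulting call sites (illustrative only): page and folio
users each pass their own copy of the flags:

	enum zone_type zt  = memdesc_zonenum(page->flags);	/* page caller */
	enum zone_type zt2 = memdesc_zonenum(folio->flags);	/* folio caller */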
Link: https://lkml.kernel.org/r/20250805172307.1302730-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 990560cd99ee..80a3b6642603 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1183,15 +1183,20 @@ static inline bool zone_is_empty(struct zone *zone) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) +static inline enum zone_type memdesc_zonenum(memdesc_flags_t flags) +{ + ASSERT_EXCLUSIVE_BITS(flags.f, ZONES_MASK << ZONES_PGSHIFT); + return (flags.f >> ZONES_PGSHIFT) & ZONES_MASK; +} + static inline enum zone_type page_zonenum(const struct page *page) { - ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); - return (page->flags.f >> ZONES_PGSHIFT) & ZONES_MASK; + return memdesc_zonenum(page->flags); } static inline enum zone_type folio_zonenum(const struct folio *folio) { - return page_zonenum(&folio->page); + return memdesc_zonenum(folio->flags); } #ifdef CONFIG_ZONE_DEVICE From 87479378acdd79a0ac84be0b823e37e3816433d9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:55 +0100 Subject: [PATCH 104/372] slab: use memdesc_flags_t The slab flags are memdesc flags and contain the same information in the upper bits as the other memdescs (like node ID). Link: https://lkml.kernel.org/r/20250805172307.1302730-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/slab.h | 2 +- mm/slub.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 248b34c839b7..7757331e7c80 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -50,7 +50,7 @@ typedef union { /* Reuses the bits in struct page */ struct slab { - unsigned long flags; + memdesc_flags_t flags; struct kmem_cache *slab_cache; union { diff --git a/mm/slub.c b/mm/slub.c index 8dbeabc6a0f0..af343ca570b5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -657,17 +657,17 @@ static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) */ static inline bool slab_test_pfmemalloc(const struct slab *slab) { - return test_bit(SL_pfmemalloc, &slab->flags); + return test_bit(SL_pfmemalloc, &slab->flags.f); } static inline void slab_set_pfmemalloc(struct slab *slab) { - set_bit(SL_pfmemalloc, &slab->flags); + set_bit(SL_pfmemalloc, &slab->flags.f); } static inline void __slab_clear_pfmemalloc(struct slab *slab) { - __clear_bit(SL_pfmemalloc, &slab->flags); + __clear_bit(SL_pfmemalloc, &slab->flags.f); } /* @@ -675,12 +675,12 @@ static inline void __slab_clear_pfmemalloc(struct slab *slab) */ static __always_inline void slab_lock(struct slab *slab) { - bit_spin_lock(SL_locked, &slab->flags); + bit_spin_lock(SL_locked, &slab->flags.f); } static __always_inline void slab_unlock(struct slab *slab) { - bit_spin_unlock(SL_locked, &slab->flags); + bit_spin_unlock(SL_locked, &slab->flags.f); } static inline bool @@ -1046,7 +1046,7 @@ static void print_slab_info(const struct slab *slab) { pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", slab, slab->objects, slab->inuse, slab->freelist, - &slab->flags); + &slab->flags.f); } void skip_orig_size_check(struct kmem_cache *s, const void *object) @@ -2755,17 +2755,17 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) 
static inline bool slab_test_node_partial(const struct slab *slab) { - return test_bit(SL_partial, &slab->flags); + return test_bit(SL_partial, &slab->flags.f); } static inline void slab_set_node_partial(struct slab *slab) { - set_bit(SL_partial, &slab->flags); + set_bit(SL_partial, &slab->flags.f); } static inline void slab_clear_node_partial(struct slab *slab) { - clear_bit(SL_partial, &slab->flags); + clear_bit(SL_partial, &slab->flags.f); } /* From 11afccce2ac5fd1ea5e6f0d251e746df782c8cfe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:56 +0100 Subject: [PATCH 105/372] slab: use memdesc_nid() We no longer need to convert from slab to folio to get the nid, we can ask memdesc_nid() for the nid directly. Link: https://lkml.kernel.org/r/20250805172307.1302730-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 7757331e7c80..c41a512dd07c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -174,12 +174,12 @@ static inline void *slab_address(const struct slab *slab) static inline int slab_nid(const struct slab *slab) { - return folio_nid(slab_folio(slab)); + return memdesc_nid(slab->flags); } static inline pg_data_t *slab_pgdat(const struct slab *slab) { - return folio_pgdat(slab_folio(slab)); + return NODE_DATA(slab_nid(slab)); } static inline struct slab *virt_to_slab(const void *addr) From 89ef6ad6fa849b780b5a5caae9068261603e1738 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:57 +0100 Subject: [PATCH 106/372] mm: introduce memdesc_is_zone_device() Remove the conversion from folio to page in folio_is_zone_device() by introducing memdesc_is_zone_device() which takes a memdesc_flags_t from either a page or a folio. 
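Illustrative call sites (hypothetical, matching the wrappers below): any
memdesc can answer the question from its own flags, with no conversion
between page and folio:

	bool pdev = memdesc_is_zone_device(page->flags);	/* page caller */
	bool fdev = memdesc_is_zone_device(folio->flags);	/* folio caller */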
Link: https://lkml.kernel.org/r/20250805172307.1302730-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 80a3b6642603..fe13ad175fed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1200,14 +1200,14 @@ static inline enum zone_type folio_zonenum(const struct folio *folio) } #ifdef CONFIG_ZONE_DEVICE -static inline bool is_zone_device_page(const struct page *page) +static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { - return page_zonenum(page) == ZONE_DEVICE; + return memdesc_zonenum(mdf) == ZONE_DEVICE; } static inline struct dev_pagemap *page_pgmap(const struct page *page) { - VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page); + VM_WARN_ON_ONCE_PAGE(!memdesc_is_zone_device(page->flags), page); return page_folio(page)->pgmap; } @@ -1222,9 +1222,9 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page) static inline bool zone_device_pages_have_same_pgmap(const struct page *a, const struct page *b) { - if (is_zone_device_page(a) != is_zone_device_page(b)) + if (memdesc_is_zone_device(a->flags) != memdesc_is_zone_device(b->flags)) return false; - if (!is_zone_device_page(a)) + if (!memdesc_is_zone_device(a->flags)) return true; return page_pgmap(a) == page_pgmap(b); } @@ -1232,7 +1232,7 @@ static inline bool zone_device_pages_have_same_pgmap(const struct page *a, extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else -static inline bool is_zone_device_page(const struct page *page) +static inline bool memdesc_is_zone_device(memdesc_flags_t mdf) { return false; } @@ -1247,9 +1247,14 @@ static inline struct dev_pagemap *page_pgmap(const struct page *page) } #endif +static inline bool is_zone_device_page(const struct page *page) +{ + return memdesc_is_zone_device(page->flags); +} + static inline bool folio_is_zone_device(const struct folio *folio) { - return is_zone_device_page(&folio->page); + return memdesc_is_zone_device(folio->flags); } static inline bool is_zone_movable_page(const struct page *page) From 7cfe9cafb6adebc13a246bebafcd69cd37add4e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:58 +0100 Subject: [PATCH 107/372] mm: reimplement folio_is_device_private() For callers of folio_is_device_private(), we save a folio->page->folio conversion. Callers of is_device_private_page() simply move the conversion of page->folio from the implementation of page_pgmap() to is_device_private_page(). 
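As a sketch of the cost model (hypothetical callers): a caller holding a
folio now tests it directly, while a page caller pays exactly one
page_folio() conversion inside the wrapper:

	bool f = folio_is_device_private(folio);	/* no conversion at all */
	bool p = is_device_private_page(page);		/* one page_folio() inside */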
Link: https://lkml.kernel.org/r/20250805172307.1302730-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memremap.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 4aa151914eab..5d18cb7a70e5 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -157,16 +157,17 @@ static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) return 1 << pgmap->vmemmap_shift; } +static inline bool folio_is_device_private(const struct folio *folio) +{ + return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && + folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_PRIVATE; +} + static inline bool is_device_private_page(const struct page *page) { return IS_ENABLED(CONFIG_DEVICE_PRIVATE) && - is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE; -} - -static inline bool folio_is_device_private(const struct folio *folio) -{ - return is_device_private_page(&folio->page); + folio_is_device_private(page_folio(page)); } static inline bool is_pci_p2pdma_page(const struct page *page) From bd0dbbb3fd902c7eea7eb166d91bda4530a8de96 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:22:59 +0100 Subject: [PATCH 108/372] mm: reimplement folio_is_device_coherent() For callers of folio_is_device_coherent(), we save a folio->page->folio conversion. Callers of is_device_coherent_page() simply move the conversion of page->folio from the implementation of page_pgmap() to is_device_coherent_page(). Link: https://lkml.kernel.org/r/20250805172307.1302730-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memremap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 5d18cb7a70e5..06d29794abe6 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -177,15 +177,15 @@ static inline bool is_pci_p2pdma_page(const struct page *page) page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; } -static inline bool is_device_coherent_page(const struct page *page) -{ - return is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_COHERENT; -} - static inline bool folio_is_device_coherent(const struct folio *folio) { - return is_device_coherent_page(&folio->page); + return folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_COHERENT; +} + +static inline bool is_device_coherent_page(const struct page *page) +{ + return folio_is_device_coherent(page_folio(page)); } static inline bool is_fsdax_page(const struct page *page) From c995ac3aa3747ec2a0373e5f319a22e0cb31d613 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:23:00 +0100 Subject: [PATCH 109/372] mm: reimplement folio_is_fsdax() For callers of folio_is_fsdax(), we save a folio->page->folio conversion. Callers of is_fsdax_page() simply move the conversion of page->folio from the implementation of page_pgmap() to is_fsdax_page(). 
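Same shape as the previous two patches; an illustrative caller with a
folio in hand (hypothetical bookkeeping) avoids the folio->page->folio
round-trip:

	if (folio_is_fsdax(folio))
		nr_fsdax++;	/* hypothetical counter */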
Link: https://lkml.kernel.org/r/20250805172307.1302730-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memremap.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 06d29794abe6..450d4bb6835c 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -188,15 +188,15 @@ static inline bool is_device_coherent_page(const struct page *page) return folio_is_device_coherent(page_folio(page)); } -static inline bool is_fsdax_page(const struct page *page) -{ - return is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX; -} - static inline bool folio_is_fsdax(const struct folio *folio) { - return is_fsdax_page(&folio->page); + return folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_FS_DAX; +} + +static inline bool is_fsdax_page(const struct page *page) +{ + return folio_is_fsdax(page_folio(page)); } #ifdef CONFIG_ZONE_DEVICE From 88df6ab2f34b60837ebdab64b2514f356d5ebb65 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 5 Aug 2025 18:23:01 +0100 Subject: [PATCH 110/372] mm: add folio_is_pci_p2pdma() Reimplement is_pci_p2pdma_page() in terms of folio_is_pci_p2pdma(). Moves the page_folio() call from inside page_pgmap() to is_pci_p2pdma_page(). This removes a page_folio() call from try_grab_folio() which already has a folio and can pass it in. Link: https://lkml.kernel.org/r/20250805172307.1302730-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Shakeel Butt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/memremap.h | 10 ++++++++-- mm/gup.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 450d4bb6835c..aa1b6aa877a0 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -170,11 +170,17 @@ static inline bool is_device_private_page(const struct page *page) folio_is_device_private(page_folio(page)); } +static inline bool folio_is_pci_p2pdma(const struct folio *folio) +{ + return IS_ENABLED(CONFIG_PCI_P2PDMA) && + folio_is_zone_device(folio) && + folio->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; +} + static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_PCI_P2PDMA) && - is_zone_device_page(page) && - page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA; + folio_is_pci_p2pdma(page_folio(page)); } static inline bool folio_is_device_coherent(const struct folio *folio) diff --git a/mm/gup.c b/mm/gup.c index 331d22bf7b2d..b2a78f029127 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -148,7 +148,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs, if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) return -ENOMEM; - if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page))) + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && folio_is_pci_p2pdma(folio))) return -EREMOTEIO; if (flags & FOLL_GET) From 4bd22a7ae5742df90abfdc0931a0f1bded65c1de Mon Sep 17 00:00:00 2001 From: liuqiqi Date: Tue, 12 Aug 2025 15:02:10 +0800 Subject: [PATCH 111/372] mm: fix duplicate accounting of free pages in should_reclaim_retry() In the zone_reclaimable_pages() function, if the page counts for NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_INACTIVE_ANON, and NR_ZONE_ACTIVE_ANON are all zero, the function returns the number of free pages as the result. 
In this case, when should_reclaim_retry() calculates reclaimable pages,
it will inadvertently double-count the free pages in its accounting.

static inline bool
should_reclaim_retry(gfp_t gfp_mask, unsigned order,
		     struct alloc_context *ac, int alloc_flags,
		     bool did_some_progress, int *no_progress_loops)
{
	...
		available = reclaimable = zone_reclaimable_pages(zone);
		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);

For example, a zone with no reclaimable pages but N free pages made both
terms above equal to N, so "available" became 2N. This may result in an
increase in the number of retries of __alloc_pages_slowpath(), causing
increased kswapd load.

Link: https://lkml.kernel.org/r/20250812070210.1624218-1-liuqiqi@kylinos.cn
Fixes: 6aaced5abd32 ("mm: vmscan: account for free pages to prevent infinite Loop in throttle_direct_reclaim()")
Signed-off-by: liuqiqi
Reviewed-by: Ye Liu
Cc: David Hildenbrand
Cc: Johannes Weiner
Cc: Lorenzo Stoakes
Cc: Michal Hocko
Cc: Qi Zheng
Cc: Shakeel Butt
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index e336577c4454..c88eb223ade2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -398,14 +398,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
 		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
 			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
-	/*
-	 * If there are no reclaimable file-backed or anonymous pages,
-	 * ensure zones with sufficient free pages are not skipped.
-	 * This prevents zones like DMA32 from being ignored in reclaim
-	 * scenarios where they can still help alleviate memory pressure.
-	 */
-	if (nr == 0)
-		nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
 	return nr;
 }
@@ -6495,7 +6488,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
 		return true;
 
 	for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) {
-		if (!zone_reclaimable_pages(zone))
+		if (!zone_reclaimable_pages(zone) && !zone_page_state_snapshot(zone, NR_FREE_PAGES))
 			continue;
 
 		pfmemalloc_reserve += min_wmark_pages(zone);

From f6a4a150f1ec2ef9a1241adc173d8a67ff19633f Mon Sep 17 00:00:00 2001
From: Sang-Heon Jeon
Date: Sun, 17 Aug 2025 11:13:48 +0900
Subject: [PATCH 112/372] mm/damon/tests/core-kunit: add damos_commit_filter
 test

Add a unit test to verify that damos_commit_filter() updates the
destination filter's values correctly.
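As a sketch of the property being verified (using only calls exercised
by the test below):

	struct damos_filter *src = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true);
	struct damos_filter *dst = damos_new_filter(DAMOS_FILTER_TYPE_ACTIVE, false, false);

	damos_commit_filter(dst, src);
	/* dst now carries src's type, matching and allow settings */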
Link: https://lkml.kernel.org/r/20250817021348.570692-1-ekffu200098@gmail.com
Signed-off-by: Sang-Heon Jeon
Reviewed-by: SeongJae Park
Cc: Honggyu Kim
Signed-off-by: Andrew Morton
---
 mm/damon/tests/core-kunit.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h
index dfedfff19940..5f5dc9db2e90 100644
--- a/mm/damon/tests/core-kunit.h
+++ b/mm/damon/tests/core-kunit.h
@@ -419,6 +419,22 @@ static void damos_test_new_filter(struct kunit *test)
 	damos_destroy_filter(filter);
 }
 
+static void damos_test_commit_filter(struct kunit *test)
+{
+	struct damos_filter *src_filter = damos_new_filter(
+			DAMOS_FILTER_TYPE_ANON, true, true);
+	struct damos_filter *dst_filter = damos_new_filter(
+			DAMOS_FILTER_TYPE_ACTIVE, false, false);
+
+	damos_commit_filter(dst_filter, src_filter);
+	KUNIT_EXPECT_EQ(test, dst_filter->type, src_filter->type);
+	KUNIT_EXPECT_EQ(test, dst_filter->matching, src_filter->matching);
+	KUNIT_EXPECT_EQ(test, dst_filter->allow, src_filter->allow);
+
+	damos_destroy_filter(src_filter);
+	damos_destroy_filter(dst_filter);
+}
+
 static void damos_test_filter_out(struct kunit *test)
 {
 	struct damon_target *t;
@@ -594,6 +610,7 @@ static struct kunit_case damon_test_cases[] = {
 	KUNIT_CASE(damon_test_set_attrs),
 	KUNIT_CASE(damon_test_moving_sum),
 	KUNIT_CASE(damos_test_new_filter),
+	KUNIT_CASE(damos_test_commit_filter),
 	KUNIT_CASE(damos_test_filter_out),
 	KUNIT_CASE(damon_test_feed_loop_next_input),
 	KUNIT_CASE(damon_test_set_filters_default_reject),

From c4408277c0d773b7601def781702f7215f894ca7 Mon Sep 17 00:00:00 2001
From: Chi Zhiling
Date: Mon, 28 Jul 2025 16:39:51 +0800
Subject: [PATCH 113/372] mm/filemap: do not use is_partially_uptodate for
 entire folio

Patch series "Tiny optimization for large read operations".

This series contains two patches:

1. Skip calling is_partially_uptodate for an entire folio to save time.
   I have reviewed the mpage and iomap implementations and didn't spot
   any issues, but this change likely needs more thorough review.

2. Skip calling filemap_update_page if there are ready folios in the
   batch. This might save a few milliseconds in practice, but I didn't
   observe measurable improvements in my tests.

This patch (of 2):

When a folio is marked as non-uptodate, it means the folio contains some
non-uptodate data. Therefore, calling is_partially_uptodate() to recheck
the entire folio is redundant.

If all data in a folio is actually up-to-date but the folio lacks the
uptodate flag, it will still be treated as non-uptodate in many other
places. Thus, there should be no special-case handling for filemap.
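A worked example of the shortcut (illustrative numbers): for a read
covering a whole 64KiB folio, pos is 0 and count is 65536, equal to
folio_size(folio), so the check added below returns false immediately
instead of asking ->is_partially_uptodate() a question whose answer is
already known:

	/* whole-folio range on a !uptodate folio: cannot be partially enough */
	if (pos == 0 && count >= folio_size(folio))
		return false;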
Link: https://lkml.kernel.org/r/20250728083952.75518-1-chizhiling@163.com
Link: https://lkml.kernel.org/r/20250728083952.75518-2-chizhiling@163.com
Signed-off-by: Chi Zhiling
Cc: Matthew Wilcox (Oracle)
Cc: Jan Kara
Cc: Christoph Hellwig
Signed-off-by: Andrew Morton
---
 mm/filemap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/filemap.c b/mm/filemap.c
index f3a6c24897f4..d6f95513241f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2447,6 +2447,9 @@ static bool filemap_range_uptodate(struct address_space *mapping,
 		pos -= folio_pos(folio);
 	}
 
+	if (pos == 0 && count >= folio_size(folio))
+		return false;
+
 	return mapping->a_ops->is_partially_uptodate(folio, pos, count);
 }

From 35224da7e30b08cffa7aa5d893a91b7332b61c2a Mon Sep 17 00:00:00 2001
From: Chi Zhiling
Date: Mon, 28 Jul 2025 16:39:52 +0800
Subject: [PATCH 114/372] mm/filemap: skip non-uptodate folio if there are
 available folios

When reading data exceeding the maximum IO size, the operation is split
into multiple IO requests, but the data isn't immediately copied to
userspace after each IO completion.

For example, when reading 2560k of data from a device with a 1280k
maximum IO size, the following sequence occurs:
1. read 1280k
2. copy 41 pages and issue readahead for the next 1280k
3. copy 31 pages to user buffer
4. wait for the next 1280k
5. copy 8 pages to user buffer
6. copy 20 folios (64k) to user buffer

The 8 pages in step 5 are copied after the second 1280k completes
(step 4) due to waiting for a non-uptodate folio in filemap_update_page.
We can copy the 8 pages before the second 1280k completes (step 4) to
reduce the latency of this read operation.

After applying the patch, these 8 pages will be copied before the next
IO completes:
1. read 1280k
2. copy 41 pages and issue readahead for the next 1280k
3. copy 31 pages to user buffer
4. copy 8 pages to user buffer
5. wait for the next 1280k
6. copy 20 folios (64k) to user buffer

This patch drops a setting of IOCB_NOWAIT for AIO, which is fine because
filemap_read will set it again for AIO.

The final solution provided by Matthew Wilcox:
Link: https://lore.kernel.org/linux-fsdevel/aIDy076Sxt544qja@casper.infradead.org/

Link: https://lkml.kernel.org/r/20250728083952.75518-3-chizhiling@163.com
Signed-off-by: Chi Zhiling
Suggested-by: Matthew Wilcox (Oracle)
Cc: Christoph Hellwig
Cc: Jan Kara
Signed-off-by: Andrew Morton
---
 mm/filemap.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index d6f95513241f..6e954156bb77 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2622,9 +2622,10 @@ retry:
 			goto err;
 		}
 		if (!folio_test_uptodate(folio)) {
-			if ((iocb->ki_flags & IOCB_WAITQ) &&
-			    folio_batch_count(fbatch) > 1)
-				iocb->ki_flags |= IOCB_NOWAIT;
+			if (folio_batch_count(fbatch) > 1) {
+				err = -EAGAIN;
+				goto err;
+			}
 			err = filemap_update_page(iocb, mapping, count, folio,
 						  need_uptodate);
 			if (err)

From b322e88b3d553e85b4e15779491c70022783faa4 Mon Sep 17 00:00:00 2001
From: Li RongQing
Date: Thu, 14 Aug 2025 18:23:33 +0800
Subject: [PATCH 115/372] mm/hugetlb: early exit from hugetlb_pages_alloc_boot()
 when max_huge_pages=0

Optimize hugetlb_pages_alloc_boot() to return immediately when
max_huge_pages is 0, avoiding unnecessary CPU cycles and the below log
message when hugepages aren't configured on the kernel command line.
[ 3.702280] HugeTLB: allocation took 0ms with hugepage_allocation_threads=32

Link: https://lkml.kernel.org/r/20250814102333.4428-1-lirongqing@baidu.com
Signed-off-by: Li RongQing
Reviewed-by: Dev Jain
Tested-by: Dev Jain
Reviewed-by: Jane Chu
Acked-by: David Hildenbrand
Cc: Muchun Song
Cc: Oscar Salvador
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 753f99b4c718..514fab5a20ef 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3654,6 +3654,9 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 		return;
 	}
 
+	if (!h->max_huge_pages)
+		return;
+
 	/* do node specific alloc */
 	if (hugetlb_hstate_alloc_pages_specific_nodes(h))
 		return;

From 9eff16bd3a4b97b3e13337eddaa12abe6e0b2a78 Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Mon, 18 Aug 2025 14:46:18 -0400
Subject: [PATCH 116/372] mm/huge_memory: add new_order and offset to
 split_huge_pages*() pr_debug

Patch series "Better split_huge_page_test result check", v5.

This patchset uses kpageflags to get after-split folio orders for a
better split_huge_page_test result check [1]. The added
gather_after_split_folio_orders() scans through a VPN range and collects
the numbers of folios at different orders.
check_after_split_folio_orders() compares the result of
gather_after_split_folio_orders() to a given list of numbers of
different orders.

This patchset also adds the new order and the in-folio offset to the
split huge page debugfs's pr_debug()s.

This patch (of 5):

They are useful information for debugging split huge page tests.

Link: https://lkml.kernel.org/r/20250818184622.1521620-1-ziy@nvidia.com
Link: https://lkml.kernel.org/r/20250818184622.1521620-2-ziy@nvidia.com
Signed-off-by: Zi Yan
Reviewed-by: Wei Yang
Reviewed-by: Donet Tom
Reviewed-by: wang lian
Reviewed-by: Baolin Wang
Reviewed-by: Barry Song
Acked-by: David Hildenbrand
Cc: Dev Jain
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: Mariano Pache
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: Ryan Roberts
Cc: Shuah Khan
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/huge_memory.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index aac5f0a2cb54..2a47cd3bb649 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -4320,8 +4320,8 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 		goto out;
 	}
 
-	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
-		 pid, vaddr_start, vaddr_end);
+	pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+		 pid, vaddr_start, vaddr_end, new_order, in_folio_offset);
 
 	mmap_read_lock(mm);
 	/*
@@ -4431,8 +4431,8 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
 	if (IS_ERR(candidate))
 		goto out;
 
-	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
-		 file_path, off_start, off_end);
+	pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx], new_order: %u, in_folio_offset: %ld\n",
+		 file_path, off_start, off_end, new_order, in_folio_offset);
 
 	mapping = candidate->f_mapping;
 	min_order = mapping_min_folio_order(mapping);

From 72a07c03909b430b454d7557d2b8fe08b01da42d Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Mon, 18 Aug 2025 14:46:19 -0400
Subject: [PATCH 117/372] selftests/mm: mark all functions static in
 split_huge_page_test.c

All functions are only used within the file.
Link: https://lkml.kernel.org/r/20250818184622.1521620-3-ziy@nvidia.com
Signed-off-by: Zi Yan
Reviewed-by: Wei Yang
Reviewed-by: wang lian
Acked-by: David Hildenbrand
Cc: Baolin Wang
Cc: Barry Song
Cc: Dev Jain
Cc: Donet Tom
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: Mariano Pache
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: Ryan Roberts
Cc: Shuah Khan
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 .../selftests/mm/split_huge_page_test.c      | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 4e3408263a77..001a6ff24bf7 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -37,7 +37,7 @@ uint64_t pmd_pagesize;
 #define PFN_MASK     ((1UL<<55)-1)
 #define KPF_THP      (1UL<<22)
 
-int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
+static int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file)
 {
 	uint64_t paddr;
 	uint64_t page_flags;
@@ -135,7 +135,7 @@ static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, int nr_hp
 		      rss_anon_before, rss_anon_after);
 }
 
-void split_pmd_zero_pages(void)
+static void split_pmd_zero_pages(void)
 {
 	char *one_page;
 	int nr_hpages = 4;
@@ -147,7 +147,7 @@ void split_pmd_zero_pages(void)
 	free(one_page);
 }
 
-void split_pmd_thp_to_order(int order)
+static void split_pmd_thp_to_order(int order)
 {
 	char *one_page;
 	size_t len = 4 * pmd_pagesize;
@@ -181,7 +181,7 @@ void split_pmd_thp_to_order(int order)
 	free(one_page);
 }
 
-void split_pte_mapped_thp(void)
+static void split_pte_mapped_thp(void)
 {
 	char *one_page, *pte_mapped, *pte_mapped2;
 	size_t len = 4 * pmd_pagesize;
@@ -264,7 +264,7 @@ void split_pte_mapped_thp(void)
 	close(kpageflags_fd);
 }
 
-void split_file_backed_thp(int order)
+static void split_file_backed_thp(int order)
 {
 	int status;
 	int fd;
@@ -366,7 +366,7 @@ out:
 	ksft_exit_fail_msg("Error occurred\n");
 }
 
-bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template,
+static bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template,
 		    const char **thp_fs_loc)
 {
 	if (xfs_path) {
@@ -382,7 +382,7 @@ bool prepare_thp_fs(const char *xfs_path, char *thp_fs_template,
 	return true;
 }
 
-void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp)
+static void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp)
 {
 	int status;
 
@@ -395,8 +395,8 @@ void cleanup_thp_fs(const char *thp_fs_loc, bool created_tmp)
 		      strerror(errno));
 }
 
-int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size, int *fd,
-		char **addr)
+static int create_pagecache_thp_and_fd(const char *testfile, size_t fd_size,
+		int *fd, char **addr)
 {
 	size_t i;
 	unsigned char buf[1024];
@@ -462,8 +462,8 @@ err_out_unlink:
 	return -1;
 }
 
-void split_thp_in_pagecache_to_order_at(size_t fd_size, const char *fs_loc,
-		int order, int offset)
+static void split_thp_in_pagecache_to_order_at(size_t fd_size,
+		const char *fs_loc, int order, int offset)
 {
 	int fd;
 	char *addr;

From bd66448f2a0e8da0c7ff01ce2e6dbc2766cf2692 Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Mon, 18 Aug 2025 14:46:20 -0400
Subject: [PATCH 118/372] selftests/mm: reimplement is_backed_by_thp() with
 more precise check

and rename it to is_backed_by_folio().

is_backed_by_folio() checks if the given vaddr is backed by a folio with
a given order. It does so by:
1. getting the pfn of the vaddr;
2. checking kpageflags of the pfn;
if order is greater than 0:
3.
checking kpageflags of the head pfn; 4. checking kpageflags of all tail pfns. pmd_order is added to split_huge_page_test.c and replaces max_order. [ziy@nvidia.com: reduce code duplication, per David] Link: https://lkml.kernel.org/r/F54782D6-65A3-4D35-AE03-8ADE636EE258@nvidia.com Link: https://lkml.kernel.org/r/20250818184622.1521620-4-ziy@nvidia.com Signed-off-by: Zi Yan Reviewed-by: Wei Yang Reviewed-by: wang lian Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Donet Tom Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- .../selftests/mm/split_huge_page_test.c | 89 ++++++++++++++----- tools/testing/selftests/mm/vm_util.c | 13 +++ tools/testing/selftests/mm/vm_util.h | 4 + 3 files changed, 82 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 001a6ff24bf7..7be06dae663e 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -25,6 +25,7 @@ uint64_t pagesize; unsigned int pageshift; uint64_t pmd_pagesize; +unsigned int pmd_order; #define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" #define SMAP_PATH "/proc/self/smaps" @@ -34,26 +35,67 @@ uint64_t pmd_pagesize; #define PID_FMT_OFFSET "%d,0x%lx,0x%lx,%d,%d" #define PATH_FMT "%s,0x%lx,0x%lx,%d" -#define PFN_MASK ((1UL<<55)-1) -#define KPF_THP (1UL<<22) - -static int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) +static bool is_backed_by_folio(char *vaddr, int order, int pagemap_fd, + int kpageflags_fd) { - uint64_t paddr; - uint64_t page_flags; + const unsigned long nr_pages = 1UL << order; + unsigned long pfn_head; + uint64_t pfn_flags; + unsigned long pfn; + unsigned long i; - if (pagemap_file) { - pread(pagemap_file, &paddr, sizeof(paddr), - ((long)vaddr >> pageshift) * sizeof(paddr)); + pfn = pagemap_get_pfn(pagemap_fd, vaddr); - if (kpageflags_file) { - pread(kpageflags_file, &page_flags, sizeof(page_flags), - (paddr & PFN_MASK) * sizeof(page_flags)); + /* non present page */ + if (pfn == -1UL) + return false; - return !!(page_flags & KPF_THP); - } + if (pageflags_get(pfn, kpageflags_fd, &pfn_flags)) + goto fail; + + /* check for order-0 pages */ + if (!order) { + if (pfn_flags & (KPF_THP | KPF_COMPOUND_HEAD | KPF_COMPOUND_TAIL)) + return false; + return true; } - return 0; + + /* non THP folio */ + if (!(pfn_flags & KPF_THP)) + return false; + + pfn_head = pfn & ~(nr_pages - 1); + + if (pageflags_get(pfn_head, kpageflags_fd, &pfn_flags)) + goto fail; + + /* head PFN has no compound_head flag set */ + if (!(pfn_flags & (KPF_THP | KPF_COMPOUND_HEAD))) + return false; + + /* check all tail PFN flags */ + for (i = 1; i < nr_pages; i++) { + if (pageflags_get(pfn_head + i, kpageflags_fd, &pfn_flags)) + goto fail; + if (!(pfn_flags & (KPF_THP | KPF_COMPOUND_TAIL))) + return false; + } + + /* + * check the PFN after this folio, but if its flags cannot be obtained, + * assume this folio has the expected order + */ + if (pageflags_get(pfn_head + nr_pages, kpageflags_fd, &pfn_flags)) + return true; + + /* this folio is bigger than the given order */ + if (pfn_flags & (KPF_THP | KPF_COMPOUND_TAIL)) + return false; + + return true; +fail: + ksft_exit_fail_msg("Failed to get folio info\n"); + return false; } static void write_file(const char *path, const char *buf, size_t buflen) @@ 
-234,7 +276,7 @@ static void split_pte_mapped_thp(void) thp_size = 0; for (i = 0; i < pagesize * 4; i++) if (i % pagesize == 0 && - is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + is_backed_by_folio(&pte_mapped[i], pmd_order, pagemap_fd, kpageflags_fd)) thp_size++; if (thp_size != 4) @@ -251,7 +293,7 @@ static void split_pte_mapped_thp(void) ksft_exit_fail_msg("%ld byte corrupted\n", i); if (i % pagesize == 0 && - is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + !is_backed_by_folio(&pte_mapped[i], 0, pagemap_fd, kpageflags_fd)) thp_size++; } @@ -525,7 +567,6 @@ int main(int argc, char **argv) const char *fs_loc; bool created_tmp; int offset; - unsigned int max_order; unsigned int nr_pages; unsigned int tests; @@ -546,28 +587,28 @@ int main(int argc, char **argv) ksft_exit_fail_msg("Reading PMD pagesize failed\n"); nr_pages = pmd_pagesize / pagesize; - max_order = sz2ord(pmd_pagesize, pagesize); - tests = 2 + (max_order - 1) + (2 * max_order) + (max_order - 1) * 4 + 2; + pmd_order = sz2ord(pmd_pagesize, pagesize); + tests = 2 + (pmd_order - 1) + (2 * pmd_order) + (pmd_order - 1) * 4 + 2; ksft_set_plan(tests); fd_size = 2 * pmd_pagesize; split_pmd_zero_pages(); - for (i = 0; i < max_order; i++) + for (i = 0; i < pmd_order; i++) if (i != 1) split_pmd_thp_to_order(i); split_pte_mapped_thp(); - for (i = 0; i < max_order; i++) + for (i = 0; i < pmd_order; i++) split_file_backed_thp(i); created_tmp = prepare_thp_fs(optional_xfs_path, fs_loc_template, &fs_loc); - for (i = max_order - 1; i >= 0; i--) + for (i = pmd_order - 1; i >= 0; i--) split_thp_in_pagecache_to_order_at(fd_size, fs_loc, i, -1); - for (i = 0; i < max_order; i++) + for (i = 0; i < pmd_order; i++) for (offset = 0; offset < nr_pages; offset += MAX(nr_pages / 4, 1 << i)) diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 6a239aa413e2..741fc129313d 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -338,6 +338,19 @@ int detect_hugetlb_page_sizes(size_t sizes[], int max) return count; } +int pageflags_get(unsigned long pfn, int kpageflags_fd, uint64_t *flags) +{ + size_t count; + + count = pread(kpageflags_fd, flags, sizeof(*flags), + pfn * sizeof(*flags)); + + if (count != sizeof(*flags)) + return -1; + + return 0; +} + /* If `ioctls' non-NULL, the allowed ioctls will be returned into the var */ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, bool miss, bool wp, bool minor, uint64_t *ioctls) diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 3da56feeb944..ab8722f482ae 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -18,6 +18,9 @@ #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) +#define KPF_COMPOUND_HEAD BIT_ULL(15) +#define KPF_COMPOUND_TAIL BIT_ULL(16) +#define KPF_THP BIT_ULL(22) /* * Ignore the checkpatch warning, we must read from x but don't want to do * anything with it in order to trigger a read page fault. 
We therefore must use @@ -85,6 +88,7 @@ bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); int64_t allocate_transhuge(void *ptr, int pagemap_fd); unsigned long default_huge_page_size(void); int detect_hugetlb_page_sizes(size_t sizes[], int max); +int pageflags_get(unsigned long pfn, int kpageflags_fd, uint64_t *flags); int uffd_register(int uffd, void *addr, uint64_t len, bool miss, bool wp, bool minor); From fca418e59afafbad3e3656905d3cee241cc0b7a2 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 18 Aug 2025 14:46:21 -0400 Subject: [PATCH 119/372] selftests/mm: add check_after_split_folio_orders() helper The helper gathers folio order statistics for the folios within a virtual address range and checks them against a given order list. It aims to provide a more precise folio order check instead of just checking the existence of PMD folios. The helper will be used in the upcoming commit. Link: https://lkml.kernel.org/r/20250818184622.1521620-5-ziy@nvidia.com Signed-off-by: Zi Yan Tested-by: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Donet Tom Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: wang lian Cc: Wei Yang Signed-off-by: Andrew Morton --- .../selftests/mm/split_huge_page_test.c | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 7be06dae663e..cd90e8ed8dd1 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -98,6 +98,158 @@ fail: return false; } +static int vaddr_pageflags_get(char *vaddr, int pagemap_fd, int kpageflags_fd, + uint64_t *flags) +{ + unsigned long pfn; + + pfn = pagemap_get_pfn(pagemap_fd, vaddr); + + /* non-present PFN */ + if (pfn == -1UL) + return 1; + + if (pageflags_get(pfn, kpageflags_fd, flags)) + return -1; + + return 0; +} + +/* + * gather_after_split_folio_orders - scan through [vaddr_start, len) and record + * folio orders + * + * @vaddr_start: start vaddr + * @len: range length + * @pagemap_fd: file descriptor to /proc/<pid>/pagemap + * @kpageflags_fd: file descriptor to /proc/kpageflags + * @orders: output folio order array + * @nr_orders: folio order array size + * + * gather_after_split_folio_orders() scans through [vaddr_start, len), checks + * all folios within the range, and records their orders. All order-0 pages will + * be recorded. Non-present vaddr is skipped. + * + * NOTE: the function is used to check folio orders after a split is performed, + * so it assumes [vaddr_start, len) fully maps to after-split folios within that + * range.
+ * + * Return: 0 - no error, -1 - unhandled cases + */ +static int gather_after_split_folio_orders(char *vaddr_start, size_t len, + int pagemap_fd, int kpageflags_fd, int orders[], int nr_orders) +{ + uint64_t page_flags = 0; + int cur_order = -1; + char *vaddr; + + if (pagemap_fd == -1 || kpageflags_fd == -1) + return -1; + if (!orders) + return -1; + if (nr_orders <= 0) + return -1; + + for (vaddr = vaddr_start; vaddr < vaddr_start + len;) { + char *next_folio_vaddr; + int status; + + status = vaddr_pageflags_get(vaddr, pagemap_fd, kpageflags_fd, + &page_flags); + if (status < 0) + return -1; + + /* skip non present vaddr */ + if (status == 1) { + vaddr += psize(); + continue; + } + + /* all order-0 pages with possible false postive (non folio) */ + if (!(page_flags & (KPF_COMPOUND_HEAD | KPF_COMPOUND_TAIL))) { + orders[0]++; + vaddr += psize(); + continue; + } + + /* skip non thp compound pages */ + if (!(page_flags & KPF_THP)) { + vaddr += psize(); + continue; + } + + /* vpn points to part of a THP at this point */ + if (page_flags & KPF_COMPOUND_HEAD) + cur_order = 1; + else { + vaddr += psize(); + continue; + } + + next_folio_vaddr = vaddr + (1UL << (cur_order + pshift())); + + if (next_folio_vaddr >= vaddr_start + len) + break; + + while ((status = vaddr_pageflags_get(next_folio_vaddr, + pagemap_fd, kpageflags_fd, + &page_flags)) >= 0) { + /* + * non present vaddr, next compound head page, or + * order-0 page + */ + if (status == 1 || + (page_flags & KPF_COMPOUND_HEAD) || + !(page_flags & (KPF_COMPOUND_HEAD | KPF_COMPOUND_TAIL))) { + if (cur_order < nr_orders) { + orders[cur_order]++; + cur_order = -1; + vaddr = next_folio_vaddr; + } + break; + } + + cur_order++; + next_folio_vaddr = vaddr + (1UL << (cur_order + pshift())); + } + + if (status < 0) + return status; + } + if (cur_order > 0 && cur_order < nr_orders) + orders[cur_order]++; + return 0; +} + +static int check_after_split_folio_orders(char *vaddr_start, size_t len, + int pagemap_fd, int kpageflags_fd, int orders[], int nr_orders) +{ + int *vaddr_orders; + int status; + int i; + + vaddr_orders = (int *)malloc(sizeof(int) * nr_orders); + + if (!vaddr_orders) + ksft_exit_fail_msg("Cannot allocate memory for vaddr_orders"); + + memset(vaddr_orders, 0, sizeof(int) * nr_orders); + status = gather_after_split_folio_orders(vaddr_start, len, pagemap_fd, + kpageflags_fd, vaddr_orders, nr_orders); + if (status) + ksft_exit_fail_msg("gather folio info failed\n"); + + for (i = 0; i < nr_orders; i++) + if (vaddr_orders[i] != orders[i]) { + ksft_print_msg("order %d: expected: %d got %d\n", i, + orders[i], vaddr_orders[i]); + status = -1; + } + + free(vaddr_orders); + return status; +} + static void write_file(const char *path, const char *buf, size_t buflen) { int fd; From c55ed758e04717564347c40688d40bf231cd7964 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 18 Aug 2025 14:46:22 -0400 Subject: [PATCH 120/372] selftests/mm: check after-split folio orders in split_huge_page_test Instead of just checking the existence of PMD folios before and after folio split tests, use check_folio_orders() to check after-split folio orders. The split ranges in split_thp_in_pagecache_to_order_at() are changed to [addr, addr + pagesize) for every pmd_pagesize. It prevents folios within the range being split multiple times due to debugfs split function always perform splits with a pagesize step for a given range. The following tests are not changed: 1. split_pte_mapped_thp: the test already uses kpageflags to check; 2. 
split_file_backed_thp: no vaddr available. Link: https://lkml.kernel.org/r/20250818184622.1521620-6-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Donet Tom Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: wang lian Cc: Wei Yang Signed-off-by: Andrew Morton --- .../selftests/mm/split_huge_page_test.c | 88 ++++++++++++++----- 1 file changed, 64 insertions(+), 24 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index cd90e8ed8dd1..10ae65ea032f 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -26,6 +26,7 @@ uint64_t pagesize; unsigned int pageshift; uint64_t pmd_pagesize; unsigned int pmd_order; +int *expected_orders; #define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" #define SMAP_PATH "/proc/self/smaps" @@ -35,6 +36,11 @@ unsigned int pmd_order; #define PID_FMT_OFFSET "%d,0x%lx,0x%lx,%d,%d" #define PATH_FMT "%s,0x%lx,0x%lx,%d" +const char *pagemap_proc = "/proc/self/pagemap"; +const char *kpageflags_proc = "/proc/kpageflags"; +int pagemap_fd; +int kpageflags_fd; + static bool is_backed_by_folio(char *vaddr, int order, int pagemap_fd, int kpageflags_fd) { @@ -367,6 +373,13 @@ static void split_pmd_thp_to_order(int order) if (one_page[i] != (char)i) ksft_exit_fail_msg("%ld byte corrupted\n", i); + memset(expected_orders, 0, sizeof(int) * (pmd_order + 1)); + expected_orders[order] = 4 << (pmd_order - order); + + if (check_after_split_folio_orders(one_page, len, pagemap_fd, + kpageflags_fd, expected_orders, + (pmd_order + 1))) + ksft_exit_fail_msg("Unexpected THP split\n"); if (!check_huge_anon(one_page, 0, pmd_pagesize)) ksft_exit_fail_msg("Still AnonHugePages not split\n"); @@ -381,22 +394,6 @@ static void split_pte_mapped_thp(void) size_t len = 4 * pmd_pagesize; uint64_t thp_size; size_t i; - const char *pagemap_template = "/proc/%d/pagemap"; - const char *kpageflags_proc = "/proc/kpageflags"; - char pagemap_proc[255]; - int pagemap_fd; - int kpageflags_fd; - - if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) - ksft_exit_fail_msg("get pagemap proc error: %s\n", strerror(errno)); - - pagemap_fd = open(pagemap_proc, O_RDONLY); - if (pagemap_fd == -1) - ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno)); - - kpageflags_fd = open(kpageflags_proc, O_RDONLY); - if (kpageflags_fd == -1) - ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno)); one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); @@ -454,8 +451,6 @@ static void split_pte_mapped_thp(void) ksft_test_result_pass("Split PTE-mapped huge pages successful\n"); munmap(one_page, len); - close(pagemap_fd); - close(kpageflags_fd); } static void split_file_backed_thp(int order) @@ -660,6 +655,7 @@ static void split_thp_in_pagecache_to_order_at(size_t fd_size, const char *fs_loc, int order, int offset) { int fd; + char *split_addr; char *addr; size_t i; char testfile[INPUT_MAX]; @@ -673,14 +669,33 @@ static void split_thp_in_pagecache_to_order_at(size_t fd_size, err = create_pagecache_thp_and_fd(testfile, fd_size, &fd, &addr); if (err) return; + err = 0; - if (offset == -1) - write_debugfs(PID_FMT, getpid(), (uint64_t)addr, - (uint64_t)addr + fd_size, order); - else - write_debugfs(PID_FMT_OFFSET, getpid(), (uint64_t)addr, - 
(uint64_t)addr + fd_size, order, offset); + memset(expected_orders, 0, sizeof(int) * (pmd_order + 1)); + /* + * use [split_addr, split_addr + pagesize) range to split THPs, since + * the debugfs function always split a range with pagesize step and + * providing a full [addr, addr + fd_size) range can trigger multiple + * splits, complicating after-split result checking. + */ + if (offset == -1) { + for (split_addr = addr; split_addr < addr + fd_size; split_addr += pmd_pagesize) + write_debugfs(PID_FMT, getpid(), (uint64_t)split_addr, + (uint64_t)split_addr + pagesize, order); + + expected_orders[order] = fd_size / (pagesize << order); + } else { + int times = fd_size / pmd_pagesize; + + for (split_addr = addr; split_addr < addr + fd_size; split_addr += pmd_pagesize) + write_debugfs(PID_FMT_OFFSET, getpid(), (uint64_t)split_addr, + (uint64_t)split_addr + pagesize, order, offset); + + for (i = order + 1; i < pmd_order; i++) + expected_orders[i] = times; + expected_orders[order] = 2 * times; + } for (i = 0; i < fd_size; i++) if (*(addr + i) != (char)i) { @@ -689,6 +704,14 @@ static void split_thp_in_pagecache_to_order_at(size_t fd_size, goto out; } + if (check_after_split_folio_orders(addr, fd_size, pagemap_fd, + kpageflags_fd, expected_orders, + (pmd_order + 1))) { + ksft_print_msg("Unexpected THP split\n"); + err = 1; + goto out; + } + if (!check_huge_file(addr, 0, pmd_pagesize)) { ksft_print_msg("Still FilePmdMapped not split\n"); err = EXIT_FAILURE; @@ -740,9 +763,22 @@ int main(int argc, char **argv) nr_pages = pmd_pagesize / pagesize; pmd_order = sz2ord(pmd_pagesize, pagesize); + + expected_orders = (int *)malloc(sizeof(int) * (pmd_order + 1)); + if (!expected_orders) + ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno)); + tests = 2 + (pmd_order - 1) + (2 * pmd_order) + (pmd_order - 1) * 4 + 2; ksft_set_plan(tests); + pagemap_fd = open(pagemap_proc, O_RDONLY); + if (pagemap_fd == -1) + ksft_exit_fail_msg("read pagemap: %s\n", strerror(errno)); + + kpageflags_fd = open(kpageflags_proc, O_RDONLY); + if (kpageflags_fd == -1) + ksft_exit_fail_msg("read kpageflags: %s\n", strerror(errno)); + fd_size = 2 * pmd_pagesize; split_pmd_zero_pages(); @@ -767,6 +803,10 @@ int main(int argc, char **argv) split_thp_in_pagecache_to_order_at(fd_size, fs_loc, i, offset); cleanup_thp_fs(fs_loc, created_tmp); + close(pagemap_fd); + close(kpageflags_fd); + free(expected_orders); + ksft_finished(); return 0; From 63ec0c26b682ca235953490dfa82b8fa4d4ab4a0 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Aug 2025 14:18:03 +0800 Subject: [PATCH 121/372] tmpfs: preserve SB_I_VERSION on remount Now tmpfs enables i_version by default and tmpfs does not modify it. But SB_I_VERSION can also be modified via sb_flags, and reconfigure_super() always overwrites the existing flags with the latest ones. This means that if tmpfs is remounted without specifying iversion, the default i_version will be unexpectedly disabled. To ensure iversion remains enabled, SB_I_VERSION is now always set for fc->sb_flags in shmem_init_fs_context(), instead of for sb->s_flags in shmem_fill_super(). 
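To make the failure mode concrete, here is a simplified sketch of the remount flag recomputation (modeled on reconfigure_super() in fs/super.c; treat the exact expression as illustrative, not authoritative):

    /*
     * On remount, flags covered by fc->sb_flags_mask are taken from
     * the new context; bits in the mask that are not set in
     * fc->sb_flags get cleared:
     */
    sb->s_flags = (sb->s_flags & ~fc->sb_flags_mask) |
                  (fc->sb_flags & fc->sb_flags_mask);

Because the remount mask covers SB_I_VERSION, a remount that does not pass "iversion" clears a bit that was only ever set on sb->s_flags by shmem_fill_super(); setting it in fc->sb_flags from shmem_init_fs_context() survives the recomputation.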
Link: https://lkml.kernel.org/r/20250819061803.1496443-1-libaokun@huaweicloud.com Fixes: 36f05cab0a2c ("tmpfs: add support for an i_version counter") Signed-off-by: Baokun Li Reviewed-by: Baolin Wang Tested-by: Baolin Wang Reviewed-by: Jeff Layton Acked-by: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index d945de3a7f0e..13cc51df3893 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5081,7 +5081,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC | SB_I_VERSION; + sb->s_flags |= SB_NOSEC; #if IS_ENABLED(CONFIG_UNICODE) if (!ctx->encoding && ctx->strict_encoding) { @@ -5385,6 +5385,9 @@ int shmem_init_fs_context(struct fs_context *fc) fc->fs_private = ctx; fc->ops = &shmem_fs_context_ops; +#ifdef CONFIG_TMPFS + fc->sb_flags |= SB_I_VERSION; +#endif return 0; } From b27f292de6b1a39e9fb0f83c79dfe902a9ea86c3 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 19 Aug 2025 08:00:46 +0000 Subject: [PATCH 122/372] selftests/mm: put general ksm operation into vm_util Patch series "test that rmap behaves as expected", v4. As David suggested, currently we don't have a high level test case to verify the behavior of rmap. This patch set introduces verification of rmap by migration. Patch 1 is a preparation to move ksm related operations into vm_util. Patch 2 is the new test case for rmap. Currently it covers the following four scenarios: * anonymous page * shmem page * pagecache page * ksm page This patch (of 2): There are some general ksm operations that could be used by other related test cases. Put them into vm_util for common use. This is a preparation patch for later use. Link: https://lkml.kernel.org/r/20250819080047.10063-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20250819080047.10063-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Suggested-by: David Hildenbrand Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Rik van Riel Cc: Liam R.
Howlett Cc: Vlastimil Babka Cc: Harry Yoo Signed-off-by: Andrew Morton --- .../selftests/mm/ksm_functional_tests.c | 142 +++--------------- tools/testing/selftests/mm/vm_util.c | 123 +++++++++++++++ tools/testing/selftests/mm/vm_util.h | 7 + 3 files changed, 154 insertions(+), 118 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 534aa405cac7..712f43c87736 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -38,11 +38,6 @@ enum ksm_merge_mode { }; static int mem_fd; -static int ksm_fd; -static int ksm_full_scans_fd; -static int proc_self_ksm_stat_fd; -static int proc_self_ksm_merging_pages_fd; -static int ksm_use_zero_pages_fd; static int pagemap_fd; static size_t pagesize; @@ -75,88 +70,6 @@ static bool range_maps_duplicates(char *addr, unsigned long size) return false; } -static long get_my_ksm_zero_pages(void) -{ - char buf[200]; - char *substr_ksm_zero; - size_t value_pos; - ssize_t read_size; - unsigned long my_ksm_zero_pages; - - if (!proc_self_ksm_stat_fd) - return 0; - - read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0); - if (read_size < 0) - return -errno; - - buf[read_size] = 0; - - substr_ksm_zero = strstr(buf, "ksm_zero_pages"); - if (!substr_ksm_zero) - return 0; - - value_pos = strcspn(substr_ksm_zero, "0123456789"); - my_ksm_zero_pages = strtol(substr_ksm_zero + value_pos, NULL, 10); - - return my_ksm_zero_pages; -} - -static long get_my_merging_pages(void) -{ - char buf[10]; - ssize_t ret; - - if (proc_self_ksm_merging_pages_fd < 0) - return proc_self_ksm_merging_pages_fd; - - ret = pread(proc_self_ksm_merging_pages_fd, buf, sizeof(buf) - 1, 0); - if (ret <= 0) - return -errno; - buf[ret] = 0; - - return strtol(buf, NULL, 10); -} - -static long ksm_get_full_scans(void) -{ - char buf[10]; - ssize_t ret; - - ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0); - if (ret <= 0) - return -errno; - buf[ret] = 0; - - return strtol(buf, NULL, 10); -} - -static int ksm_merge(void) -{ - long start_scans, end_scans; - - /* Wait for two full scans such that any possible merging happened. */ - start_scans = ksm_get_full_scans(); - if (start_scans < 0) - return start_scans; - if (write(ksm_fd, "1", 1) != 1) - return -errno; - do { - end_scans = ksm_get_full_scans(); - if (end_scans < 0) - return end_scans; - } while (end_scans < start_scans + 2); - - return 0; -} - -static int ksm_unmerge(void) -{ - if (write(ksm_fd, "2", 1) != 1) - return -errno; - return 0; -} - static char *__mmap_and_merge_range(char val, unsigned long size, int prot, enum ksm_merge_mode mode) { @@ -165,12 +78,12 @@ static char *__mmap_and_merge_range(char val, unsigned long size, int prot, int ret; /* Stabilize accounting by disabling KSM completely. */ - if (ksm_unmerge()) { + if (ksm_stop() < 0) { ksft_print_msg("Disabling (unmerging) KSM failed\n"); return err_map; } - if (get_my_merging_pages() > 0) { + if (ksm_get_self_merging_pages() > 0) { ksft_print_msg("Still pages merged\n"); return err_map; } @@ -220,7 +133,7 @@ static char *__mmap_and_merge_range(char val, unsigned long size, int prot, } /* Run KSM to trigger merging and wait. */ - if (ksm_merge()) { + if (ksm_start() < 0) { ksft_print_msg("Running KSM failed\n"); goto unmap; } @@ -229,7 +142,7 @@ static char *__mmap_and_merge_range(char val, unsigned long size, int prot, * Check if anything was merged at all. 
Ignore the zero page that is * accounted differently (depending on kernel support). */ - if (val && !get_my_merging_pages()) { + if (val && !ksm_get_self_merging_pages()) { ksft_print_msg("No pages got merged\n"); goto unmap; } @@ -276,7 +189,7 @@ static void test_unmerge(void) ksft_test_result(!range_maps_duplicates(map, size), "Pages were unmerged\n"); unmap: - ksm_unmerge(); + ksm_stop(); munmap(map, size); } @@ -289,15 +202,12 @@ static void test_unmerge_zero_pages(void) ksft_print_msg("[RUN] %s\n", __func__); - if (proc_self_ksm_stat_fd < 0) { - ksft_test_result_skip("open(\"/proc/self/ksm_stat\") failed\n"); + if (ksm_get_self_zero_pages() < 0) { + ksft_test_result_skip("accessing \"/proc/self/ksm_stat\" failed\n"); return; } - if (ksm_use_zero_pages_fd < 0) { - ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); - return; - } - if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { + + if (ksm_use_zero_pages() < 0) { ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); return; } @@ -309,7 +219,7 @@ static void test_unmerge_zero_pages(void) /* Check if ksm_zero_pages is updated correctly after KSM merging */ pages_expected = size / pagesize; - if (pages_expected != get_my_ksm_zero_pages()) { + if (pages_expected != ksm_get_self_zero_pages()) { ksft_test_result_fail("'ksm_zero_pages' updated after merging\n"); goto unmap; } @@ -322,7 +232,7 @@ static void test_unmerge_zero_pages(void) /* Check if ksm_zero_pages is updated correctly after unmerging */ pages_expected /= 2; - if (pages_expected != get_my_ksm_zero_pages()) { + if (pages_expected != ksm_get_self_zero_pages()) { ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n"); goto unmap; } @@ -332,7 +242,7 @@ static void test_unmerge_zero_pages(void) *((unsigned int *)&map[offs]) = offs; /* Now we should have no zeropages remaining. 
*/ - if (get_my_ksm_zero_pages()) { + if (ksm_get_self_zero_pages()) { ksft_test_result_fail("'ksm_zero_pages' updated after write fault\n"); goto unmap; } @@ -341,7 +251,7 @@ static void test_unmerge_zero_pages(void) ksft_test_result(!range_maps_duplicates(map, size), "KSM zero pages were unmerged\n"); unmap: - ksm_unmerge(); + ksm_stop(); munmap(map, size); } @@ -370,7 +280,7 @@ static void test_unmerge_discarded(void) ksft_test_result(!range_maps_duplicates(map, size), "Pages were unmerged\n"); unmap: - ksm_unmerge(); + ksm_stop(); munmap(map, size); } @@ -457,7 +367,7 @@ static void test_unmerge_uffd_wp(void) close_uffd: close(uffd); unmap: - ksm_unmerge(); + ksm_stop(); munmap(map, size); } #endif @@ -521,7 +431,7 @@ static int test_child_ksm(void) else if (map == MAP_MERGE_SKIP) return 3; - ksm_unmerge(); + ksm_stop(); munmap(map, size); return 0; } @@ -654,7 +564,7 @@ static void test_prctl_unmerge(void) ksft_test_result(!range_maps_duplicates(map, size), "Pages were unmerged\n"); unmap: - ksm_unmerge(); + ksm_stop(); munmap(map, size); } @@ -688,7 +598,7 @@ static void test_prot_none(void) ksft_test_result(!range_maps_duplicates(map, size), "Pages were unmerged\n"); unmap: - ksm_unmerge(); + ksm_stop(); munmap(map, size); } @@ -697,19 +607,15 @@ static void init_global_file_handles(void) mem_fd = open("/proc/self/mem", O_RDWR); if (mem_fd < 0) ksft_exit_fail_msg("opening /proc/self/mem failed\n"); - ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); - if (ksm_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); - ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); - if (ksm_full_scans_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + if (ksm_stop() < 0) + ksft_exit_skip("accessing \"/sys/kernel/mm/ksm/run\") failed\n"); + if (ksm_get_full_scans() < 0) + ksft_exit_skip("accessing \"/sys/kernel/mm/ksm/full_scans\") failed\n"); pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); - proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY); - proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages", - O_RDONLY); - ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + if (ksm_get_self_merging_pages() < 0) + ksft_exit_skip("accessing \"/proc/self/ksm_merging_pages\") failed\n"); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 741fc129313d..56e9bd541edd 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -578,3 +578,126 @@ bool detect_huge_zeropage(void) close(fd); return enabled; } + +long ksm_get_self_zero_pages(void) +{ + int proc_self_ksm_stat_fd; + char buf[200]; + char *substr_ksm_zero; + size_t value_pos; + ssize_t read_size; + + proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY); + if (proc_self_ksm_stat_fd < 0) + return -errno; + + read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0); + close(proc_self_ksm_stat_fd); + if (read_size < 0) + return -errno; + + buf[read_size] = 0; + + substr_ksm_zero = strstr(buf, "ksm_zero_pages"); + if (!substr_ksm_zero) + return 0; + + value_pos = strcspn(substr_ksm_zero, "0123456789"); + return strtol(substr_ksm_zero + value_pos, NULL, 10); +} + +long ksm_get_self_merging_pages(void) +{ + int proc_self_ksm_merging_pages_fd; + char buf[10]; + ssize_t ret; + + proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages", 
+ O_RDONLY); + if (proc_self_ksm_merging_pages_fd < 0) + return -errno; + + ret = pread(proc_self_ksm_merging_pages_fd, buf, sizeof(buf) - 1, 0); + close(proc_self_ksm_merging_pages_fd); + if (ret <= 0) + return -errno; + buf[ret] = 0; + + return strtol(buf, NULL, 10); +} + +long ksm_get_full_scans(void) +{ + int ksm_full_scans_fd; + char buf[10]; + ssize_t ret; + + ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); + if (ksm_full_scans_fd < 0) + return -errno; + + ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0); + close(ksm_full_scans_fd); + if (ret <= 0) + return -errno; + buf[ret] = 0; + + return strtol(buf, NULL, 10); +} + +int ksm_use_zero_pages(void) +{ + int ksm_use_zero_pages_fd; + ssize_t ret; + + ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + if (ksm_use_zero_pages_fd < 0) + return -errno; + + ret = write(ksm_use_zero_pages_fd, "1", 1); + close(ksm_use_zero_pages_fd); + return ret == 1 ? 0 : -errno; +} + +int ksm_start(void) +{ + int ksm_fd; + ssize_t ret; + long start_scans, end_scans; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + /* Wait for two full scans such that any possible merging happened. */ + start_scans = ksm_get_full_scans(); + if (start_scans < 0) { + close(ksm_fd); + return start_scans; + } + ret = write(ksm_fd, "1", 1); + close(ksm_fd); + if (ret != 1) + return -errno; + do { + end_scans = ksm_get_full_scans(); + if (end_scans < 0) + return end_scans; + } while (end_scans < start_scans + 2); + + return 0; +} + +int ksm_stop(void) +{ + int ksm_fd; + ssize_t ret; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + ret = write(ksm_fd, "2", 1); + close(ksm_fd); + return ret == 1 ? 0 : -errno; +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index ab8722f482ae..07c4acfd84b6 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -139,6 +139,13 @@ static inline int sz2ord(size_t size, size_t pagesize) void *sys_mremap(void *old_address, unsigned long old_size, unsigned long new_size, int flags, void *new_address); +long ksm_get_self_zero_pages(void); +long ksm_get_self_merging_pages(void); +long ksm_get_full_scans(void); +int ksm_use_zero_pages(void); +int ksm_start(void); +int ksm_stop(void); + /* * On ppc64 this will only work with radix 2M hugepage size */ From c9615059cab5ad8aa6b96195163a7478fcef194c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 19 Aug 2025 08:00:47 +0000 Subject: [PATCH 123/372] selftests/mm: test that rmap behaves as expected As David suggested, currently we don't have a high level test case to verify the behavior of rmap. This patch introduce the verification on rmap by migration. The general idea is if migrate one shared page between processes, this would be reflected in all related processes. Otherwise, we have problem in rmap. Currently it covers following four scenarios: * anonymous page * shmem page * pagecache page * ksm page Link: https://lkml.kernel.org/r/20250819080047.10063-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Suggested-by: David Hildenbrand Cc: David Hildenbrand Cc: Lorenzo Stoakes Cc: Rik van Riel Cc: Liam R. 
Howlett Cc: Vlastimil Babka Cc: Harry Yoo Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 3 + tools/testing/selftests/mm/rmap.c | 433 ++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 4 + 5 files changed, 442 insertions(+) create mode 100644 tools/testing/selftests/mm/rmap.c diff --git a/MAINTAINERS b/MAINTAINERS index 6dcfbd11efef..cfb0caba3c2d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16199,6 +16199,7 @@ S: Maintained F: include/linux/rmap.h F: mm/page_vma_mapped.c F: mm/rmap.c +F: tools/testing/selftests/mm/rmap.c MEMORY MANAGEMENT - SECRETMEM M: Andrew Morton diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index eb023ea857b3..c2a8586e51a1 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -59,3 +59,4 @@ pkey_sighandler_tests_64 guard-regions merge prctl_thp_disable +rmap diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 2bb8d3ebc17c..5a1dee50b898 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -102,6 +102,7 @@ TEST_GEN_FILES += hugetlb_dio TEST_GEN_FILES += droppable TEST_GEN_FILES += guard-regions TEST_GEN_FILES += merge +TEST_GEN_FILES += rmap ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty @@ -229,6 +230,8 @@ $(OUTPUT)/ksm_tests: LDLIBS += -lnuma $(OUTPUT)/migration: LDLIBS += -lnuma +$(OUTPUT)/rmap: LDLIBS += -lnuma + local_config.mk local_config.h: check_config.sh /bin/sh ./check_config.sh $(CC) diff --git a/tools/testing/selftests/mm/rmap.c b/tools/testing/selftests/mm/rmap.c new file mode 100644 index 000000000000..13f7bccfd0a9 --- /dev/null +++ b/tools/testing/selftests/mm/rmap.c @@ -0,0 +1,433 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * RMAP functional tests + * + * Author(s): Wei Yang + */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vm_util.h" + +#define TOTAL_LEVEL 5 +#define MAX_CHILDREN 3 + +#define FAIL_ON_CHECK (1 << 0) +#define FAIL_ON_WORK (1 << 1) + +struct sembuf sem_wait = {0, -1, 0}; +struct sembuf sem_signal = {0, 1, 0}; + +enum backend_type { + ANON, + SHM, + NORM_FILE, +}; + +#define PREFIX "kst_rmap" +#define MAX_FILENAME_LEN 256 +const char *suffixes[] = { + "", + "_shm", + "_file", +}; + +struct global_data; +typedef int (*work_fn)(struct global_data *data); +typedef int (*check_fn)(struct global_data *data); +typedef void (*prepare_fn)(struct global_data *data); + +struct global_data { + int worker_level; + + int semid; + int pipefd[2]; + + unsigned int mapsize; + unsigned int rand_seed; + char *region; + + prepare_fn do_prepare; + work_fn do_work; + check_fn do_check; + + enum backend_type backend; + char filename[MAX_FILENAME_LEN]; + + unsigned long *expected_pfn; +}; + +/* + * Create a process tree with TOTAL_LEVEL height and at most MAX_CHILDREN + * children for each. + * + * It will randomly select one process as 'worker' process which will + * 'do_work' until all processes are created. And all other processes will + * wait until 'worker' finish its work. 
+ */ +void propagate_children(struct __test_metadata *_metadata, struct global_data *data) +{ + pid_t root_pid, pid; + unsigned int num_child; + int status; + int ret = 0; + int curr_child, worker_child; + int curr_level = 1; + bool is_worker = true; + + root_pid = getpid(); +repeat: + num_child = rand_r(&data->rand_seed) % MAX_CHILDREN + 1; + worker_child = is_worker ? rand_r(&data->rand_seed) % num_child : -1; + + for (curr_child = 0; curr_child < num_child; curr_child++) { + pid = fork(); + + if (pid < 0) { + perror("Error: fork\n"); + } else if (pid == 0) { + curr_level++; + + if (curr_child != worker_child) + is_worker = false; + + if (curr_level == TOTAL_LEVEL) + break; + + data->rand_seed += curr_child; + goto repeat; + } + } + + if (data->do_prepare) + data->do_prepare(data); + + close(data->pipefd[1]); + + if (is_worker && curr_level == data->worker_level) { + /* This is the worker process, first wait last process created */ + char buf; + + while (read(data->pipefd[0], &buf, 1) > 0) + ; + + if (data->do_work) + ret = data->do_work(data); + + /* Kick others */ + semctl(data->semid, 0, IPC_RMID); + } else { + /* Wait worker finish */ + semop(data->semid, &sem_wait, 1); + if (data->do_check) + ret = data->do_check(data); + } + + /* Wait all child to quit */ + while (wait(&status) > 0) { + if (WIFEXITED(status)) + ret |= WEXITSTATUS(status); + } + + if (getpid() == root_pid) { + if (ret & FAIL_ON_WORK) + SKIP(return, "Failed in worker"); + + ASSERT_EQ(ret, 0); + } else { + exit(ret); + } +} + +FIXTURE(migrate) +{ + struct global_data data; +}; + +FIXTURE_SETUP(migrate) +{ + struct global_data *data = &self->data; + + if (numa_available() < 0) + SKIP(return, "NUMA not available"); + if (numa_bitmask_weight(numa_all_nodes_ptr) <= 1) + SKIP(return, "Not enough NUMA nodes available"); + + data->mapsize = getpagesize(); + + data->expected_pfn = mmap(0, sizeof(unsigned long), + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(data->expected_pfn, MAP_FAILED); + + /* Prepare semaphore */ + data->semid = semget(IPC_PRIVATE, 1, 0666 | IPC_CREAT); + ASSERT_NE(data->semid, -1); + ASSERT_NE(semctl(data->semid, 0, SETVAL, 0), -1); + + /* Prepare pipe */ + ASSERT_NE(pipe(data->pipefd), -1); + + data->rand_seed = time(NULL); + srand(data->rand_seed); + + data->worker_level = rand() % TOTAL_LEVEL + 1; + + data->do_prepare = NULL; + data->do_work = NULL; + data->do_check = NULL; + + data->backend = ANON; +}; + +FIXTURE_TEARDOWN(migrate) +{ + struct global_data *data = &self->data; + + if (data->region != MAP_FAILED) + munmap(data->region, data->mapsize); + data->region = MAP_FAILED; + if (data->expected_pfn != MAP_FAILED) + munmap(data->expected_pfn, sizeof(unsigned long)); + data->expected_pfn = MAP_FAILED; + semctl(data->semid, 0, IPC_RMID); + data->semid = -1; + + close(data->pipefd[0]); + + switch (data->backend) { + case ANON: + break; + case SHM: + shm_unlink(data->filename); + break; + case NORM_FILE: + unlink(data->filename); + break; + } +} + +void access_region(struct global_data *data) +{ + /* + * Force read "region" to make sure page fault in. 
+ */ + FORCE_READ(*data->region); +} + +int try_to_move_page(char *region) +{ + int ret; + int node; + int status = 0; + int failures = 0; + + ret = move_pages(0, 1, (void **)®ion, NULL, &status, MPOL_MF_MOVE_ALL); + if (ret != 0) { + perror("Failed to get original numa"); + return FAIL_ON_WORK; + } + + /* Pick up a different target node */ + for (node = 0; node <= numa_max_node(); node++) { + if (numa_bitmask_isbitset(numa_all_nodes_ptr, node) && node != status) + break; + } + + if (node > numa_max_node()) { + ksft_print_msg("Couldn't find available numa node for testing\n"); + return FAIL_ON_WORK; + } + + while (1) { + ret = move_pages(0, 1, (void **)®ion, &node, &status, MPOL_MF_MOVE_ALL); + + /* migrate successfully */ + if (!ret) + break; + + /* error happened */ + if (ret < 0) { + ksft_perror("Failed to move pages"); + return FAIL_ON_WORK; + } + + /* migration is best effort; try again */ + if (++failures >= 100) + return FAIL_ON_WORK; + } + + return 0; +} + +int move_region(struct global_data *data) +{ + int ret; + int pagemap_fd; + + ret = try_to_move_page(data->region); + if (ret != 0) + return ret; + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd == -1) + return FAIL_ON_WORK; + *data->expected_pfn = pagemap_get_pfn(pagemap_fd, data->region); + + return 0; +} + +int has_same_pfn(struct global_data *data) +{ + unsigned long pfn; + int pagemap_fd; + + if (data->region == MAP_FAILED) + return 0; + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd == -1) + return FAIL_ON_CHECK; + + pfn = pagemap_get_pfn(pagemap_fd, data->region); + if (pfn != *data->expected_pfn) + return FAIL_ON_CHECK; + + return 0; +} + +TEST_F(migrate, anon) +{ + struct global_data *data = &self->data; + + /* Map an area and fault in */ + data->region = mmap(0, data->mapsize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(data->region, MAP_FAILED); + memset(data->region, 0xcf, data->mapsize); + + data->do_prepare = access_region; + data->do_work = move_region; + data->do_check = has_same_pfn; + + propagate_children(_metadata, data); +} + +TEST_F(migrate, shm) +{ + int shm_fd; + struct global_data *data = &self->data; + + snprintf(data->filename, MAX_FILENAME_LEN, "%s%s", PREFIX, suffixes[SHM]); + shm_fd = shm_open(data->filename, O_CREAT | O_RDWR, 0666); + ASSERT_NE(shm_fd, -1); + ftruncate(shm_fd, data->mapsize); + data->backend = SHM; + + /* Map a shared area and fault in */ + data->region = mmap(0, data->mapsize, PROT_READ | PROT_WRITE, + MAP_SHARED, shm_fd, 0); + ASSERT_NE(data->region, MAP_FAILED); + memset(data->region, 0xcf, data->mapsize); + close(shm_fd); + + data->do_prepare = access_region; + data->do_work = move_region; + data->do_check = has_same_pfn; + + propagate_children(_metadata, data); +} + +TEST_F(migrate, file) +{ + int fd; + struct global_data *data = &self->data; + + snprintf(data->filename, MAX_FILENAME_LEN, "%s%s", PREFIX, suffixes[NORM_FILE]); + fd = open(data->filename, O_CREAT | O_RDWR | O_EXCL, 0666); + ASSERT_NE(fd, -1); + ftruncate(fd, data->mapsize); + data->backend = NORM_FILE; + + /* Map a shared area and fault in */ + data->region = mmap(0, data->mapsize, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + ASSERT_NE(data->region, MAP_FAILED); + memset(data->region, 0xcf, data->mapsize); + close(fd); + + data->do_prepare = access_region; + data->do_work = move_region; + data->do_check = has_same_pfn; + + propagate_children(_metadata, data); +} + +void prepare_local_region(struct global_data *data) +{ + /* Allocate 
range and set the same data */ + data->region = mmap(NULL, data->mapsize, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON, -1, 0); + if (data->region == MAP_FAILED) + return; + + memset(data->region, 0xcf, data->mapsize); +} + +int merge_and_migrate(struct global_data *data) +{ + int pagemap_fd; + int ret = 0; + + if (data->region == MAP_FAILED) + return FAIL_ON_WORK; + + if (ksm_start() < 0) + return FAIL_ON_WORK; + + ret = try_to_move_page(data->region); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd == -1) + return FAIL_ON_WORK; + *data->expected_pfn = pagemap_get_pfn(pagemap_fd, data->region); + + return ret; +} + +TEST_F(migrate, ksm) +{ + int ret; + struct global_data *data = &self->data; + + if (ksm_stop() < 0) + SKIP(return, "accessing \"/sys/kernel/mm/ksm/run\") failed"); + if (ksm_get_full_scans() < 0) + SKIP(return, "accessing \"/sys/kernel/mm/ksm/full_scan\") failed"); + + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno == EINVAL) + SKIP(return, "PR_SET_MEMORY_MERGE not supported"); + else if (ret) + ksft_exit_fail_perror("PR_SET_MEMORY_MERGE=1 failed"); + + data->do_prepare = prepare_local_region; + data->do_work = merge_and_migrate; + data->do_check = has_same_pfn; + + propagate_children(_metadata, data); +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 471e539d82b8..75b94fdc915f 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -85,6 +85,8 @@ separated by spaces: test handling of page fragment allocation and freeing - vma_merge test VMA merge cases behave as expected +- rmap + test rmap behaves as expected example: ./run_vmtests.sh -t "hmm mmap ksm" EOF @@ -532,6 +534,8 @@ CATEGORY="page_frag" run_test ./test_page_frag.sh aligned CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned +CATEGORY="rmap" run_test ./rmap + echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}" | tap_prefix echo "1..${count_total}" | tap_output From 1aca4021f8456470fe92ba795886be0e595b927d Mon Sep 17 00:00:00 2001 From: Xichao Zhao Date: Tue, 19 Aug 2025 15:04:57 +0800 Subject: [PATCH 124/372] lib/test_hmm: drop redundant conversion to bool The result of integer comparison already evaluates to bool. No need for explicit conversion. No functional impact. Link: https://lkml.kernel.org/r/20250819070457.486348-1-zhao.xichao@vivo.com Signed-off-by: Xichao Zhao Reviewed-by: Alistair Popple Cc: Jason Gunthorpe Cc: Leon Romanovsky Signed-off-by: Andrew Morton --- lib/test_hmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 761725bc713c..83e3d8208a54 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -140,7 +140,7 @@ static int dmirror_bounce_init(struct dmirror_bounce *bounce, static bool dmirror_is_private_zone(struct dmirror_device *mdevice) { return (mdevice->zone_device_type == - HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false; + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE); } static enum migrate_vma_direction From 8d4bb46ba7671899a506c555ab71df65ba2e4a60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Aug 2025 08:10:08 +0200 Subject: [PATCH 125/372] ntfs3: stop using write_cache_pages Patch series "remove write_cache_pages()". Kill off write_cache_pages() after converting the last two users to the iterator. This patch (of 3): Stop using the obsolete write_cache_pages and use writeback_iter directly. 
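For reference, both conversions in this series follow the same writeback_iter() calling convention, sketched here with writepage_fn standing in for the filesystem's per-folio callback:

    struct folio *folio = NULL;
    int error = 0;

    /*
     * Pass NULL to start the walk and the previously returned folio
     * to continue it; the iterator returns the next dirty folio
     * locked, or NULL once the walk ends, at which point *error
     * holds the final status.
     */
    while ((folio = writeback_iter(mapping, wbc, folio, &error)))
            error = writepage_fn(folio, wbc);
    return error;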
Link: https://lkml.kernel.org/r/20250818061017.1526853-1-hch@lst.de Link: https://lkml.kernel.org/r/20250818061017.1526853-2-hch@lst.de Signed-off-by: Christoph Hellwig Cc: Kent Overstreet Cc: Konstantin Komarov Cc: Matthew Wilcox (Oracle) Cc: David Hildenbrand Signed-off-by: Andrew Morton --- fs/ntfs3/inode.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 37cbbee7fa58..48b4f73a93ee 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -871,9 +871,9 @@ out: } static int ntfs_resident_writepage(struct folio *folio, - struct writeback_control *wbc, void *data) + struct writeback_control *wbc) { - struct address_space *mapping = data; + struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); int ret; @@ -907,9 +907,14 @@ static int ntfs_writepages(struct address_space *mapping, if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; - if (is_resident(ntfs_i(inode))) - return write_cache_pages(mapping, wbc, ntfs_resident_writepage, - mapping); + if (is_resident(ntfs_i(inode))) { + struct folio *folio = NULL; + int error; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = ntfs_resident_writepage(folio, wbc); + return error; + } return mpage_writepages(mapping, wbc, ntfs_get_block); } From e34b21ba1541760e64e913caa10181ea3b7d0761 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Aug 2025 08:10:09 +0200 Subject: [PATCH 126/372] bcachefs: stop using write_cache_pages Stop using the obsolete write_cache_pages and use writeback_iter directly. This basically just open-codes write_cache_pages without the indirect call, but there are probably ways to structure the code even more nicely as a follow-on. Link: https://lkml.kernel.org/r/20250818061017.1526853-3-hch@lst.de Signed-off-by: Christoph Hellwig Cc: David Hildenbrand Cc: Kent Overstreet Cc: Konstantin Komarov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/bcachefs/fs-io-buffered.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 1c54b9b5bd69..fdeaa25189f2 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -655,6 +655,17 @@ do_io: return 0; } +static int bch2_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, void *data) +{ + struct folio *folio = NULL; + int error; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = __bch2_writepage(folio, wbc, data); + return error; +} + int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; @@ -663,7 +674,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); blk_start_plug(&w->plug); - int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); + int ret = bch2_write_cache_pages(mapping, wbc, w); if (w->io) bch2_writepage_do_io(w); blk_finish_plug(&w->plug); From 7bebb41b96b5a898134b757fda520b7b990a91fa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Aug 2025 08:10:10 +0200 Subject: [PATCH 127/372] mm: remove write_cache_pages No users left.
Link: https://lkml.kernel.org/r/20250818061017.1526853-4-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: David Hildenbrand Cc: Kent Overstreet Cc: Konstantin Komarov Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/writeback.h | 6 ------ mm/page-writeback.c | 30 ------------------------------ 2 files changed, 36 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a2848d731a46..2a7e134d03ee 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -360,12 +360,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb); struct folio *writeback_iter(struct address_space *mapping, struct writeback_control *wbc, struct folio *folio, int *error); -typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc, - void *data); - -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e248d1c3969..7e1e798e7213 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2590,36 +2590,6 @@ done: } EXPORT_SYMBOL_GPL(writeback_iter); -/** - * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @writepage: function called for each page - * @data: data passed to writepage function - * - * Return: %0 on success, negative error code otherwise - * - * Note: please use writeback_iter() instead. - */ -int write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) -{ - struct folio *folio = NULL; - int error; - - while ((folio = writeback_iter(mapping, wbc, folio, &error))) { - error = writepage(folio, wbc, data); - if (error == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - error = 0; - } - } - - return error; -} -EXPORT_SYMBOL(write_cache_pages); - int do_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; From 658fa653b4d17715c6b4c3686dabbfee6eb15e51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 12 Jun 2025 10:03:13 -0400 Subject: [PATCH 128/372] mm, x86/mm: move creating the tlb_flush event back to x86 code Commit e73ad5ff2f76 ("mm, x86/mm: Make the batched unmap TLB flush API more generic") moved trace_tlb_flush out of mm/rmap.c and back into x86-specific architecture code, but it kept the include of the events/tlb.h file in mm/rmap.c, even though that file no longer used the event. Then another commit added more events to mm/rmap.c and moved the #define CREATE_TRACE_POINTS from the x86-specific code to the generic mm/rmap.c file, creating both the tlb_flush tracepoint and the new tracepoints there. But since the tlb_flush tracepoint is x86 specific, this now creates that tracepoint for all other architectures as well and wastes approximately 5K of text and metadata that will never be used. Remove the events/tlb.h include from mm/rmap.c and add the CREATE_TRACE_POINTS define back in the x86 code.
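For background, trace event headers are written so that exactly one compilation unit emits the event definitions, while every other includer sees only declarations. The idiom, as restored by this patch:

    /* In exactly one .c file per event header (here arch/x86/mm/init.c): */
    #define CREATE_TRACE_POINTS
    #include <trace/events/tlb.h>   /* emits the tracepoint definitions */

    /* In every other user: */
    #include <trace/events/tlb.h>   /* declarations only */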
Link: https://lkml.kernel.org/r/20250612100313.3b9a8b80@batman.local.home Fixes: e73ad5ff2f76 ("mm, x86/mm: Make the batched unmap TLB flush API more generic") Signed-off-by: Steven Rostedt (Google) Reviewed-by: Lorenzo Stoakes Cc: Andy Lutomirski Cc: Borislav Petkov Cc: David Hildenbrand Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- arch/x86/mm/init.c | 1 + mm/rmap.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bb57e93b4caf..8bf6ad4b9400 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -34,6 +34,7 @@ * We need to define the tracepoints somewhere, and tlb.c * is only compiled when SMP=y. */ +#define CREATE_TRACE_POINTS #include <trace/events/tlb.h> #include "mm_internal.h" diff --git a/mm/rmap.c b/mm/rmap.c index 1c5988dbd1e7..4c00348c35fc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -79,7 +79,6 @@ #include #define CREATE_TRACE_POINTS -#include <trace/events/tlb.h> #include #include "internal.h" From e5e758922d1a8ce5ea97140192d395f296bcf32c Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 17 Aug 2025 03:26:46 +0000 Subject: [PATCH 129/372] mm/rmap: not necessary to mask off FOLIO_PAGES_MAPPED At this point, we are in an if branch conditional on (nr < ENTIRELY_MAPPED), and FOLIO_PAGES_MAPPED is equal to (ENTIRELY_MAPPED - 1). This means the upper bits are already cleared. It is not necessary to mask it off. Link: https://lkml.kernel.org/r/20250817032647.29147-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Rik van Riel Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Harry Yoo Signed-off-by: Andrew Morton --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index 4c00348c35fc..2f8e7ce03685 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1748,7 +1748,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr_pages = folio_large_nr_pages(folio); if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; - nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); + nr = nr_pages - nr; /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; From 5d5d75ff646c9b5e54f1c0018097d970dabafb74 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 17 Aug 2025 03:26:47 +0000 Subject: [PATCH 130/372] mm/rmap: use folio_large_nr_pages() when we are sure it is a large folio The non-large folio case is handled at the beginning, so at this point it is certainly a large folio. Use folio_large_nr_pages() here like elsewhere. Link: https://lkml.kernel.org/r/20250817032647.29147-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Rik van Riel Cc: Liam R. Howlett Cc: Vlastimil Babka Cc: Harry Yoo Signed-off-by: Andrew Morton --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index 2f8e7ce03685..34333ae3bd80 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1702,7 +1702,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio, nr = folio_sub_return_large_mapcount(folio, nr_pages, vma); if (!nr) { /* Now completely unmapped.
*/ - nr = folio_nr_pages(folio); + nr = folio_large_nr_pages(folio); } else { partially_mapped = nr < folio_large_nr_pages(folio) && !folio_entire_mapcount(folio); From 0cd01c4a5cc140efb9fc203dd05ffccf3c2197d0 Mon Sep 17 00:00:00 2001 From: gaoxiang17 Date: Thu, 21 Aug 2025 06:38:55 +0800 Subject: [PATCH 131/372] mm/cma: add 'available count' and 'total count' to trace_cma_alloc_start This makes cma info more intuitive during debugging. Show up in the trace as: 279.814717: cma_alloc_start: name=reserved request_count=4 available_count=8096 total_count=8192 align=0 309.790580: cma_alloc_start: name=reserved request_count=4 available_count=8092 total_count=8192 align=0 317.046609: cma_alloc_start: name=reserved request_count=4 available_count=8088 total_count=8192 align=0 Link: https://lkml.kernel.org/r/8a79284879c529f467478552825154b018076e95.1755729178.git.gaoxiang17@xiaomi.com Signed-off-by: gaoxiang17 Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/cma.h | 19 +++++++++++++------ mm/cma.c | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/trace/events/cma.h b/include/trace/events/cma.h index 383c09f583ac..37195edf2498 100644 --- a/include/trace/events/cma.h +++ b/include/trace/events/cma.h @@ -38,25 +38,32 @@ TRACE_EVENT(cma_release, TRACE_EVENT(cma_alloc_start, - TP_PROTO(const char *name, unsigned long count, unsigned int align), + TP_PROTO(const char *name, unsigned long request_count, unsigned long available_count, + unsigned long total_count, unsigned int align), - TP_ARGS(name, count, align), + TP_ARGS(name, request_count, available_count, total_count, align), TP_STRUCT__entry( __string(name, name) - __field(unsigned long, count) + __field(unsigned long, request_count) + __field(unsigned long, available_count) + __field(unsigned long, total_count) __field(unsigned int, align) ), TP_fast_assign( __assign_str(name); - __entry->count = count; + __entry->request_count = request_count; + __entry->available_count = available_count; + __entry->total_count = total_count; __entry->align = align; ), - TP_printk("name=%s count=%lu align=%u", + TP_printk("name=%s request_count=%lu available_count=%lu total_count=%lu align=%u", __get_str(name), - __entry->count, + __entry->request_count, + __entry->available_count, + __entry->total_count, __entry->align) ); diff --git a/mm/cma.c b/mm/cma.c index 2ffa4befb99a..e56ec64d0567 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -864,7 +864,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, if (!count) return page; - trace_cma_alloc_start(name, count, align); + trace_cma_alloc_start(name, count, cma->available_count, cma->count, align); for (r = 0; r < cma->nranges; r++) { page = NULL; From 0b1bf60c324a8135f780d50bffb53d54f8863f88 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Wed, 20 Aug 2025 11:15:47 -0700 Subject: [PATCH 132/372] mm/zswap: reduce the size of the compression buffer to a single page Reduce the compression buffer size from 2 * PAGE_SIZE to only one page, as the compression output (in the success case) should not exceed the length of the input. In the past, Chengming tried to reduce the compression buffer size, but ran into issues with the LZO algorithm (see [2]). Herbert Xu reported that the issue has been fixed (see [3]). 
Now we should have the guarantee that compressors' output should not exceed one page in the success case, and the algorithm will just report failure otherwise. With this patch, we save one page per cpu (per compression algorithm). Link: https://lkml.kernel.org/r/20250820181547.3794167-1-nphamcs@gmail.com Link: https://lore.kernel.org/linux-mm/20231213-zswap-dstmem-v4-1-f228b059dd89@bytedance.com/ [1] Link: https://lore.kernel.org/lkml/0000000000000b05cd060d6b5511@google.com/ [2] Link: https://lore.kernel.org/linux-mm/aKUmyl5gUFCdXGn-@gondor.apana.org.au/ [3] Co-developed-by: Chengming Zhou Signed-off-by: Chengming Zhou Signed-off-by: Nhat Pham Acked-by: SeongJae Park Reviewed-by: Chengming Zhou Cc: Herbert Xu Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/zswap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 0c8dd8876d8e..e5e1f5687f5e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -831,7 +831,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) u8 *buffer = NULL; int ret; - buffer = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); + buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu)); if (!buffer) { ret = -ENOMEM; goto fail; @@ -959,12 +959,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, sg_init_table(&input, 1); sg_set_page(&input, page, PAGE_SIZE, 0); - /* - * We need PAGE_SIZE * 2 here since there maybe over-compression case, - * and hardware-accelerators may won't check the dst buffer size, so - * giving the dst buffer with enough length to avoid buffer overflow. - */ - sg_init_one(&output, dst, PAGE_SIZE * 2); + sg_init_one(&output, dst, PAGE_SIZE); acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); /* From 9907e1df31c0f4bdcebe16de809121baa754e5b5 Mon Sep 17 00:00:00 2001 From: Shankari Anand Date: Wed, 16 Jul 2025 14:41:58 +0530 Subject: [PATCH 133/372] rust: mm: update ARef and AlwaysRefCounted imports from sync::aref MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update call sites in the mm subsystem to import `ARef` and `AlwaysRefCounted` from `sync::aref` instead of `types`. This aligns with the ongoing effort to move `ARef` and `AlwaysRefCounted` to sync. 
Link: https://lkml.kernel.org/r/20250716091158.812860-1-shankari.ak0208@gmail.com Signed-off-by: Shankari Anand Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/linux/issues/1173 Acked-by: Alice Ryhl Cc: Alex Gaynor Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Danilo Krummrich Cc: Gary Guo Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Miguel Ojeda Cc: Trevor Gross Signed-off-by: Andrew Morton --- rust/kernel/mm.rs | 3 ++- rust/kernel/mm/mmput_async.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/rust/kernel/mm.rs b/rust/kernel/mm.rs index 43f525c0d16c..4764d7b68f2a 100644 --- a/rust/kernel/mm.rs +++ b/rust/kernel/mm.rs @@ -13,7 +13,8 @@ use crate::{ bindings, - types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque}, + sync::aref::{ARef, AlwaysRefCounted}, + types::{NotThreadSafe, Opaque}, }; use core::{ops::Deref, ptr::NonNull}; diff --git a/rust/kernel/mm/mmput_async.rs b/rust/kernel/mm/mmput_async.rs index 9289e05f7a67..b8d2f051225c 100644 --- a/rust/kernel/mm/mmput_async.rs +++ b/rust/kernel/mm/mmput_async.rs @@ -10,7 +10,7 @@ use crate::{ bindings, mm::MmWithUser, - types::{ARef, AlwaysRefCounted}, + sync::aref::{ARef, AlwaysRefCounted}, }; use core::{ops::Deref, ptr::NonNull}; From ce32123b9bc02bb4cd343fa03d1b8bb7f9ce9c51 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 21 Aug 2025 13:29:47 +0000 Subject: [PATCH 134/372] mm: remove is_migrate_highatomic() There are 3 potential reasons for is_migrate_*() helpers: 1. They represent higher-level attributes of migratetypes, like is_migrate_movable() 2. They are ifdef'd, like is_migrate_isolate(). 3. For consistency with an is_migrate_*_page() helper, also like is_migrate_isolate(). It looks like is_migrate_highatomic() was for case 3, but that was removed in commit e0932b6c1f94 ("mm: page_alloc: consolidate free page accounting"). So remove the indirection and go back to a simple comparison. 
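For contrast, a sketch of what a helper earning its keep under reason 2 looks like, modeled on is_migrate_isolate() (shape only; the current tree may differ in detail):

    #ifdef CONFIG_MEMORY_ISOLATION
    static inline bool is_migrate_isolate(int migratetype)
    {
            return migratetype == MIGRATE_ISOLATE;
    }
    #else
    static inline bool is_migrate_isolate(int migratetype)
    {
            return false;
    }
    #endif

is_migrate_highatomic() hid no #ifdef and expressed no higher-level attribute, so the bare migratetype == MIGRATE_HIGHATOMIC comparison says the same thing more directly.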
Link: https://lkml.kernel.org/r/20250821-is-migrate-highatomic-v1-1-ddb6e5d7c566@google.com Signed-off-by: Brendan Jackman Reviewed-by: Zi Yan Acked-by: David Hildenbrand Acked-by: Johannes Weiner Reviewed-by: Lorenzo Stoakes Acked-by: SeongJae Park Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 5 ----- mm/page_alloc.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 45b725c3dc03..45da9ff5694f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1333,11 +1333,6 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; -static inline bool is_migrate_highatomic(enum migratetype migratetype) -{ - return migratetype == MIGRATE_HIGHATOMIC; -} - void setup_zone_pageset(struct zone *zone); struct migration_target_control { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ca9e6b9633f7..baead29b3e67 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -797,7 +797,7 @@ static inline void account_freepages(struct zone *zone, int nr_pages, if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); - else if (is_migrate_highatomic(migratetype)) + else if (migratetype == MIGRATE_HIGHATOMIC) WRITE_ONCE(zone->nr_free_highatomic, zone->nr_free_highatomic + nr_pages); } From 32960f750386283a9103d3cc9aa53415b79e03f5 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Thu, 21 Aug 2025 16:00:38 +0100 Subject: [PATCH 135/372] mm/huge_memory: remove enforce_sysfs from __thp_vma_allowable_orders Using forced_collapse directly is clearer and enforce_sysfs is not really needed. Link: https://lkml.kernel.org/r/20250821150038.2025521-1-usamaarif642@gmail.com Signed-off-by: Usama Arif Acked-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: SeongJae Park Reviewed-by: Baolin Wang Cc: Barry Song Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2a47cd3bb649..26cedfcd7418 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -105,7 +105,6 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, const bool smaps = type == TVA_SMAPS; const bool in_pf = type == TVA_PAGEFAULT; const bool forced_collapse = type == TVA_FORCED_COLLAPSE; - const bool enforce_sysfs = !forced_collapse; unsigned long supported_orders; /* Check the intersection of requested and supported orders. */ @@ -168,14 +167,14 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, if (!in_pf && shmem_file(vma->vm_file)) return orders & shmem_allowable_huge_orders(file_inode(vma->vm_file), vma, vma->vm_pgoff, 0, - !enforce_sysfs); + forced_collapse); if (!vma_is_anonymous(vma)) { /* - * Enforce sysfs THP requirements as necessary. Anonymous vmas + * Enforce THP collapse requirements as necessary. Anonymous vmas * were already handled in thp_vma_allowable_orders(). 
*/ - if (enforce_sysfs && + if (!forced_collapse && (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) && !hugepage_global_always()))) return 0; From 940b1be22578dc67a0aa78338177b97b9a13eb8b Mon Sep 17 00:00:00 2001 From: ally heev Date: Sat, 23 Aug 2025 22:32:08 +0530 Subject: [PATCH 136/372] kselftest: mm: fix typos in test_vmalloc.sh Fix simple typos in function name and console message. Link: https://lkml.kernel.org/r/20250823170208.184149-1-allyheev@gmail.com Signed-off-by: ally heev Cc: David Hildenbrand Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/test_vmalloc.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh index d73b846736f1..d39096723fca 100755 --- a/tools/testing/selftests/mm/test_vmalloc.sh +++ b/tools/testing/selftests/mm/test_vmalloc.sh @@ -47,14 +47,14 @@ check_test_requirements() fi } -run_perfformance_check() +run_performance_check() { echo "Run performance tests to evaluate how fast vmalloc allocation is." echo "It runs all test cases on one single CPU with sequential order." modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 echo "Done." - echo "Ccheck the kernel message buffer to see the summary." + echo "Check the kernel message buffer to see the summary." } run_stability_check() @@ -160,7 +160,7 @@ function run_test() usage else if [[ "$1" = "performance" ]]; then - run_perfformance_check + run_performance_check elif [[ "$1" = "stress" ]]; then run_stability_check elif [[ "$1" = "smoke" ]]; then From a7498388b099f08219bad5f19d00e436da27fbf9 Mon Sep 17 00:00:00 2001 From: Bala-Vignesh-Reddy Date: Thu, 21 Aug 2025 15:41:59 +0530 Subject: [PATCH 137/372] selftests: centralise maybe-unused definition in kselftest.h Several selftests subdirectories duplicated the define __maybe_unused, leading to redundant code. Move to kselftest.h header and remove other definitions. 
This addresses the duplication noted in the proc-pid-vm warning fix. Link: https://lkml.kernel.org/r/20250821101159.2238-1-reddybalavignesh9979@gmail.com Signed-off-by: Bala-Vignesh-Reddy Suggested-by: Andrew Morton Link: https://lore.kernel.org/lkml/20250820143954.33d95635e504e94df01930d0@linux-foundation.org/ Reviewed-by: Wei Yang Acked-by: SeongJae Park Reviewed-by: Ming Lei Acked-by: Mickaël Salaün [landlock] Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/kselftest.h | 4 ++++ tools/testing/selftests/landlock/audit.h | 6 ++---- tools/testing/selftests/landlock/common.h | 4 ---- tools/testing/selftests/mm/pkey-helpers.h | 3 --- tools/testing/selftests/net/psock_lib.h | 4 ---- tools/testing/selftests/perf_events/watermark_signal.c | 2 -- tools/testing/selftests/ublk/utils.h | 2 -- 7 files changed, 6 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index c3b6d2604b1e..661d31c4b558 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -92,6 +92,10 @@ #endif #define __printf(a, b) __attribute__((format(printf, a, b))) +#ifndef __maybe_unused +#define __maybe_unused __attribute__((__unused__)) +#endif + /* counters */ struct ksft_count { unsigned int ksft_pass; diff --git a/tools/testing/selftests/landlock/audit.h b/tools/testing/selftests/landlock/audit.h index b16986aa6442..02fd1393947a 100644 --- a/tools/testing/selftests/landlock/audit.h +++ b/tools/testing/selftests/landlock/audit.h @@ -20,14 +20,12 @@ #include #include +#include "../kselftest.h" + #ifndef ARRAY_SIZE #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif -#ifndef __maybe_unused -#define __maybe_unused __attribute__((__unused__)) -#endif - #define REGEX_LANDLOCK_PREFIX "^audit([0-9.:]\\+): domain=\\([0-9a-f]\\+\\)" struct audit_filter { diff --git a/tools/testing/selftests/landlock/common.h b/tools/testing/selftests/landlock/common.h index 88a3c78f5d98..9acecae36f51 100644 --- a/tools/testing/selftests/landlock/common.h +++ b/tools/testing/selftests/landlock/common.h @@ -22,10 +22,6 @@ #define TMP_DIR "tmp" -#ifndef __maybe_unused -#define __maybe_unused __attribute__((__unused__)) -#endif - /* TEST_F_FORK() should not be used for new tests.
*/ #define TEST_F_FORK(fixture_name, test_name) TEST_F(fixture_name, test_name) diff --git a/tools/testing/selftests/mm/pkey-helpers.h b/tools/testing/selftests/mm/pkey-helpers.h index ea404f80e6cb..fa15f006fa68 100644 --- a/tools/testing/selftests/mm/pkey-helpers.h +++ b/tools/testing/selftests/mm/pkey-helpers.h @@ -84,9 +84,6 @@ extern void abort_hooks(void); #ifndef noinline # define noinline __attribute__((noinline)) #endif -#ifndef __maybe_unused -# define __maybe_unused __attribute__((__unused__)) -#endif int sys_pkey_alloc(unsigned long flags, unsigned long init_val); int sys_pkey_free(unsigned long pkey); diff --git a/tools/testing/selftests/net/psock_lib.h b/tools/testing/selftests/net/psock_lib.h index 6e4fef560873..067265b0a554 100644 --- a/tools/testing/selftests/net/psock_lib.h +++ b/tools/testing/selftests/net/psock_lib.h @@ -22,10 +22,6 @@ #define PORT_BASE 8000 -#ifndef __maybe_unused -# define __maybe_unused __attribute__ ((__unused__)) -#endif - static __maybe_unused void pair_udp_setfilter(int fd) { /* the filter below checks for all of the following conditions that diff --git a/tools/testing/selftests/perf_events/watermark_signal.c b/tools/testing/selftests/perf_events/watermark_signal.c index e03fe1b9bba2..b3a72f0ac522 100644 --- a/tools/testing/selftests/perf_events/watermark_signal.c +++ b/tools/testing/selftests/perf_events/watermark_signal.c @@ -17,8 +17,6 @@ #include "../kselftest_harness.h" -#define __maybe_unused __attribute__((__unused__)) - static int sigio_count; static void handle_sigio(int signum __maybe_unused, diff --git a/tools/testing/selftests/ublk/utils.h b/tools/testing/selftests/ublk/utils.h index 36545d1567f1..a852e0b7153e 100644 --- a/tools/testing/selftests/ublk/utils.h +++ b/tools/testing/selftests/ublk/utils.h @@ -2,8 +2,6 @@ #ifndef KUBLK_UTILS_H #define KUBLK_UTILS_H -#define __maybe_unused __attribute__((unused)) - #ifndef min #define min(a, b) ((a) < (b) ? (a) : (b)) #endif From 3615e106e0f7099af5ac8a2d4338b34b7a2dbae1 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 22 Aug 2025 02:57:32 +0000 Subject: [PATCH 138/372] mm/khugepaged: use list_xxx() helper to improve readability In general, khugepaged_scan_mm_slot() iterates khugepaged_scan.mm_head list to get a mm_struct for collapse memory. Use list_xxx() helper would be more obvious to the list iteration operation. No functional change. Link: https://lkml.kernel.org/r/20250822025732.9025-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Lorenzo Stoakes Acked-by: SeongJae Park Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Reviewed-by: Dev Jain Cc: Barry Song Cc: Mariano Pache Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/khugepaged.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index d3d4f116e14b..24e18a7f8a93 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2402,7 +2402,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, mm_slot = khugepaged_scan.mm_slot; slot = &mm_slot->slot; } else { - slot = list_entry(khugepaged_scan.mm_head.next, + slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; @@ -2515,9 +2515,8 @@ breakouterloop_mmap_lock: * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. 
*/ - if (slot->mm_node.next != &khugepaged_scan.mm_head) { - slot = list_entry(slot->mm_node.next, - struct mm_slot, mm_node); + if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { + slot = list_next_entry(slot, mm_node); khugepaged_scan.mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; From 786eb990cfb78aab94eb74fb32a030e14723a620 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Fri, 22 Aug 2025 14:18:45 +0530 Subject: [PATCH 139/372] drivers/base/node: handle error properly in register_one_node() If register_node() returns an error, it is not handled correctly. The function will proceed further and try to register CPUs under the node, which is not correct. So, in this patch, if register_node() returns an error, we return immediately from the function. Link: https://lkml.kernel.org/r/20250822084845.19219-1-donettom@linux.ibm.com Fixes: 76b67ed9dce6 ("[PATCH] node hotplug: register cpu: remove node struct") Signed-off-by: Donet Tom Acked-by: David Hildenbrand Cc: Alison Schofield Cc: Danilo Krummrich Cc: Dave Jiang Cc: Donet Tom Cc: Greg Kroah-Hartman Cc: Hiroyuki Kamezawa Cc: Jonathan Cameron Cc: Oscar Salvador Cc: "Ritesh Harjani (IBM)" Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/base/node.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/base/node.c b/drivers/base/node.c index 3399594136b2..45d512939c40 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -885,6 +885,11 @@ int register_one_node(int nid) node_devices[nid] = node; error = register_node(node_devices[nid], nid); + if (error) { + node_devices[nid] = NULL; + kfree(node); + return error; + } /* link cpu under this node */ for_each_present_cpu(cpu) { From 79dfed097680084f3d4716fa2c5bc945233bd2c0 Mon Sep 17 00:00:00 2001 From: I Viswanath Date: Mon, 25 Aug 2025 22:36:43 +0530 Subject: [PATCH 140/372] selftests/mm: use calloc instead of malloc in pagemap_ioctl.c As per Documentation/process/deprecated.rst, dynamic size calculations should not be performed in memory allocator arguments due to possible overflows. Replace malloc with calloc to avoid open-ended arithmetic and prevent possible overflows.
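A self-contained demonstration of the overflow that calloc() guards against; the size is contrived to force a 64-bit wrap:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            size_t n = ((size_t)1 << 60) + 1;

            /* n * 32 wraps to 32, so malloc() returns a tiny buffer */
            void *p = malloc(n * 32);
            /* calloc() detects the multiplication overflow and returns NULL */
            void *q = calloc(n, 32);

            printf("malloc=%p calloc=%p\n", p, q);
            free(p);
            free(q);
            return 0;
    }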
Link: https://lkml.kernel.org/r/20250825170643.63174-1-viswanathiyyappan@gmail.com Signed-off-by: I Viswanath Reviewed-by: Vishal Moola (Oracle) Acked-by: David Hildenbrand Reviewed by: Donet Tom Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index e6face7c0166..4fc8e578ec7c 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -209,7 +209,7 @@ int userfaultfd_tests(void) wp_addr_range(mem, mem_size); vec_size = mem_size/page_size; - vec = malloc(sizeof(struct page_region) * vec_size); + vec = calloc(vec_size, sizeof(struct page_region)); written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); @@ -247,11 +247,11 @@ int sanity_tests_sd(void) vec_size = num_pages/2; mem_size = num_pages * page_size; - vec = malloc(sizeof(struct page_region) * vec_size); + vec = calloc(vec_size, sizeof(struct page_region)); if (!vec) ksft_exit_fail_msg("error nomem\n"); - vec2 = malloc(sizeof(struct page_region) * vec_size); + vec2 = calloc(vec_size, sizeof(struct page_region)); if (!vec2) ksft_exit_fail_msg("error nomem\n"); @@ -436,7 +436,7 @@ int sanity_tests_sd(void) mem_size = 1050 * page_size; vec_size = mem_size/(page_size*2); - vec = malloc(sizeof(struct page_region) * vec_size); + vec = calloc(vec_size, sizeof(struct page_region)); if (!vec) ksft_exit_fail_msg("error nomem\n"); @@ -491,7 +491,7 @@ int sanity_tests_sd(void) mem_size = 10000 * page_size; vec_size = 50; - vec = malloc(sizeof(struct page_region) * vec_size); + vec = calloc(vec_size, sizeof(struct page_region)); if (!vec) ksft_exit_fail_msg("error nomem\n"); @@ -541,7 +541,7 @@ int sanity_tests_sd(void) vec_size = 1000; mem_size = vec_size * page_size; - vec = malloc(sizeof(struct page_region) * vec_size); + vec = calloc(vec_size, sizeof(struct page_region)); if (!vec) ksft_exit_fail_msg("error nomem\n"); @@ -695,8 +695,8 @@ int base_tests(char *prefix, char *mem, unsigned long long mem_size, int skip) } vec_size = mem_size/page_size; - vec = malloc(sizeof(struct page_region) * vec_size); - vec2 = malloc(sizeof(struct page_region) * vec_size); + vec = calloc(vec_size, sizeof(struct page_region)); + vec2 = calloc(vec_size, sizeof(struct page_region)); /* 1. all new pages must be not be written (dirty) */ written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, @@ -807,8 +807,8 @@ int hpage_unit_tests(void) unsigned long long vec_size = map_size/page_size; struct page_region *vec, *vec2; - vec = malloc(sizeof(struct page_region) * vec_size); - vec2 = malloc(sizeof(struct page_region) * vec_size); + vec = calloc(vec_size, sizeof(struct page_region)); + vec2 = calloc(vec_size, sizeof(struct page_region)); if (!vec || !vec2) ksft_exit_fail_msg("malloc failed\n"); @@ -997,7 +997,7 @@ int unmapped_region_tests(void) void *start = (void *)0x10000000; int written, len = 0x00040000; long vec_size = len / page_size; - struct page_region *vec = malloc(sizeof(struct page_region) * vec_size); + struct page_region *vec = calloc(vec_size, sizeof(struct page_region)); /* 1. 
Get written pages */ written = pagemap_ioctl(start, len, vec, vec_size, 0, 0, @@ -1062,7 +1062,7 @@ int sanity_tests(void) mem_size = 10 * page_size; vec_size = mem_size / page_size; - vec = malloc(sizeof(struct page_region) * vec_size); + vec = calloc(vec_size, sizeof(struct page_region)); mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); if (mem == MAP_FAILED || vec == MAP_FAILED) ksft_exit_fail_msg("error nomem\n"); From 46afff459925297d9d47c043eb8541fabac9bb0a Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 26 Aug 2025 00:37:21 +0800 Subject: [PATCH 141/372] mm/page-writeback: drop usage of folio_index folio_index is only needed for mixed usage of page cache and swap cache. The remaining three caller in page-writeback are for page cache tag marking. Swap cache space doesn't use tag (explicitly sets mapping_set_no_writeback_tags), so use folio->index here directly. Link: https://lkml.kernel.org/r/20250825163721.17734-1-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e1e798e7213..5f90fd6a7137 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -38,10 +38,10 @@ #include #include #include +#include #include #include "internal.h" -#include "swap.h" /* * Sleep at most 200ms at a time in balance_dirty_pages(). @@ -2705,12 +2705,18 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, { unsigned long flags; + /* + * Shmem writeback relies on swap, and swap writeback is LRU based, + * not using the dirty mark. + */ + VM_WARN_ON_ONCE(folio_test_swapcache(folio) || shmem_mapping(mapping)); + xa_lock_irqsave(&mapping->i_pages, flags); if (folio->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !folio_test_uptodate(folio)); folio_account_dirtied(folio, mapping); - __xa_set_mark(&mapping->i_pages, folio_index(folio), - PAGECACHE_TAG_DIRTY); + __xa_set_mark(&mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); } xa_unlock_irqrestore(&mapping->i_pages, flags); } @@ -2989,7 +2995,7 @@ bool __folio_end_writeback(struct folio *folio) xa_lock_irqsave(&mapping->i_pages, flags); ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); - __xa_clear_mark(&mapping->i_pages, folio_index(folio), + __xa_clear_mark(&mapping->i_pages, folio->index, PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -3026,7 +3032,7 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (mapping && mapping_use_writeback_tags(mapping)) { - XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + XA_STATE(xas, &mapping->i_pages, folio->index); struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; From dfd04add595b97758c8ad1ee970554b7af5c57dd Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Mon, 25 Aug 2025 09:59:26 -0300 Subject: [PATCH 142/372] kmem/tracing: add kmem name to kmem_cache_alloc tracepoint The kmem_cache_free tracepoint includes a "name" field, which allows for easy identification and filtering of specific kmem's. However, the kmem_cache_alloc tracepoint lacks this field, making it difficult to pair corresponding alloc and free events for analysis. 
Add the "name" field to kmem_cache_alloc to enable consistent tracking and correlation of kmem alloc and free events. Link: https://lkml.kernel.org/r/20250825125927.59816-1-wander@redhat.com Signed-off-by: Wander Lairson Costa Cc: David Hildenbrand Cc: David Rientjes Cc: Martin Liu Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Steven Rostedt Cc: Zi Yan Signed-off-by: Andrew Morton --- include/trace/events/kmem.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 474358773abe..7f93e754da5c 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -22,6 +22,7 @@ TRACE_EVENT(kmem_cache_alloc, TP_STRUCT__entry( __field( unsigned long, call_site ) __field( const void *, ptr ) + __string( name, s->name ) __field( size_t, bytes_req ) __field( size_t, bytes_alloc ) __field( unsigned long, gfp_flags ) @@ -32,6 +33,7 @@ TRACE_EVENT(kmem_cache_alloc, TP_fast_assign( __entry->call_site = call_site; __entry->ptr = ptr; + __assign_str(name); __entry->bytes_req = s->object_size; __entry->bytes_alloc = s->size; __entry->gfp_flags = (__force unsigned long)gfp_flags; @@ -41,9 +43,10 @@ TRACE_EVENT(kmem_cache_alloc, (s->flags & SLAB_ACCOUNT)) : false; ), - TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", + TP_printk("call_site=%pS ptr=%p name=%s bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s", (void *)__entry->call_site, __entry->ptr, + __get_str(name), __entry->bytes_req, __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags), From e76e09bdf9f7d58ccc62d416a4b90700000f145a Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Tue, 26 Aug 2025 14:38:16 +0200 Subject: [PATCH 143/372] kho: make sure kho_scratch argument is fully consumed When specifying fixed sized scratch areas, the parser only parses the three scratch sizes and ignores the rest of the argument. This means the argument can have any bogus trailing characters. For example, "kho_scratch=256M,512M,512Mfoobar" results in successful parsing: [ 0.000000] KHO: scratch areas: lowmem: 256MiB global: 512MiB pernode: 512MiB It is generally a good idea to parse arguments as strictly as possible. In addition, if bogus trailing characters are allowed in the kho_scratch argument, it is possible that some people might end up using them and later extensions to the argument format will cause unexpected breakages. Make sure the argument is fully consumed after all three scratch sizes are parsed. With this change, the bogus argument "kho_scratch=256M,512M,512Mfoobar" results in: [ 0.000000] Malformed early option 'kho_scratch' Link: https://lkml.kernel.org/r/20250826123817.64681-1-pratyush@kernel.org Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Pratyush Yadav Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 1a65419e3756..8079fc4b9189 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -451,6 +451,10 @@ static int __init kho_parse_scratch_size(char *p) if (!total_size) return -EINVAL; + /* The string should be fully consumed by now. 
*/ + if (*p) + return -EINVAL; + scratch_size_lowmem = sizes[0]; scratch_size_global = sizes[1]; scratch_size_pernode = sizes[2]; From ef49b7b39d50b9e4f9d63e64f5d8acafe3c71158 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Aug 2025 15:13:44 +0000 Subject: [PATCH 144/372] maple_tree: fix MAPLE_PARENT_RANGE32 and parent pointer docs MAPLE_PARENT_RANGE32 should be 0x02 as a 32 bit node is indicated by the bit pattern 0b010 which is the hex value 0x02. There are no users currently, so there is no associated bug with this wrong value. Fix typo Note -> Node and replace x with b to indicate binary values. Link: https://lkml.kernel.org/r/20250826151344.403286-1-sidhartha.kumar@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 16 ++++++++-------- lib/maple_tree.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index bafe143b1f78..41e633264e51 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -57,17 +57,17 @@ * MT_FLAGS_ALLOC_RANGE flag. * * Node types: - * 0x??1 = Root - * 0x?00 = 16 bit nodes - * 0x010 = 32 bit nodes - * 0x110 = 64 bit nodes + * 0b??1 = Root + * 0b?00 = 16 bit nodes + * 0b010 = 32 bit nodes + * 0b110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location - * 0x??1 : Root - * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 - * 0x010 : 32 bit values, type in 0-2, slot in 3-6 - * 0x110 : 64 bit values, type in 0-2, slot in 3-6 + * 0b??1 : Root + * 0b?00 : 16 bit values, type in 0-1, slot in 2-6 + * 0b010 : 32 bit values, type in 0-2, slot in 3-6 + * 0b110 : 64 bit values, type in 0-2, slot in 3-6 */ /* diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b4ee2d29d7a9..c57a4615bdff 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -405,11 +405,11 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt) * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * - * Note types: - * 0x??1 = Root - * 0x?00 = 16 bit nodes - * 0x010 = 32 bit nodes - * 0x110 = 64 bit nodes + * Node types: + * 0b??1 = Root + * 0b?00 = 16 bit nodes + * 0b010 = 32 bit nodes + * 0b110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root @@ -427,7 +427,7 @@ static __always_inline bool mt_is_alloc(struct maple_tree *mt) #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 -#define MAPLE_PARENT_RANGE32 0x04 +#define MAPLE_PARENT_RANGE32 0x02 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* From 6c3826173e6aece0f7375392f509065b6944fd35 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 26 Aug 2025 14:06:54 +0000 Subject: [PATCH 145/372] mm/page_alloc: harmonize should_compact_retry() type Currently order is signed in one version of the function and unsigned in the other. Tidy that up. In page_alloc.c, order is unsigned in the vast majority of cases. But, there is a cluster of exceptions in compaction-related code (probably stemming from the fact that compact_control.order is signed). So, prefer local consistency and make this one signed too. 
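The consistency matters in practice because compaction code uses negative order values as sentinels, and once a signed order meets an unsigned one in a comparison the usual arithmetic conversions bite. A standalone illustration (values hypothetical):

    #include <stdio.h>

    int main(void)
    {
            int order = -1;         /* e.g. a "no order" sentinel */
            unsigned int limit = 3;

            if (order < limit)      /* -1 is converted to UINT_MAX here */
                    printf("compared as signed\n");
            else
                    printf("-1 became %u\n", (unsigned int)order);
            return 0;
    }

This prints "-1 became 4294967295", the kind of surprise that keeping a single signedness avoids.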
Link: https://lkml.kernel.org/r/20250826-cleanup-should_compact_retry-v1-1-d2ca89727fcf@google.com Signed-off-by: Brendan Jackman Reviewed-by: Zi Yan Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index baead29b3e67..3e37922635d5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4182,7 +4182,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, enum compact_result compact_result, enum compact_priority *compact_priority, int *compaction_retries) From ab1c34c83407b6ae1f1313789cb7ec813c26b5e8 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 26 Aug 2025 17:35:54 +0800 Subject: [PATCH 146/372] mm: shmem: use 'folio' for shmem_partial_swap_usage() It is more straightforward to use the term `folio'. No functional changes. Link: https://lkml.kernel.org/r/a2d39608d99cba1130cacd9cffbafc6949193c08.1756200587.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 13cc51df3893..b4d4f0e9c40a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1006,15 +1006,15 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; unsigned long swapped = 0; unsigned long max = end - 1; rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) swapped += 1 << xas_get_order(&xas); if (xas.xa_index == max) break; From 6d11dec130ad16f2b08ef88fb7f3d37dc99d74d6 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 26 Aug 2025 17:35:55 +0800 Subject: [PATCH 147/372] mm: shmem: drop the unnecessary folio_nr_pages() We've got the number of pages in the folio earlier, thus remove the redundant folio_nr_pages() call. Link: https://lkml.kernel.org/r/67c80182ebd949e3894908e01e224697c143aabb.1756200587.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index b4d4f0e9c40a..640fecc42f60 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2430,7 +2430,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, goto failed; } folio_wait_writeback(folio); - nr_pages = folio_nr_pages(folio); /* * Some architectures may have to restore extra metadata to the From 060b6c72ce94ebd1190842e81eb9675d7400c5f6 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 9 Sep 2025 11:45:29 +0530 Subject: [PATCH 148/372] selftests/mm/uffd-stress: make test operate on less hugetlb memory Patch series "selftests/mm: uffd-stress fixes", v2. This patchset ensures that the number of hugepages is correctly set in the system so that the uffd-stress test does not fail due to the racy nature of the test. Patch 1 changes the hugepage constraint in the run_vmtests.sh script, whereas patch 2 changes the constraint in the test itself. 
This patch (of 2): We observed uffd-stress selftest failure on arm64 and intermittent failures on x86 too: running ./uffd-stress hugetlb-private 128 32 bounces: 17, mode: rnd read, ERROR: UFFDIO_COPY error: -12 (errno=12, @uffd-common.c:617) [FAIL] not ok 18 uffd-stress hugetlb-private 128 32 # exit=1 For this particular case, the number of free hugepages from run_vmtests.sh will be 128, and the test will allocate 64 hugepages in the source location. The stress() function will start spawning threads which will operate on the destination location, triggering uffd-operations like UFFDIO_COPY from src to dst, which means that we will require 64 more hugepages for the dst location. Let us observe the locking_thread() function. It will lock the mutex kept at dst, triggering uffd-copy. Suppose that 127 (64 for src and 63 for dst) hugepages have been reserved. In case of BOUNCE_RANDOM, it may happen that two threads trying to lock the mutex at dst, try to do so at the same hugepage number. If one thread succeeds in reserving the last hugepage, then the other thread may fail in alloc_hugetlb_folio(), returning -ENOMEM. I can confirm that this is indeed the case by this hacky patch: :--- a/mm/hugetlb.c ; +++ b/mm/hugetlb.c ; @@ -6929,6 +6929,11 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, ; ; folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); ; if (IS_ERR(folio)) { ; + pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE); ; + if (actual_pte) { ; + ret = -EEXIST; ; + goto out; ; + } ; ret = -ENOMEM; ; goto out; ; } This code path gets triggered indicating that the PMD at which one thread is trying to map a hugepage, gets filled by a racing thread. Therefore, instead of using freepgs to compute the amount of memory, use freepgs - (min(32, nr_cpus) - 1), so that the test still has some extra hugepages to use. The adjustment is a function of min(32, nr_cpus) - the value of nr_parallel in the test - because in the worst case, nr_parallel number of threads will try to map a hugepage on the same PMD, one will win the allocation race, and the other nr_parallel - 1 threads will fail, so we need extra nr_parallel - 1 hugepages to satisfy this request. Note that, in case the adjusted value underflows, there is a check for the number of free hugepages in the test itself, which will fail: get_free_hugepages() < bytes / page_size A negative value will be passed on to bytes which is of type size_t, thus the RHS will become a large value and the check will fail, so we are safe. 
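The sizing rule that falls out of the analysis, mirrored in C for clarity (the helper itself is illustrative only; the real arithmetic lives in run_vmtests.sh and uses the same variable names):

    #include <stdio.h>

    static unsigned long half_ufd_size_mb(unsigned long freepgs,
                                          unsigned long hpgsize_kb,
                                          unsigned long nr_cpus)
    {
            unsigned long nr_parallel = nr_cpus < 32 ? nr_cpus : 32;
            /* spares for the nr_parallel - 1 threads that lose the race */
            unsigned long adjustment = nr_parallel - 1;

            return ((freepgs - adjustment) * hpgsize_kb) / 1024 / 2;
    }

    int main(void)
    {
            /* e.g. 128 free 2MiB hugepages on a 32-CPU machine: 97 MiB */
            printf("%lu MiB\n", half_ufd_size_mb(128, 2048, 32));
            return 0;
    }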
Link: https://lkml.kernel.org/r/20250909061531.57272-1-dev.jain@arm.com Link: https://lkml.kernel.org/r/20250909061531.57272-2-dev.jain@arm.com Signed-off-by: Dev Jain Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/hugetlb.c | 5 +++++ tools/testing/selftests/mm/run_vmtests.sh | 10 +++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 514fab5a20ef..2c06557ee95d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6932,6 +6932,11 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, folio = alloc_hugetlb_folio(dst_vma, dst_addr, false); if (IS_ERR(folio)) { + pte_t *actual_pte = hugetlb_walk(dst_vma, dst_addr, PMD_SIZE); + if (actual_pte) { + ret = -EEXIST; + goto out; + } ret = -ENOMEM; goto out; } diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 75b94fdc915f..9e88cc25b9df 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -324,11 +324,15 @@ CATEGORY="gup_test" run_test ./gup_longterm CATEGORY="userfaultfd" run_test ./uffd-unit-tests uffd_stress_bin=./uffd-stress CATEGORY="userfaultfd" run_test ${uffd_stress_bin} anon 20 16 -# Hugetlb tests require source and destination huge pages. Pass in half -# the size of the free pages we have, which is used for *each*. +# Hugetlb tests require source and destination huge pages. Pass in almost half +# the size of the free pages we have, which is used for *each*. An adjustment +# of (nr_parallel - 1) is done (see nr_parallel in uffd-stress.c) to have some +# extra hugepages - this is done to prevent the test from failing by racily +# reserving more hugepages than strictly required. # uffd-stress expects a region expressed in MiB, so we adjust # half_ufd_size_MB accordingly. -half_ufd_size_MB=$(((freepgs * hpgsize_KB) / 1024 / 2)) +adjustment=$(( (31 < (nr_cpus - 1)) ? 31 : (nr_cpus - 1) )) +half_ufd_size_MB=$((((freepgs - adjustment) * hpgsize_KB) / 1024 / 2)) CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16 From 1580cd50b6d40af81fe48611ef7a60545eb8d40b Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 9 Sep 2025 11:45:30 +0530 Subject: [PATCH 149/372] selftests/mm/uffd-stress: stricten constraint on free hugepages needed before the test The test requires at least 2 * (bytes/page_size) hugetlb memory, since we require identical number of hugepages for src and dst location. Fix this. Along with the above, as explained in patch "selftests/mm/uffd-stress: Make test operate on less hugetlb memory", the racy nature of the test requires that we have some extra number of hugepages left beyond what is required. Therefore, stricten this constraint. 
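Spelled out, the stricter requirement installed by the hunk below is that the free hugepage pool covers both mappings plus the racy spares (a sketch using the test's own names):

    /* src + dst mappings, plus spares for racy extra reservations */
    unsigned long need = 2 * (bytes / page_size) + nr_parallel - 1;

    if (get_free_hugepages() < need)
            return KSFT_SKIP;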
Link: https://lkml.kernel.org/r/20250909061531.57272-3-dev.jain@arm.com Fixes: 5a6aa60d1823 ("selftests/mm: skip uffd hugetlb tests with insufficient hugepages") Signed-off-by: Dev Jain Cc: David Hildenbrand Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index c0f64df5085c..ecd016329935 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -448,12 +448,6 @@ int main(int argc, char **argv) parse_test_type_arg(argv[1]); bytes = atol(argv[2]) * 1024 * 1024; - if (test_type == TEST_HUGETLB && - get_free_hugepages() < bytes / page_size) { - printf("skip: Skipping userfaultfd... not enough hugepages\n"); - return KSFT_SKIP; - } - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); if (nr_cpus > 32) { /* Don't let calculation below go to zero. */ @@ -464,6 +458,17 @@ int main(int argc, char **argv) nr_parallel = nr_cpus; } + /* + * src and dst each require bytes / page_size number of hugepages. + * Ensure nr_parallel - 1 hugepages on top of that to account + * for racy extra reservation of hugepages. + */ + if (test_type == TEST_HUGETLB && + get_free_hugepages() < 2 * (bytes / page_size) + nr_parallel - 1) { + printf("skip: Skipping userfaultfd... not enough hugepages\n"); + return KSFT_SKIP; + } + nr_pages_per_cpu = bytes / page_size / nr_parallel; if (!nr_pages_per_cpu) { _err("pages_per_cpu = 0, cannot test (%lu / %lu / %lu)", From c090868f59ce888db838cf543d18f731d13ed498 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 26 Aug 2025 11:09:55 +0800 Subject: [PATCH 150/372] Revert "hugetlb: make hugetlb depends on SYSFS or SYSCTL" Commit f8142cf94d47 ("hugetlb: make hugetlb depends on SYSFS or SYSCTL") added dependency on SYSFS or SYSCTL but hugetlb can be used without SYSFS or SYSCTL. So this dependency is wrong and should be removed. For users with CONFIG_SYSFS or CONFIG_SYSCTL on, there should be no difference. For users have CONFIG_SYSFS and CONFIG_SYSCTL both undefined, hugetlbfs can still works perfectly well through cmdline except a possible kismet warning[1] when select CONFIG_HUGETLBFS. IMHO, it might not worth a backport. This reverts commit f8142cf94d4737ea0c3baffb3b9bad8addcb9b6b. It overlooked the scenario of using hugetlb through boot parameters when it was submitted. 
Link: https://lkml.kernel.org/r/20250826030955.2898709-1-linmiaohe@huawei.com Link: https://lore.kernel.org/all/5c99458f-4a91-485f-8a35-3618a992e2e4@csgroup.eu/ [1] Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202508222032.bwJsQPZ1-lkp@intel.com/ Signed-off-by: Miaohe Lin Cc: David Hildenbrand Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- fs/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/Kconfig b/fs/Kconfig index c654a3642897..187a75440aca 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -250,7 +250,6 @@ config ARCH_SUPPORTS_HUGETLBFS menuconfig HUGETLBFS bool "HugeTLB file system support" depends on ARCH_SUPPORTS_HUGETLBFS - depends on (SYSFS || SYSCTL) select MEMFD_CREATE select PADATA if SMP help From cf1dec76ba8a00b20e51d205f3c9f5c45bc96df2 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Aug 2025 14:55:35 -0700 Subject: [PATCH 151/372] mm/filemap: add AS_KERNEL_FILE Patch series "introduce kernel file mapped folios", v4. Btrfs currently tracks its metadata pages in the page cache, using a fake inode (fs_info->btree_inode) with offsets corresponding to where the metadata is stored in the filesystem's full logical address space. A consequence of this is that when btrfs uses filemap_add_folio(), this usage is charged to the cgroup of whichever task happens to be running at the time. These folios don't belong to any particular user cgroup, so I don't think it makes much sense for them to be charged in that way. Some negative consequences as a result: - A task can be holding some important btrfs locks, then need to lookup some metadata and go into reclaim, extending the duration it holds that lock for, and unfairly pushing its own reclaim pain onto other cgroups. - If that cgroup goes into reclaim, it might reclaim these folios a different non-reclaiming cgroup might need soon. This is naturally offset by LRU reclaim, but still. We have two options for how to manage such file pages: 1. charge them to the root cgroup. 2. don't charge them to any cgroup at all. 2. breaks the invariant that every mapped page has a cgroup. This is workable, but unnecessarily risky. Therefore, go with 1. A very similar proposal to use the root cgroup was previously made by Qu, where he eventually proposed the idea of setting it per address_space. This makes good sense for the btrfs use case, as the behavior should apply to all use of the address_space, not select allocations. I.e., if someone adds another filemap_add_folio() call using btrfs's btree_inode, we would almost certainly want to account that to the root cgroup as well. This patch (of 3): Add the flag AS_KERNEL_FILE to the address_space to indicate that this mapping's memory is exempt from the usual memcg accounting. 
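Concretely, option 1 can be implemented by briefly overriding the active memcg around the charge, so the folio is accounted to the root cgroup regardless of which task faulted it in. A sketch equivalent in shape to the filemap_add_folio() hunk below:

    struct mem_cgroup *old = NULL;
    bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags);

    if (kernel_file)
            old = set_active_memcg(root_mem_cgroup);
    ret = mem_cgroup_charge(folio, NULL, gfp);
    if (kernel_file)
            set_active_memcg(old);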
[boris@bur.io: fix CONFIG_MEMCG build for AS_KERNEL_FILE] Link: https://lkml.kernel.org/r/6de59ddeec81b5c294d337c001ba0061631d4ec6.1755816635.git.boris@bur.io Link: https://lore.kernel.org/linux-mm/b5fef5372ae454a7b6da4f2f75c427aeab6a07d6.1727498749.git.wqu@suse.com/ Link: https://lkml.kernel.org/r/f09c4e2c90351d4cb30a1969f7a863b9238bd291.1755812945.git.boris@bur.io Signed-off-by: Boris Burkov Suggested-by: Qu Wenruo Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 ++ include/linux/pagemap.h | 2 ++ mm/filemap.c | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9fa3afc90dd5..e693978b2022 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1059,6 +1059,8 @@ extern int mem_cgroup_init(void); #define MEM_CGROUP_ID_SHIFT 0 +#define root_mem_cgroup (NULL) + static inline struct mem_cgroup *folio_memcg(struct folio *folio) { return NULL; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 12a12dae727d..f0dfdfb13cd9 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -211,6 +211,8 @@ enum mapping_flags { folio contents */ AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM = 9, + AS_KERNEL_FILE = 10, /* mapping for a fake kernel file that shouldn't + account usage to user cgroups */ /* Bits 16-25 are used for FOLIO_ORDER */ AS_FOLIO_ORDER_BITS = 5, AS_FOLIO_ORDER_MIN = 16, diff --git a/mm/filemap.c b/mm/filemap.c index 6e954156bb77..92ea20356f22 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -960,8 +960,14 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, { void *shadow = NULL; int ret; + struct mem_cgroup *tmp; + bool kernel_file = test_bit(AS_KERNEL_FILE, &mapping->flags); + if (kernel_file) + tmp = set_active_memcg(root_mem_cgroup); ret = mem_cgroup_charge(folio, NULL, gfp); + if (kernel_file) + set_active_memcg(tmp); if (ret) return ret; From e3a9ac4e866ea746475b4026819a8c08ec1142e6 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Aug 2025 14:55:36 -0700 Subject: [PATCH 152/372] mm: add vmstat for kernel_file pages Kernel file pages are tricky to track because they are indistinguishable from files whose usage is accounted to the root cgroup. To maintain good accounting, introduce a vmstat counter tracking kernel file pages. Confirmed that these work as expected at a high level by mounting a btrfs using AS_KERNEL_FILE for metadata pages, and seeing the counter rise with fs usage then go back to a minimal level after drop_caches and finally down to 0 after unmounting the fs. 
Link: https://lkml.kernel.org/r/08ff633e3a005ed5f7691bfd9f58a5df8e474339.1755812945.git.boris@bur.io Signed-off-by: Boris Burkov Suggested-by: Shakeel Butt Acked-by: Shakeel Butt Tested-by: syzbot@syzkaller.appspotmail.com Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qu Wenruo Cc: Roman Gushchin Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + mm/filemap.c | 7 +++++++ mm/vmstat.c | 1 + 3 files changed, 9 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fe13ad175fed..f3272ef5131b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -259,6 +259,7 @@ enum node_stat_item { NR_HUGETLB, #endif NR_BALLOON_PAGES, + NR_KERNEL_FILE_PAGES, NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/filemap.c b/mm/filemap.c index 92ea20356f22..cd9387b0a5b5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -190,6 +190,9 @@ static void filemap_unaccount_folio(struct address_space *mapping, __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } + if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags)) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, -nr); /* * At this point folio must be either written or cleaned by @@ -989,6 +992,10 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, if (!(gfp & __GFP_WRITE) && shadow) workingset_refault(folio, shadow); folio_add_lru(folio); + if (kernel_file) + mod_node_page_state(folio_pgdat(folio), + NR_KERNEL_FILE_PAGES, + folio_nr_pages(folio)); } return ret; } diff --git a/mm/vmstat.c b/mm/vmstat.c index e74f0b2a1021..e522decf6a72 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1290,6 +1290,7 @@ const char * const vmstat_text[] = { [I(NR_HUGETLB)] = "nr_hugetlb", #endif [I(NR_BALLOON_PAGES)] = "nr_balloon_pages", + [I(NR_KERNEL_FILE_PAGES)] = "nr_kernel_file_pages", #undef I /* system-wide enum vm_stat_item counters */ From b55102826d7d3d41a5777931689c746207308c95 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 21 Aug 2025 14:55:37 -0700 Subject: [PATCH 153/372] btrfs: set AS_KERNEL_FILE on the btree_inode extent_buffers are global and shared so their pages should not belong to any particular cgroup (currently whichever cgroups happens to allocate the extent_buffer). Btrfs tree operations should not arbitrarily block on cgroup reclaim or have the shared extent_buffer pages on a cgroup's reclaim lists. 
Link: https://lkml.kernel.org/r/2ee99832619a3fdfe80bf4dc9760278662d2d746.1755812945.git.boris@bur.io Signed-off-by: Boris Burkov Acked-by: Shakeel Butt Tested-by: syzbot@syzkaller.appspotmail.com Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qu Wenruo Cc: Roman Gushchin Signed-off-by: Andrew Morton --- fs/btrfs/disk-io.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 70fc4e7cc5a0..7fab5057cf8e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1930,6 +1930,7 @@ static int btrfs_init_btree_inode(struct super_block *sb) BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); __insert_inode_hash(inode, hash); + set_bit(AS_KERNEL_FILE, &inode->i_mapping->flags); fs_info->btree_inode = inode; return 0; From dd3b304b9410e5d99f7d35e8b0b998f4446c5191 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 27 Aug 2025 07:01:04 +0000 Subject: [PATCH 154/372] mm/page_alloc: use xxx_pageblock_isolate() for better reading Patch series "mm/pageblock: improve readability of some pageblock handling", v3. While reading the code, I found two possible points to improve the readability of pageblock handling. Patch 1: the isolate bit is standalone and there are dedicated helpers. Instead of checking the bit directly, we can use the helpers. Patch 2: remove PB_migratetype_bits and PB_migrate_end to reduce magical computation. This patch (of 2): Commit e904bce2d9d4 ("mm/page_isolation: make page isolation a standalone bit") provides dedicated helpers for handling isolation. Use these helpers for better readability. No functional change intended. Link: https://lkml.kernel.org/r/20250827070105.16864-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20250827070105.16864-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3e37922635d5..c524b80a3b72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -538,8 +538,7 @@ static void set_pageblock_migratetype(struct page *page, "Use set_pageblock_isolate() for pageblock isolation"); return; } - VM_WARN_ONCE(get_pfnblock_bit(page, page_to_pfn(page), - PB_migrate_isolate), + VM_WARN_ONCE(get_pageblock_isolate(page), "Use clear_pageblock_isolate() to unisolate pageblock"); /* MIGRATETYPE_AND_ISO_MASK clears PB_migrate_isolate if it is set */ #endif @@ -2058,9 +2057,9 @@ static unsigned long find_large_buddy(unsigned long start_pfn) static inline void toggle_pageblock_isolate(struct page *page, bool isolate) { if (isolate) - set_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); + set_pageblock_isolate(page); else - clear_pfnblock_bit(page, page_to_pfn(page), PB_migrate_isolate); + clear_pageblock_isolate(page); } /** From 98c94f1035fc0c82ab008854a165df2c20c0cb6a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 27 Aug 2025 07:01:05 +0000 Subject: [PATCH 155/372] mm/pageblock-flags: remove PB_migratetype_bits/PB_migrate_end enum pageblock_bits defines the meaning of pageblock bits. Currently PB_migratetype_bits says the lowest 3 bits represent the migratetype, and the definitions of PB_migrate_end/MIGRATETYPE_MASK rely on it through magical computation. Remove the definition of PB_migratetype_bits/PB_migrate_end.
Use PB_migrate_[0|1|2] to represent the lowest bits for the migratetype. Then we can simplify the related definitions. Also, MIGRATETYPE_AND_ISO_MASK is MIGRATETYPE_MASK plus the isolation bit. Using MIGRATETYPE_MASK in the definition of MIGRATETYPE_AND_ISO_MASK looks cleaner. No functional change intended. Link: https://lkml.kernel.org/r/20250827070105.16864-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Reviewed-by: Zi Yan Cc: Vlastimil Babka Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 12 +++++------- mm/page_alloc.c | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 6a44be0f39f4..e046278a01fa 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -13,12 +13,11 @@ #include -#define PB_migratetype_bits 3 /* Bit indices that affect a whole block of pages */ enum pageblock_bits { - PB_migrate, - PB_migrate_end = PB_migrate + PB_migratetype_bits - 1, - /* 3 bits required for migrate types */ + PB_migrate_0, + PB_migrate_1, + PB_migrate_2, PB_compact_skip,/* If set the block is skipped by compaction */ #ifdef CONFIG_MEMORY_ISOLATION @@ -37,11 +36,10 @@ enum pageblock_bits { #define NR_PAGEBLOCK_BITS (roundup_pow_of_two(__NR_PAGEBLOCK_BITS)) -#define MIGRATETYPE_MASK ((1UL << (PB_migrate_end + 1)) - 1) +#define MIGRATETYPE_MASK (BIT(PB_migrate_0)|BIT(PB_migrate_1)|BIT(PB_migrate_2)) #ifdef CONFIG_MEMORY_ISOLATION -#define MIGRATETYPE_AND_ISO_MASK \ - (((1UL << (PB_migrate_end + 1)) - 1) | BIT(PB_migrate_isolate)) +#define MIGRATETYPE_AND_ISO_MASK (MIGRATETYPE_MASK | BIT(PB_migrate_isolate)) #else #define MIGRATETYPE_AND_ISO_MASK MIGRATETYPE_MASK #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c524b80a3b72..2df6ee6998ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -355,7 +355,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) static __always_inline bool is_standalone_pb_bit(enum pageblock_bits pb_bit) { - return pb_bit > PB_migrate_end && pb_bit < __NR_PAGEBLOCK_BITS; + return pb_bit >= PB_compact_skip && pb_bit < __NR_PAGEBLOCK_BITS; } static __always_inline void @@ -370,7 +370,7 @@ get_pfnblock_bitmap_bitidx(const struct page *page, unsigned long pfn, #else BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); #endif - BUILD_BUG_ON(__MIGRATE_TYPE_END >= (1 << PB_migratetype_bits)); + BUILD_BUG_ON(__MIGRATE_TYPE_END > MIGRATETYPE_MASK); VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); bitmap = get_pageblock_bitmap(page, pfn); From 09a616cbb371e6b843e536f00e38d6b43d796ac4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:32 -0700 Subject: [PATCH 156/372] mm/damon/core: add damon_ctx->addr_unit Patch series "mm/damon: support ARM32 with LPAE", v3. Previously, DAMON's physical address space monitoring only supported memory ranges below 4GB on LPAE-enabled systems. This was due to the use of 'unsigned long' in 'struct damon_addr_range', which is 32-bit on ARM32 even with LPAE enabled[1]. To add DAMON support for ARM32 with LPAE enabled, a new core layer parameter called 'addr_unit' was introduced[2]. The operations set layer can translate a core layer address to the real address by multiplying the core layer address by the parameter value. Support of the parameter is up to each operations layer implementation, though. For example, operations set implementations for virtual address space can simply ignore the parameter.
Add the support to paddr, which is the DAMON operations set implementation for the physical address space, as we have a clear use case for that. This patch (of 11): In some cases, some of the real addresses that are handled by the underlying operations set cannot be handled by DAMON, since it uses only 'unsigned long' as the address type. Using DAMON for physical address space monitoring of 32 bit ARM devices with large physical address extension (LPAE) is one example[1]. Add a parameter named 'addr_unit' to the core layer to help such cases. DAMON core API callers can set it as the scale factor that will be used by the operations set for translating the core layer's addresses to the real addresses, by multiplying the core layer address by the parameter value. Support of the parameter is up to each operations set layer. The support from the physical address space operations set (paddr) will be added with the following commits. Link: https://lkml.kernel.org/r/20250828171242.59810-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250828171242.59810-2-sj@kernel.org Link: https://lore.kernel.org/20250408075553.959388-1-zuoze1@huawei.com [1] Link: https://lore.kernel.org/all/20250416042551.158131-1-sj@kernel.org/ [2] Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index d01bfee80bd6..6fa52f7495d9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -746,7 +746,7 @@ struct damon_attrs { * Accesses to other fields must be protected by themselves. * * @ops: Set of monitoring operations for given use cases. - * + * @addr_unit: Scale factor for core to ops address conversion. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ @@ -788,6 +788,7 @@ struct damon_ctx { struct mutex kdamond_lock; struct damon_operations ops; + unsigned long addr_unit; struct list_head adaptive_targets; struct list_head schemes; diff --git a/mm/damon/core.c b/mm/damon/core.c index 52ecc3a4426f..e098389a2f73 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -544,6 +544,8 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.min_nr_regions = 10; ctx->attrs.max_nr_regions = 1000; + ctx->addr_unit = 1; + INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -1245,6 +1247,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) return err; } dst->ops = src->ops; + dst->addr_unit = src->addr_unit; return 0; } From d8096848e73e3740bd3ff1a96f7fb3c6a5578882 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:33 -0700 Subject: [PATCH 157/372] mm/damon/paddr: support addr_unit for access monitoring Add support of the addr_unit parameter for the access monitoring operations of paddr.
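As a worked illustration of the conversion (the numbers here are hypothetical, not from the patch): on ARM32 with LPAE, a caller could set addr_unit to 16, so that the 32-bit core layer address range [0, 0xffffffff] covers the physical range up to 0xffffffff * 16, i.e. roughly 64GB (36 bits). The cast in the helper added below, (phys_addr_t)addr * addr_unit, is what keeps that multiplication from overflowing the 32-bit type.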
Link: https://lkml.kernel.org/r/20250828171242.59810-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 0b67d9321460..d497373c2bd2 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -18,7 +18,13 @@ #include "../internal.h" #include "ops-common.h" -static void damon_pa_mkold(unsigned long paddr) +static phys_addr_t damon_pa_phys_addr( + unsigned long addr, unsigned long addr_unit) +{ + return (phys_addr_t)addr * addr_unit; +} + +static void damon_pa_mkold(phys_addr_t paddr) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); @@ -29,11 +35,12 @@ static void damon_pa_mkold(unsigned long paddr) folio_put(folio); } -static void __damon_pa_prepare_access_check(struct damon_region *r) +static void __damon_pa_prepare_access_check(struct damon_region *r, + unsigned long addr_unit) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); - damon_pa_mkold(r->sampling_addr); + damon_pa_mkold(damon_pa_phys_addr(r->sampling_addr, addr_unit)); } static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) @@ -43,11 +50,11 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(r); + __damon_pa_prepare_access_check(r, ctx->addr_unit); } } -static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) +static bool damon_pa_young(phys_addr_t paddr, unsigned long *folio_sz) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); bool accessed; @@ -62,23 +69,25 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz) } static void __damon_pa_check_access(struct damon_region *r, - struct damon_attrs *attrs) + struct damon_attrs *attrs, unsigned long addr_unit) { - static unsigned long last_addr; + static phys_addr_t last_addr; static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; + phys_addr_t sampling_addr = damon_pa_phys_addr( + r->sampling_addr, addr_unit); /* If the region is in the last checked page, reuse the result */ if (ALIGN_DOWN(last_addr, last_folio_sz) == - ALIGN_DOWN(r->sampling_addr, last_folio_sz)) { + ALIGN_DOWN(sampling_addr, last_folio_sz)) { damon_update_region_access_rate(r, last_accessed, attrs); return; } - last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz); + last_accessed = damon_pa_young(sampling_addr, &last_folio_sz); damon_update_region_access_rate(r, last_accessed, attrs); - last_addr = r->sampling_addr; + last_addr = sampling_addr; } static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) @@ -89,7 +98,8 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(r, &ctx->attrs); + __damon_pa_check_access( + r, &ctx->attrs, ctx->addr_unit); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } From 85246435b249b852bd8a1b1e5913a157e6d12f2f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:34 -0700 Subject: [PATCH 158/372] mm/damon/paddr: support addr_unit for DAMOS_PAGEOUT Add support of addr_unit for DAMOS_PAGEOUT action handling from the 
DAMOS operation implementation for the physical address space. Link: https://lkml.kernel.org/r/20250828171242.59810-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index d497373c2bd2..696aeb0f6c8e 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -24,6 +24,19 @@ static phys_addr_t damon_pa_phys_addr( return (phys_addr_t)addr * addr_unit; } +static unsigned long damon_pa_core_addr( + phys_addr_t pa, unsigned long addr_unit) +{ + /* + * Use div_u64() for avoiding linking errors related with __udivdi3, + * __aeabi_uldivmod, or similar problems. This should also improve the + * performance optimization (read div_u64() comment for the detail). + */ + if (sizeof(pa) == 8 && sizeof(addr_unit) == 4) + return div_u64(pa, addr_unit); + return pa / addr_unit; +} + static void damon_pa_mkold(phys_addr_t paddr) { struct folio *folio = damon_get_folio(PHYS_PFN(paddr)); @@ -135,10 +148,11 @@ static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s) return false; } -static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, +static unsigned long damon_pa_pageout(struct damon_region *r, + unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - unsigned long addr, applied; + phys_addr_t addr, applied; LIST_HEAD(folio_list); bool install_young_filter = true; struct damos_filter *filter; @@ -159,8 +173,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, damos_add_filter(s, filter); } - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -170,7 +184,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, if (damos_pa_filter_out(s, folio)) goto put_folio; else - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; folio_clear_referenced(folio); folio_test_clear_young(folio); @@ -189,7 +203,7 @@ put_folio: applied = reclaim_pages(&folio_list); cond_resched(); s->last_applied = folio; - return applied * PAGE_SIZE; + return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } static inline unsigned long damon_pa_mark_accessed_or_deactivate( @@ -302,9 +316,11 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme, unsigned long *sz_filter_passed) { + unsigned long aunit = ctx->addr_unit; + switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pa_pageout(r, scheme, sz_filter_passed); + return damon_pa_pageout(r, aunit, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: return damon_pa_mark_accessed(r, scheme, sz_filter_passed); case DAMOS_LRU_DEPRIO: From 51a1ebd3a295c8c1a75331a97d0c7820606dd056 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:35 -0700 Subject: [PATCH 159/372] mm/damon/paddr: support addr_unit for DAMOS_LRU_[DE]PRIO Add support of addr_unit for DAMOS_LRU_PRIO and DAMOS_LRU_DEPRIO action handling 
from the DAMOS operation implementation for the physical address space. Link: https://lkml.kernel.org/r/20250828171242.59810-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 696aeb0f6c8e..2cd1c9953983 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -207,14 +207,15 @@ put_folio: } static inline unsigned long damon_pa_mark_accessed_or_deactivate( - struct damon_region *r, struct damos *s, bool mark_accessed, + struct damon_region *r, unsigned long addr_unit, + struct damos *s, bool mark_accessed, unsigned long *sz_filter_passed) { - unsigned long addr, applied = 0; + phys_addr_t addr, applied = 0; struct folio *folio; - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -224,7 +225,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( if (damos_pa_filter_out(s, folio)) goto put_folio; else - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; if (mark_accessed) folio_mark_accessed(folio); @@ -236,20 +237,22 @@ put_folio: folio_put(folio); } s->last_applied = folio; - return applied * PAGE_SIZE; + return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } static unsigned long damon_pa_mark_accessed(struct damon_region *r, - struct damos *s, unsigned long *sz_filter_passed) + unsigned long addr_unit, struct damos *s, + unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, true, + return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, true, sz_filter_passed); } static unsigned long damon_pa_deactivate_pages(struct damon_region *r, - struct damos *s, unsigned long *sz_filter_passed) + unsigned long addr_unit, struct damos *s, + unsigned long *sz_filter_passed) { - return damon_pa_mark_accessed_or_deactivate(r, s, false, + return damon_pa_mark_accessed_or_deactivate(r, addr_unit, s, false, sz_filter_passed); } @@ -322,9 +325,11 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, case DAMOS_PAGEOUT: return damon_pa_pageout(r, aunit, scheme, sz_filter_passed); case DAMOS_LRU_PRIO: - return damon_pa_mark_accessed(r, scheme, sz_filter_passed); + return damon_pa_mark_accessed(r, aunit, scheme, + sz_filter_passed); case DAMOS_LRU_DEPRIO: - return damon_pa_deactivate_pages(r, scheme, sz_filter_passed); + return damon_pa_deactivate_pages(r, aunit, scheme, + sz_filter_passed); case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: return damon_pa_migrate(r, scheme, sz_filter_passed); From ec1d5bab0689658b105810aaceff68e7078e4697 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:36 -0700 Subject: [PATCH 160/372] mm/damon/paddr: support addr_unit for MIGRATE_{HOT,COLD} Add support of addr_unit for DAMOS_MIGRATE_HOT and DAMOS_MIGRATE_COLD action handling from the DAMOS operation implementation for the physical address space. 
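As a worked example of the size scaling shared by these action handlers (hypothetical numbers): with addr_unit == 16, a 4096-byte folio that passes the filters contributes folio_size(folio) / addr_unit == 256 to sz_filter_passed, so the statistic stays in the same scaled units as the core layer's addresses and region sizes.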
Link: https://lkml.kernel.org/r/20250828171242.59810-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 2cd1c9953983..0a122a8a9f1c 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -256,15 +256,16 @@ static unsigned long damon_pa_deactivate_pages(struct damon_region *r, sz_filter_passed); } -static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, +static unsigned long damon_pa_migrate(struct damon_region *r, + unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - unsigned long addr, applied; + phys_addr_t addr, applied; LIST_HEAD(folio_list); struct folio *folio; - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -274,7 +275,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, if (damos_pa_filter_out(s, folio)) goto put_folio; else - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; if (!folio_isolate_lru(folio)) goto put_folio; @@ -286,7 +287,7 @@ put_folio: applied = damon_migrate_pages(&folio_list, s->target_nid); cond_resched(); s->last_applied = folio; - return applied * PAGE_SIZE; + return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, @@ -332,7 +333,7 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, sz_filter_passed); case DAMOS_MIGRATE_HOT: case DAMOS_MIGRATE_COLD: - return damon_pa_migrate(r, scheme, sz_filter_passed); + return damon_pa_migrate(r, aunit, scheme, sz_filter_passed); case DAMOS_STAT: return damon_pa_stat(r, scheme, sz_filter_passed); default: From 01e7ee33a0caddc7b86d6ea1ae61090c7d2fe4a6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:37 -0700 Subject: [PATCH 161/372] mm/damon/paddr: support addr_unit for DAMOS_STAT Add support of addr_unit for DAMOS_STAT action handling from the DAMOS operation implementation for the physical address space. 
Link: https://lkml.kernel.org/r/20250828171242.59810-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 0a122a8a9f1c..07a8aead439e 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -290,17 +290,18 @@ put_folio: return damon_pa_core_addr(applied * PAGE_SIZE, addr_unit); } -static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, +static unsigned long damon_pa_stat(struct damon_region *r, + unsigned long addr_unit, struct damos *s, unsigned long *sz_filter_passed) { - unsigned long addr; + phys_addr_t addr; struct folio *folio; if (!damos_ops_has_filter(s)) return 0; - addr = r->ar.start; - while (addr < r->ar.end) { + addr = damon_pa_phys_addr(r->ar.start, addr_unit); + while (addr < damon_pa_phys_addr(r->ar.end, addr_unit)) { folio = damon_get_folio(PHYS_PFN(addr)); if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; @@ -308,7 +309,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, } if (!damos_pa_filter_out(s, folio)) - *sz_filter_passed += folio_size(folio); + *sz_filter_passed += folio_size(folio) / addr_unit; addr += folio_size(folio); folio_put(folio); } @@ -335,7 +336,7 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, case DAMOS_MIGRATE_COLD: return damon_pa_migrate(r, aunit, scheme, sz_filter_passed); case DAMOS_STAT: - return damon_pa_stat(r, scheme, sz_filter_passed); + return damon_pa_stat(r, aunit, scheme, sz_filter_passed); default: /* DAMOS actions that not yet supported by 'paddr'. */ break; From 540a2aebc657b02520a8837778dace8608ec4f05 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:38 -0700 Subject: [PATCH 162/372] mm/damon/sysfs: implement addr_unit file under context dir Only DAMON kernel API callers can use the addr_unit parameter. Implement a sysfs file to let DAMON sysfs ABI users use it. Additionally, addr_unit must be set to a non-zero value.
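A brief usage sketch (the kdamond and context indices are hypothetical): writing e.g. '16' to /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/addr_unit sets the scale factor, writing '0' fails with -EINVAL as enforced by addr_unit_store() below, and the value is copied to damon_ctx->addr_unit when the inputs are applied to the context.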
Link: https://lkml.kernel.org/r/20250828171242.59810-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 6d2b0dab50cb..98bf15d403b2 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -834,6 +834,7 @@ static const struct damon_sysfs_ops_name damon_sysfs_ops_names[] = { struct damon_sysfs_context { struct kobject kobj; enum damon_ops_id ops_id; + unsigned long addr_unit; struct damon_sysfs_attrs *attrs; struct damon_sysfs_targets *targets; struct damon_sysfs_schemes *schemes; @@ -849,6 +850,7 @@ static struct damon_sysfs_context *damon_sysfs_context_alloc( return NULL; context->kobj = (struct kobject){}; context->ops_id = ops_id; + context->addr_unit = 1; return context; } @@ -997,6 +999,32 @@ static ssize_t operations_store(struct kobject *kobj, return -EINVAL; } +static ssize_t addr_unit_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + + return sysfs_emit(buf, "%lu\n", context->addr_unit); +} + +static ssize_t addr_unit_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_context *context = container_of(kobj, + struct damon_sysfs_context, kobj); + unsigned long input_addr_unit; + int err = kstrtoul(buf, 0, &input_addr_unit); + + if (err) + return err; + if (!input_addr_unit) + return -EINVAL; + + context->addr_unit = input_addr_unit; + return count; +} + static void damon_sysfs_context_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_context, kobj)); @@ -1008,9 +1036,13 @@ static struct kobj_attribute damon_sysfs_context_avail_operations_attr = static struct kobj_attribute damon_sysfs_context_operations_attr = __ATTR_RW_MODE(operations, 0600); +static struct kobj_attribute damon_sysfs_context_addr_unit_attr = + __ATTR_RW_MODE(addr_unit, 0600); + static struct attribute *damon_sysfs_context_attrs[] = { &damon_sysfs_context_avail_operations_attr.attr, &damon_sysfs_context_operations_attr.attr, + &damon_sysfs_context_addr_unit_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_context); @@ -1397,6 +1429,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, err = damon_select_ops(ctx, sys_ctx->ops_id); if (err) return err; + ctx->addr_unit = sys_ctx->addr_unit; err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; From 7b06c471afc8d82c9c246532b74c4ad4db2890d2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:39 -0700 Subject: [PATCH 163/372] Docs/mm/damon/design: document 'address unit' parameter Add 'addr_unit' parameter description on DAMON design document. 
Link: https://lkml.kernel.org/r/20250828171242.59810-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 2f6ba5c7f4c7..d9d5baa1ec87 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -67,7 +67,7 @@ processes, NUMA nodes, files, and backing memory devices would be supportable. Also, if some architectures or devices support special optimized access check features, those will be easily configurable. -DAMON currently provides below three operation sets. Below two subsections +DAMON currently provides below three operation sets. Below three subsections describe how those work. - vaddr: Monitor virtual address spaces of specific processes @@ -135,6 +135,18 @@ the interference is the responsibility of sysadmins. However, it solves the conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags, as Idle page tracking does. +Address Unit +------------ + +DAMON core layer uses ``unsigned long`` type for monitoring target address +ranges. In some cases, the address space for a given operations set could be +too large to be handled with the type. ARM (32-bit) with large physical +address extension is an example. For such cases, a per-operations set +parameter called ``address unit`` is provided. It represents the scale factor +that the core layer's address needs to be multiplied by for calculating the real +address on the given address space. Support of the ``address unit`` parameter is +up to each operations set implementation. ``paddr`` is the only operations set +implementation that supports the parameter. .. _damon_core_logic: From e0c725455fd5a042a50f029c24738ee60b5e1516 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:40 -0700 Subject: [PATCH 164/372] Docs/admin-guide/mm/damon/usage: document addr_unit file Document the addr_unit DAMON sysfs file in the DAMON usage document. Link: https://lkml.kernel.org/r/20250828171242.59810-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 11 +++++++---- Documentation/mm/damon/design.rst | 2 ++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index ff3a2dda1f02..2cae60b6f3ca 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -61,7 +61,7 @@ comma (","). │ :ref:`kdamonds `/nr_kdamonds │ │ :ref:`0 `/state,pid,refresh_ms │ │ │ :ref:`contexts `/nr_contexts - │ │ │ │ :ref:`0 `/avail_operations,operations + │ │ │ │ :ref:`0 `/avail_operations,operations,addr_unit │ │ │ │ │ :ref:`monitoring_attrs `/ │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us │ │ │ │ │ │ │ intervals_goal/access_bp,aggrs,min_sample_us,max_sample_us @@ -188,9 +188,9 @@ details).
At the moment, only one context per kdamond is supported, so only contexts// ------------- In each context directory, two files (``avail_operations`` and ``operations``) and three directories (``monitoring_attrs``, ``targets``, and ``schemes``) exist. -In each context directory, three files (``avail_operations``, ``operations`` and ``addr_unit``) and three directories (``monitoring_attrs``, ``targets``, and ``schemes``) exist. +In each context directory, three files (``avail_operations``, ``operations`` +and ``addr_unit``) and three directories (``monitoring_attrs``, ``targets``, +and ``schemes``) exist. DAMON supports multiple types of :ref:`monitoring operations `, including those for virtual address @@ -205,6 +205,9 @@ You can set and get what type of monitoring operations DAMON will use for the context by writing one of the keywords listed in ``avail_operations`` file and reading from the ``operations`` file. +``addr_unit`` file is for setting and getting the :ref:`address unit +` parameter of the operations set. + .. _sysfs_monitoring_attrs: contexts//monitoring_attrs/ diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index d9d5baa1ec87..80354f4f42ba 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -135,6 +135,8 @@ the interference is the responsibility of sysadmins. However, it solves the conflict with the reclaim logic using ``PG_idle`` and ``PG_young`` page flags, as Idle page tracking does. +.. _damon_design_addr_unit: + Address Unit ------------ From 56cd19404abae2c17d1e6d1e3c7d23dd79b75039 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 28 Aug 2025 10:12:41 -0700 Subject: [PATCH 165/372] Docs/ABI/damon: document addr_unit file Document the addr_unit DAMON sysfs file in the DAMON ABI document. Link: https://lkml.kernel.org/r/20250828171242.59810-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 6791d879759e..b6b71db36ca7 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -77,6 +77,13 @@ Description: Writing a keyword for a monitoring operations set ('vaddr' for Note that only the operations sets that listed in 'avail_operations' file are valid inputs. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//addr_unit +Date: Aug 2025 +Contact: SeongJae Park +Description: Writing an integer to this file sets the 'address unit' + parameter of the given operations set of the context. Reading + the file returns the last-written 'address unit' value. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/sample_us Date: Mar 2022 Contact: SeongJae Park From d8f867fa0825fb3e358457566d7326d8aab2406a Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Thu, 28 Aug 2025 10:12:42 -0700 Subject: [PATCH 166/372] mm/damon: add damon_ctx->min_sz_region Adopting addr_unit would make DAMON_MIN_REGION 'addr_unit * 4096' bytes and cause data alignment issues[1]. Add damon_ctx->min_sz_region to change DAMON_MIN_REGION from a global macro value to a per-context variable.
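As a worked example (assuming the usual DAMON_MIN_REGION of PAGE_SIZE, 4096 bytes): with addr_unit == 16, the sysfs code later in this patch sets min_sz_region = max(4096 / 16, 1) = 256, so a minimum region of 256 core-layer address units still corresponds to exactly 4096 bytes of real address space, avoiding the misalignment that a fixed 4096-unit minimum would cause.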
Link: https://lkml.kernel.org/r/20250828171242.59810-12-sj@kernel.org Link: https://lore.kernel.org/all/527714dd-0e33-43ab-bbbd-d89670ba79e7@huawei.com [1] Signed-off-by: Quanmin Yan Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: ze zuo Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 ++- mm/damon/core.c | 67 ++++++++++++++++++++---------------- mm/damon/sysfs.c | 8 +++-- mm/damon/tests/core-kunit.h | 21 ++++++----- mm/damon/tests/vaddr-kunit.h | 2 +- mm/damon/vaddr.c | 2 +- 6 files changed, 61 insertions(+), 43 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 6fa52f7495d9..ec8716292c09 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -747,6 +747,7 @@ struct damon_attrs { * * @ops: Set of monitoring operations for given use cases. * @addr_unit: Scale factor for core to ops address conversion. + * @min_sz_region: Minimum region size. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ @@ -789,6 +790,7 @@ struct damon_ctx { struct damon_operations ops; unsigned long addr_unit; + unsigned long min_sz_region; struct list_head adaptive_targets; struct list_head schemes; @@ -877,7 +879,7 @@ static inline void damon_insert_region(struct damon_region *r, void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges); + unsigned int nr_ranges, unsigned long min_sz_region); void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); diff --git a/mm/damon/core.c b/mm/damon/core.c index e098389a2f73..12f136a121a2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -201,6 +201,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * @t: the given target. * @ranges: array of new monitoring target ranges. * @nr_ranges: length of @ranges. + * @min_sz_region: minimum region size. * * This function adds new regions to, or modify existing regions of a * monitoring target to fit in specific ranges. @@ -208,7 +209,7 @@ static int damon_fill_regions_holes(struct damon_region *first, * Return: 0 if success, or negative error code otherwise. 
*/ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, - unsigned int nr_ranges) + unsigned int nr_ranges, unsigned long min_sz_region) { struct damon_region *r, *next; unsigned int i; @@ -245,16 +246,16 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, /* no region intersects with this range */ newr = damon_new_region( ALIGN_DOWN(range->start, - DAMON_MIN_REGION), - ALIGN(range->end, DAMON_MIN_REGION)); + min_sz_region), + ALIGN(range->end, min_sz_region)); if (!newr) return -ENOMEM; damon_insert_region(newr, damon_prev_region(r), r, t); } else { /* resize intersecting regions to fit in this range */ first->ar.start = ALIGN_DOWN(range->start, - DAMON_MIN_REGION); - last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + min_sz_region); + last->ar.end = ALIGN(range->end, min_sz_region); /* fill possible holes in the range */ err = damon_fill_regions_holes(first, last, t); @@ -545,6 +546,7 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.max_nr_regions = 1000; ctx->addr_unit = 1; + ctx->min_sz_region = DAMON_MIN_REGION; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -1127,8 +1129,8 @@ static struct damon_target *damon_nth_target(int n, struct damon_ctx *ctx) * * If @src has no region, @dst keeps current regions. */ -static int damon_commit_target_regions( - struct damon_target *dst, struct damon_target *src) +static int damon_commit_target_regions(struct damon_target *dst, + struct damon_target *src, unsigned long src_min_sz_region) { struct damon_region *src_region; struct damon_addr_range *ranges; @@ -1145,18 +1147,19 @@ static int damon_commit_target_regions( i = 0; damon_for_each_region(src_region, src) ranges[i++] = src_region->ar; - err = damon_set_regions(dst, ranges, i); + err = damon_set_regions(dst, ranges, i, src_min_sz_region); kfree(ranges); return err; } static int damon_commit_target( struct damon_target *dst, bool dst_has_pid, - struct damon_target *src, bool src_has_pid) + struct damon_target *src, bool src_has_pid, + unsigned long src_min_sz_region) { int err; - err = damon_commit_target_regions(dst, src); + err = damon_commit_target_regions(dst, src, src_min_sz_region); if (err) return err; if (dst_has_pid) @@ -1178,7 +1181,8 @@ static int damon_commit_targets( if (src_target) { err = damon_commit_target( dst_target, damon_target_has_pid(dst), - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) return err; } else { @@ -1201,7 +1205,8 @@ static int damon_commit_targets( if (!new_target) return -ENOMEM; err = damon_commit_target(new_target, false, - src_target, damon_target_has_pid(src)); + src_target, damon_target_has_pid(src), + src->min_sz_region); if (err) { damon_destroy_target(new_target, NULL); return err; @@ -1248,6 +1253,7 @@ int damon_commit_ctx(struct damon_ctx *dst, struct damon_ctx *src) } dst->ops = src->ops; dst->addr_unit = src->addr_unit; + dst->min_sz_region = src->min_sz_region; return 0; } @@ -1280,8 +1286,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) if (ctx->attrs.min_nr_regions) sz /= ctx->attrs.min_nr_regions; - if (sz < DAMON_MIN_REGION) - sz = DAMON_MIN_REGION; + if (sz < ctx->min_sz_region) + sz = ctx->min_sz_region; return sz; } @@ -1625,6 +1631,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * @t: The target of the region. * @rp: The pointer to the region. * @s: The scheme to be applied. + * @min_sz_region: minimum region size. 
* * If a quota of a scheme has exceeded in a quota charge window, the scheme's * action would applied to only a part of the target access pattern fulfilling @@ -1642,7 +1649,7 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, * Return: true if the region should be entirely skipped, false otherwise. */ static bool damos_skip_charged_region(struct damon_target *t, - struct damon_region **rp, struct damos *s) + struct damon_region **rp, struct damos *s, unsigned long min_sz_region) { struct damon_region *r = *rp; struct damos_quota *quota = &s->quota; @@ -1664,11 +1671,11 @@ static bool damos_skip_charged_region(struct damon_target *t, if (quota->charge_addr_from && r->ar.start < quota->charge_addr_from) { sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); + r->ar.start, min_sz_region); if (!sz_to_skip) { - if (damon_sz_region(r) <= DAMON_MIN_REGION) + if (damon_sz_region(r) <= min_sz_region) return true; - sz_to_skip = DAMON_MIN_REGION; + sz_to_skip = min_sz_region; } damon_split_region_at(t, r, sz_to_skip); r = damon_next_region(r); @@ -1693,7 +1700,8 @@ static void damos_update_stat(struct damos *s, } static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos_filter *filter) + struct damon_region *r, struct damos_filter *filter, + unsigned long min_sz_region) { bool matched = false; struct damon_target *ti; @@ -1710,8 +1718,8 @@ static bool damos_filter_match(struct damon_ctx *ctx, struct damon_target *t, matched = target_idx == filter->target_idx; break; case DAMOS_FILTER_TYPE_ADDR: - start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION); - end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION); + start = ALIGN_DOWN(filter->addr_range.start, min_sz_region); + end = ALIGN_DOWN(filter->addr_range.end, min_sz_region); /* inside the range */ if (start <= r->ar.start && r->ar.end <= end) { @@ -1747,7 +1755,7 @@ static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, s->core_filters_allowed = false; damos_for_each_filter(filter, s) { - if (damos_filter_match(ctx, t, r, filter)) { + if (damos_filter_match(ctx, t, r, filter, ctx->min_sz_region)) { if (filter->allow) s->core_filters_allowed = true; return !filter->allow; @@ -1882,7 +1890,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); + c->min_sz_region); if (!sz) goto update_stat; damon_split_region_at(t, r, sz); @@ -1930,7 +1938,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - if (damos_skip_charged_region(t, &r, s)) + if (damos_skip_charged_region(t, &r, s, c->min_sz_region)) continue; if (!damos_valid_target(c, t, r, s)) @@ -2324,7 +2332,8 @@ static void damon_split_region_at(struct damon_target *t, } /* Split every region in the given target into 'nr_subs' regions */ -static void damon_split_regions_of(struct damon_target *t, int nr_subs) +static void damon_split_regions_of(struct damon_target *t, int nr_subs, + unsigned long min_sz_region) { struct damon_region *r, *next; unsigned long sz_region, sz_sub = 0; @@ -2334,13 +2343,13 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs) sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && - sz_region > 2 * DAMON_MIN_REGION; i++) { + sz_region > 2 * 
min_sz_region; i++) { /* * Randomly select size of left sub-region to be at * least 10 percent and at most 90% of original region */ sz_sub = ALIGN_DOWN(damon_rand(1, 10) * - sz_region / 10, DAMON_MIN_REGION); + sz_region / 10, min_sz_region); /* Do not allow blank region */ if (sz_sub == 0 || sz_sub >= sz_region) continue; @@ -2380,7 +2389,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) nr_subregions = 3; damon_for_each_target(t, ctx) - damon_split_regions_of(t, nr_subregions); + damon_split_regions_of(t, nr_subregions, ctx->min_sz_region); last_nr_regions = nr_regions; } @@ -2769,7 +2778,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, addr_range.start = *start; addr_range.end = *end; - return damon_set_regions(t, &addr_range, 1); + return damon_set_regions(t, &addr_range, 1, DAMON_MIN_REGION); } /* diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 98bf15d403b2..0ed404c89f80 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1329,7 +1329,8 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, } static int damon_sysfs_set_regions(struct damon_target *t, - struct damon_sysfs_regions *sysfs_regions) + struct damon_sysfs_regions *sysfs_regions, + unsigned long min_sz_region) { struct damon_addr_range *ranges = kmalloc_array(sysfs_regions->nr, sizeof(*ranges), GFP_KERNEL | __GFP_NOWARN); @@ -1351,7 +1352,7 @@ static int damon_sysfs_set_regions(struct damon_target *t, if (ranges[i - 1].end > ranges[i].start) goto out; } - err = damon_set_regions(t, ranges, sysfs_regions->nr); + err = damon_set_regions(t, ranges, sysfs_regions->nr, min_sz_region); out: kfree(ranges); return err; @@ -1372,7 +1373,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target, /* caller will destroy targets */ return -EINVAL; } - return damon_sysfs_set_regions(t, sys_target->regions); + return damon_sysfs_set_regions(t, sys_target->regions, ctx->min_sz_region); } static int damon_sysfs_add_targets(struct damon_ctx *ctx, @@ -1430,6 +1431,7 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, if (err) return err; ctx->addr_unit = sys_ctx->addr_unit; + ctx->min_sz_region = max(DAMON_MIN_REGION / sys_ctx->addr_unit, 1); err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 5f5dc9db2e90..51369e35298b 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -230,14 +230,14 @@ static void damon_test_split_regions_of(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 22); damon_add_region(r, t); - damon_split_regions_of(t, 2); + damon_split_regions_of(t, 2, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u); damon_free_target(t); t = damon_new_target(); r = damon_new_region(0, 220); damon_add_region(r, t); - damon_split_regions_of(t, 4); + damon_split_regions_of(t, 4, DAMON_MIN_REGION); KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u); damon_free_target(t); damon_destroy_ctx(c); @@ -303,7 +303,7 @@ static void damon_test_set_regions(struct kunit *test) damon_add_region(r1, t); damon_add_region(r2, t); - damon_set_regions(t, &range, 1); + damon_set_regions(t, &range, 1, DAMON_MIN_REGION); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); damon_for_each_region(r, t) { @@ -450,25 +450,29 @@ static void damos_test_filter_out(struct kunit *test) damon_add_region(r, t); /* region in the range */ - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, 
t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 2; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region after the range */ r->ar.start = DAMON_MIN_REGION * 6; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); /* region started before the range */ r->ar.start = DAMON_MIN_REGION * 1; r->ar.end = DAMON_MIN_REGION * 4; - KUNIT_EXPECT_FALSE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_FALSE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); @@ -481,7 +485,8 @@ static void damos_test_filter_out(struct kunit *test) /* region started in the range */ r->ar.start = DAMON_MIN_REGION * 2; r->ar.end = DAMON_MIN_REGION * 8; - KUNIT_EXPECT_TRUE(test, damos_filter_match(NULL, t, r, f)); + KUNIT_EXPECT_TRUE(test, + damos_filter_match(NULL, t, r, f, DAMON_MIN_REGION)); /* filter should have split the region */ KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index d2b37ccf2cc0..fce38dd53cf8 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -141,7 +141,7 @@ static void damon_do_test_apply_three_regions(struct kunit *test, damon_add_region(r, t); } - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); for (i = 0; i < nr_expected / 2; i++) { r = __nth_region_of(t, i); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 66ef9869eafe..8c048f9b129e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -299,7 +299,7 @@ static void damon_va_update(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { if (damon_va_three_regions(t, three_regions)) continue; - damon_set_regions(t, three_regions, 3); + damon_set_regions(t, three_regions, 3, DAMON_MIN_REGION); } } From 1e332f303ae93ba4d38b480b1bb5a08f833306f6 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 28 Aug 2025 15:03:11 +0200 Subject: [PATCH 167/372] pagevec.h: add `const` to pointer parameters of getter functions For improved const-correctness. 
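A minimal sketch of what the constification enables (the helper below is hypothetical, not part of the patch):

	/* Read-only helpers can now accept a const batch. */
	static bool fbatch_is_full(const struct folio_batch *fbatch)
	{
		return folio_batch_space(fbatch) == 0;
	}

Before this change, passing the const pointer to folio_batch_space() would have dropped the qualifier and triggered a compiler warning.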
Link: https://lkml.kernel.org/r/20250828130311.772993-1-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: SeongJae Park Reviewed-by: Vishal Moola (Oracle) Cc: David Hildenbrand Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagevec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 5d3a0cccc6bf..63be5a451627 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -51,12 +51,12 @@ static inline void folio_batch_reinit(struct folio_batch *fbatch) fbatch->i = 0; } -static inline unsigned int folio_batch_count(struct folio_batch *fbatch) +static inline unsigned int folio_batch_count(const struct folio_batch *fbatch) { return fbatch->nr; } -static inline unsigned int folio_batch_space(struct folio_batch *fbatch) +static inline unsigned int folio_batch_space(const struct folio_batch *fbatch) { return PAGEVEC_SIZE - fbatch->nr; } From ff0db419b278256a45db9bf6bd3f9a9a2c22b762 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 28 Aug 2025 12:27:58 +0000 Subject: [PATCH 168/372] tools/include: implement a couple of atomic_t ops Patch series "tools: testing: Use existing atomic.h for vma/maple tests", v2. De-duplicating this lets us delete a bit of code. Ulterior motive: I'm working on a new set of the userspace-based unit tests, which will need the atomics API too. That would involve even more duplication, so while the win in this patchset alone is very minimal, it looks a lot more significant with my other WIP patchset. I've tested these commands: make -C tools/testing/vma -j tools/testing/vma/vma make -C tools/testing/radix-tree -j tools/testing/radix-tree/maple Note the EXTRA_CFLAGS patch is actually orthogonal, let me know if you'd prefer I send it separately. This patch (of 4): The VMA tests need an operation equivalent to atomic_inc_unless_negative() to implement a fake mapping_map_writable(). Adding it will enable them to switch to the shared atomic headers and simplify that fake implementation. In order to add that, also add atomic_try_cmpxchg() which can be used to implement it. This is copied from Documentation/atomic_t.txt. Then, implement atomic_inc_unless_negative() itself based on the raw_atomic_dec_unless_positive() in include/linux/atomic/atomic-arch-fallback.h. There's no present need for a highly-optimised version of this (nor any reason to think this implementation is sub-optimal on x86) so just implement this with generic C, no x86-specifics. 
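A short usage sketch of the new helper (the function below is hypothetical; the real user arrives later in this series as the fake mapping_map_writable()):

	/* Increment a refcount that can be "sealed" by making it negative. */
	static int try_get_writable(atomic_t *writable)
	{
		if (!atomic_inc_unless_negative(writable))
			return -EPERM;	/* sealed, no new writers */
		return 0;		/* count incremented */
	}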
Link: https://lkml.kernel.org/r/20250828-b4-vma-no-atomic-h-v2-0-02d146a58ed2@google.com Link: https://lkml.kernel.org/r/20250828-b4-vma-no-atomic-h-v2-1-02d146a58ed2@google.com Signed-off-by: Brendan Jackman Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/include/linux/atomic.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/include/linux/atomic.h b/tools/include/linux/atomic.h index 01907b33537e..50c66ba9ada5 100644 --- a/tools/include/linux/atomic.h +++ b/tools/include/linux/atomic.h @@ -12,4 +12,26 @@ void atomic_long_set(atomic_long_t *v, long i); #define atomic_cmpxchg_release atomic_cmpxchg #endif /* atomic_cmpxchg_relaxed */ +static inline bool atomic_try_cmpxchg(atomic_t *ptr, int *oldp, int new) +{ + int ret, old = *oldp; + + ret = atomic_cmpxchg(ptr, old, new); + if (ret != old) + *oldp = ret; + return ret == old; +} + +static inline bool atomic_inc_unless_negative(atomic_t *v) +{ + int c = atomic_read(v); + + do { + if (unlikely(c < 0)) + return false; + } while (!atomic_try_cmpxchg(v, &c, c + 1)); + + return true; +} + #endif /* __TOOLS_LINUX_ATOMIC_H */ From d794cd23dc81f4e61eb166a214ea22a5b6a09d02 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 28 Aug 2025 12:27:59 +0000 Subject: [PATCH 169/372] tools: testing: allow importing arch headers in shared.mk There is an arch/ tree under tools. This contains some useful stuff; to make that available, add it to the -I flags. This requires $(SRCARCH), which is provided by Makefile.arch, so include that. There still aren't that many headers so also just smush all of them into SHARED_DEPS instead of starting to do any header dependency hocus pocus. Link: https://lkml.kernel.org/r/20250828-b4-vma-no-atomic-h-v2-2-02d146a58ed2@google.com Signed-off-by: Brendan Jackman Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: Pedro Falcato Cc: Jann Horn Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/shared/shared.mk | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk index 923ee2492256..937aaa762332 100644 --- a/tools/testing/shared/shared.mk +++ b/tools/testing/shared/shared.mk @@ -1,6 +1,8 @@ # SPDX-License-Identifier: GPL-2.0 +include ../../scripts/Makefile.arch -CFLAGS += -I../shared -I. -I../../include -I../../../lib -g -Og -Wall \ -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined +CFLAGS += -I../shared -I. -I../../include -I../../arch/$(SRCARCH)/include \ + -I../../../lib -g -Og -Wall \ -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS += -lpthread -lurcu @@ -11,6 +13,7 @@ SHARED_DEPS = Makefile ../shared/shared.mk ../shared/*.h generated/map-shift.h \ generated/bit-length.h generated/autoconf.h \ ../../include/linux/*.h \ ../../include/asm/*.h \ + ../../arch/$(SRCARCH)/include/asm/*.h \ ../../../include/linux/xarray.h \ ../../../include/linux/maple_tree.h \ ../../../include/linux/radix-tree.h \ From 953dad21bb691b77265bf5a9acdec5769596b8e0 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 28 Aug 2025 12:28:00 +0000 Subject: [PATCH 170/372] tools: testing: support EXTRA_CFLAGS in shared.mk This allows the user to set cflags when building tests that use this shared build infrastructure.
For example, it enables building with -Werror so that patch-check scripts will fail: make -C tools/testing/vma -j EXTRA_CFLAGS=-Werror Link: https://lkml.kernel.org/r/20250828-b4-vma-no-atomic-h-v2-3-02d146a58ed2@google.com Signed-off-by: Brendan Jackman Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: Pedro Falcato Cc: Jann Horn Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/shared/shared.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk index 937aaa762332..5bcdf26c8a9d 100644 --- a/tools/testing/shared/shared.mk +++ b/tools/testing/shared/shared.mk @@ -4,6 +4,7 @@ include ../../scripts/Makefile.arch CFLAGS += -I../shared -I. -I../../include -I../../arch/$(SRCARCH)/include \ -I../../../lib -g -Og -Wall \ -D_LGPL_SOURCE -fsanitize=address -fsanitize=undefined +CFLAGS += $(EXTRA_CFLAGS) LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS += -lpthread -lurcu LIBS := slab.o find_bit.o bitmap.o hweight.o vsprintf.o From c66ae64401d148733ff564488160b58a659b80a5 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 28 Aug 2025 12:28:01 +0000 Subject: [PATCH 171/372] tools: testing: use existing atomic.h for vma/maple tests The shared userspace logic used for unit-testing maple tree and VMA code currently has its own replacements for atomics helpers. This is not needed as the necessary APIs already have userspace implementations in the tools tree. Switching over to that allows deleting a bit of code. Note that the implementation is different; while the version being deleted here is implemented using liburcu, the existing version in tools uses either x86 asm or compiler builtins. It's assumed that both are equally likely to be correct. The tools tree's version of atomic_t is a struct type while the version being deleted was just a typedef of an integer. This means it's no longer valid to call __sync_bool_compare_and_swap() directly on it. One option would be to just peek into the struct and call it on the field, but it seems a little cleaner to just use the corresponding atomic.h API which has been added recently. Now the fake mapping_map_writable() is copied from the real one.
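A small sketch of the type issue (illustrative only): with the old 'typedef int32_t atomic_t', code could call __sync_bool_compare_and_swap(&v, old, new) directly, but the tools tree's 'struct { int counter; } atomic_t' makes &v a struct pointer, so that call no longer compiles. atomic_try_cmpxchg(&v, &old, new) from the shared header works against the struct type, and it is what the fake mapping_map_writable() below relies on, via atomic_inc_unless_negative().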
Link: https://lkml.kernel.org/r/20250828-b4-vma-no-atomic-h-v2-4-02d146a58ed2@google.com Signed-off-by: Brendan Jackman Reviewed-by: Lorenzo Stoakes Reviewed-by: Pedro Falcato Cc: Jann Horn Cc: Liam Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/shared/linux/maple_tree.h | 6 ++---- tools/testing/vma/linux/atomic.h | 17 ----------------- tools/testing/vma/vma_internal.h | 12 +++--------- 3 files changed, 5 insertions(+), 30 deletions(-) delete mode 100644 tools/testing/vma/linux/atomic.h diff --git a/tools/testing/shared/linux/maple_tree.h b/tools/testing/shared/linux/maple_tree.h index f67d47d32857..7d0fadef0f11 100644 --- a/tools/testing/shared/linux/maple_tree.h +++ b/tools/testing/shared/linux/maple_tree.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0+ */ -#define atomic_t int32_t -#define atomic_inc(x) uatomic_inc(x) -#define atomic_read(x) uatomic_read(x) -#define atomic_set(x, y) uatomic_set(x, y) +#include + #define U8_MAX UCHAR_MAX #include "../../../../include/linux/maple_tree.h" diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h deleted file mode 100644 index 788c597c4fde..000000000000 --- a/tools/testing/vma/linux/atomic.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ - -#ifndef _LINUX_ATOMIC_H -#define _LINUX_ATOMIC_H - -#define atomic_t int32_t -#define atomic_inc(x) uatomic_inc(x) -#define atomic_read(x) uatomic_read(x) -#define atomic_set(x, y) uatomic_set(x, y) -#define U8_MAX UCHAR_MAX - -#ifndef atomic_cmpxchg_relaxed -#define atomic_cmpxchg_relaxed uatomic_cmpxchg -#define atomic_cmpxchg_release uatomic_cmpxchg -#endif /* atomic_cmpxchg_relaxed */ - -#endif /* _LINUX_ATOMIC_H */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index f13354bf0a1e..437d2a1013be 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -21,6 +21,7 @@ #include +#include #include #include #include @@ -1398,15 +1399,8 @@ static inline bool map_deny_write_exec(unsigned long old, unsigned long new) static inline int mapping_map_writable(struct address_space *mapping) { - int c = atomic_read(&mapping->i_mmap_writable); - - /* Derived from the raw_atomic_inc_unless_negative() implementation. */ - do { - if (c < 0) - return -EPERM; - } while (!__sync_bool_compare_and_swap(&mapping->i_mmap_writable, c, c+1)); - - return 0; + return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? + 0 : -EPERM; } static inline unsigned long move_page_tables(struct pagetable_move_control *pmc) From 204dfefe039263d7b81857c7f320834b5c7dc94e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 28 Aug 2025 09:16:18 +0000 Subject: [PATCH 172/372] mm/page_alloc: find_large_buddy() from start_pfn aligned order We iterate the pfn, aligned from order 0 up to MAX_PAGE_ORDER, to find a large buddy. But as long as the order is less than the alignment order of start_pfn, aligning down yields the same pfn, so the same check is repeated. Iterate from start_pfn's alignment order to reduce the duplicated work.
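As a concrete illustration of the saved work (numbers made up): for start_pfn == 96 (0b1100000), __ffs(start_pfn) == 5 and ALIGN_DOWN(96, 1 << order) == 96 for every order <= 5, so the old loop tested the very same pfn at orders 0 through 5 before the alignment could produce a different candidate. Starting at order __ffs(start_pfn) performs that check just once; for start_pfn == 0 every alignment yields pfn 0, which is why the starting order is simply set to MAX_PAGE_ORDER in that case.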
[richard.weiyang@gmail.com: add comment on assignment of order] Link: https://lkml.kernel.org/r/20250828091618.7869-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20250902025807.11467-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Zi Yan Acked-by: Johannes Weiner Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2df6ee6998ab..0873d640f26c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2033,7 +2033,13 @@ static int move_freepages_block(struct zone *zone, struct page *page, /* Look for a buddy that straddles start_pfn */ static unsigned long find_large_buddy(unsigned long start_pfn) { - int order = 0; + /* + * If start_pfn is not an order-0 PageBuddy, next PageBuddy containing + * start_pfn has minimal order of __ffs(start_pfn) + 1. Start checking + * the order with __ffs(start_pfn). If start_pfn is order-0 PageBuddy, + * the starting order does not matter. + */ + int order = start_pfn ? __ffs(start_pfn) : MAX_PAGE_ORDER; struct page *page; unsigned long pfn = start_pfn; From 39b44c8c73312ac535ffdf7c8ecd37ea07d4ef86 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 28 Aug 2025 10:48:20 +0200 Subject: [PATCH 173/372] huge_mm.h: disallow is_huge_zero_folio(NULL) Calling is_huge_zero_folio(NULL) should not be legal - it makes no sense, and a different (theoretical) implementation may dereference the pointer. Currently, however, lacking any explicit documentation, this call is possible. And if somebody really passes NULL, the function should not return true - this isn't the huge zero folio, after all! However, if the `huge_zero_folio` hasn't been allocated yet, it's NULL, and is_huge_zero_folio(NULL) just happens to return true, which is a lie. This weird side effect prevented me from reproducing a kernel crash that occurred when the elements of a folio_batch were NULL - since folios_put_refs() skips huge zero folios, this sometimes causes a crash, but sometimes does not. For debugging, it is better to reveal such bugs reliably and not hide them behind random preconditions like "has the huge zero folio already been created?" To improve detection of such bugs, David Hildenbrand suggested adding a VM_WARN_ON_ONCE(). Link: https://lkml.kernel.org/r/20250828084820.570118-1-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Reviewed-by: Zi Yan Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: David Hildenbrand Cc: Dev Jain Cc: Kairui Song Cc: Kemeng Shi Cc: Liam Howlett Cc: Mariano Pache Cc: Nhat Pham Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1ac0d06fb3c1..29ef70022da1 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -501,6 +501,8 @@ extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_folio(const struct folio *folio) { + VM_WARN_ON_ONCE(!folio); + return READ_ONCE(huge_zero_folio) == folio; } From 82b5fe3059a52e1419521ac32703208d03bb15e9 Mon Sep 17 00:00:00 2001 From: "Liam R.
Howlett" Date: Wed, 27 Aug 2025 20:30:22 -0400 Subject: [PATCH 174/372] maple_tree: fix testing for 32 bit builds Patch series "maple_tree: Fix testing for 32bit compiles". The maple tree test suite supports 32bit builds, which result in 32bit nodes and 32bit index/last values. Some tests use values that are too large and must be skipped, while others depend on certain actions altering the tree in another measurable way (such as the height decreasing or increasing). Two tests were added that broke 32bit testing, either through compile warnings or failures. These fixes restore the tests to working order. Building a 32bit version can be done on a 32bit platform, or by using a command like: BUILD=32 make clean maple This patch (of 2): Some tests are invalid on 32bit due to the size of the index and last values. Making those tests depend on the correct build flags silences the compile complaints. Link: https://lkml.kernel.org/r/20250828003023.418966-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20250828003023.418966-2-Liam.Howlett@oracle.com Fixes: 5d659bbb52a2 ("maple_tree: introduce mas_wr_store_type()") Signed-off-by: Liam R. Howlett Reviewed-by: Sidhartha Kumar Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 172700fb7784..90a0db45a33c 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36428,6 +36428,7 @@ static void check_nomem_writer_race(struct maple_tree *mt) */ static inline int check_vma_modification(struct maple_tree *mt) { +#if defined(CONFIG_64BIT) MA_STATE(mas, mt, 0, 0); mtree_lock(mt); @@ -36451,6 +36452,8 @@ static inline int check_vma_modification(struct maple_tree *mt) mas_destroy(&mas); mtree_unlock(mt); +#endif + return 0; } From 103e90626d3a4032d22e8b09ff14600e71cda59c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 27 Aug 2025 20:30:23 -0400 Subject: [PATCH 175/372] maple_tree: testing fix for spanning store on 32b 32 bit nodes have a larger branching factor, which changes the value required to cause a height change. Update the spanning store height test to work for both 64 and 32 bit nodes. Link: https://lkml.kernel.org/r/20250828003023.418966-3-Liam.Howlett@oracle.com Fixes: f9d3a963fef4 ("maple_tree: use height and depth consistently") Signed-off-by: Liam R. Howlett Reviewed-by: Sidhartha Kumar Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 90a0db45a33c..05714c22994e 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36327,13 +36327,18 @@ extern void test_kmem_cache_bulk(void); static inline void check_spanning_store_height(struct maple_tree *mt) { int index = 0; + int last = 140; MA_STATE(mas, mt, 0, 0); mas_lock(&mas); while (mt_height(mt) != 3) { mas_store_gfp(&mas, xa_mk_value(index), GFP_KERNEL); mas_set(&mas, ++index); } - mas_set_range(&mas, 90, 140); + + if (MAPLE_32BIT) + last = 155; /* 32 bit higher branching factor.
*/ + + mas_set_range(&mas, 90, last); mas_store_gfp(&mas, xa_mk_value(index), GFP_KERNEL); MT_BUG_ON(mt, mas_mt_height(&mas) != 2); mas_unlock(&mas); From 1e338f4d99e6814ede16bad1db1cc463aad8032c Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Sun, 10 Aug 2025 17:57:45 +0500 Subject: [PATCH 176/372] kasan: introduce ARCH_DEFER_KASAN and unify static key across modes Patch series "kasan: unify kasan_enabled() and remove arch-specific implementations", v6. This patch series addresses the fragmentation in KASAN initialization across architectures by introducing a unified approach that eliminates duplicate static keys and arch-specific kasan_arch_is_ready() implementations. The core issue is that different architectures have inconsistent approaches to KASAN readiness tracking: - PowerPC, LoongArch, and UML each implement their own kasan_arch_is_ready() - Only HW_TAGS mode had a unified static key (kasan_flag_enabled) - Generic and SW_TAGS modes relied on arch-specific solutions or always-on behavior This patch (of 2): Introduce CONFIG_ARCH_DEFER_KASAN to identify architectures [1] that need to defer KASAN initialization until shadow memory is properly set up, and unify the static key infrastructure across all KASAN modes. [1] PowerPC, UML and LoongArch select ARCH_NEEDS_DEFER_KASAN, which turns on ARCH_DEFER_KASAN whenever KASAN is enabled. Replace kasan_arch_is_ready() with the existing kasan_enabled() check, which tests the static key when the arch selects ARCH_DEFER_KASAN or has HW_TAGS mode support. For all other architectures, kasan_enabled() is a compile-time constant. Now KASAN users can use a single kasan_enabled() check everywhere.
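To make the resulting shape concrete, here is a condensed sketch of what a runtime hook and an arch init path look like after the series (illustrative only; the real definitions are in the diff below and in mm/kasan/):

	/*
	 * kasan_enabled() is a static-branch test on deferred architectures
	 * and a compile-time IS_ENABLED(CONFIG_KASAN) everywhere else, so
	 * this check costs a patched NOP or nothing at all.
	 */
	void kasan_poison(const void *addr, size_t size, u8 value, bool init)
	{
		if (!kasan_enabled())	/* replaces the old kasan_arch_is_ready() */
			return;

		/* ... mark the shadow for [addr, addr + size) with 'value' ... */
	}

	/* Arch side, once shadow memory is mapped: */
	void __init kasan_init(void)
	{
		/* ... set up shadow mappings ... */
		kasan_init_generic();	/* flips kasan_flag_enabled, prints banner */
	}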
Link: https://lkml.kernel.org/r/20250810125746.1105476-1-snovitoll@gmail.com Link: https://lkml.kernel.org/r/20250810125746.1105476-2-snovitoll@gmail.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217049 Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Christophe Leroy Reviewed-by: Ritesh Harjani (IBM) #powerpc Cc: Alexander Gordeev Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Baoquan He Cc: David Gow Cc: Dmitriy Vyukov Cc: Heiko Carstens Cc: Huacai Chen Cc: Marco Elver Cc: Qing Zhang Cc: Sabyrzhan Tasbolatov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 1 + arch/loongarch/include/asm/kasan.h | 7 ------ arch/loongarch/mm/kasan_init.c | 8 +++---- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/kasan.h | 12 ---------- arch/powerpc/mm/kasan/init_32.c | 2 +- arch/powerpc/mm/kasan/init_book3e_64.c | 2 +- arch/powerpc/mm/kasan/init_book3s_64.c | 6 +---- arch/um/Kconfig | 1 + arch/um/include/asm/kasan.h | 5 ++-- arch/um/kernel/mem.c | 13 ++++++++--- include/linux/kasan-enabled.h | 32 ++++++++++++++++++-------- include/linux/kasan.h | 6 +++++ lib/Kconfig.kasan | 12 ++++++++++ mm/kasan/common.c | 17 ++++++++++---- mm/kasan/generic.c | 19 +++++++++++---- mm/kasan/hw_tags.c | 9 +------- mm/kasan/kasan.h | 8 ++++++- mm/kasan/shadow.c | 12 +++++----- mm/kasan/sw_tags.c | 1 + mm/kasan/tags.c | 2 +- 21 files changed, 106 insertions(+), 70 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index f0abc38c40ac..e449e3fcecf9 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -9,6 +9,7 @@ config LOONGARCH select ACPI_PPTT if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_BINFMT_ELF_STATE + select ARCH_NEEDS_DEFER_KASAN select ARCH_DISABLE_KASAN_INLINE select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE diff --git a/arch/loongarch/include/asm/kasan.h b/arch/loongarch/include/asm/kasan.h index 62f139a9c87d..0e50e5b5e056 100644 --- a/arch/loongarch/include/asm/kasan.h +++ b/arch/loongarch/include/asm/kasan.h @@ -66,7 +66,6 @@ #define XKPRANGE_WC_SHADOW_OFFSET (KASAN_SHADOW_START + XKPRANGE_WC_KASAN_OFFSET) #define XKVRANGE_VC_SHADOW_OFFSET (KASAN_SHADOW_START + XKVRANGE_VC_KASAN_OFFSET) -extern bool kasan_early_stage; extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; #define kasan_mem_to_shadow kasan_mem_to_shadow @@ -75,12 +74,6 @@ void *kasan_mem_to_shadow(const void *addr); #define kasan_shadow_to_mem kasan_shadow_to_mem const void *kasan_shadow_to_mem(const void *shadow_addr); -#define kasan_arch_is_ready kasan_arch_is_ready -static __always_inline bool kasan_arch_is_ready(void) -{ - return !kasan_early_stage; -} - #define addr_has_metadata addr_has_metadata static __always_inline bool addr_has_metadata(const void *addr) { diff --git a/arch/loongarch/mm/kasan_init.c b/arch/loongarch/mm/kasan_init.c index d2681272d8f0..170da98ad4f5 100644 --- a/arch/loongarch/mm/kasan_init.c +++ b/arch/loongarch/mm/kasan_init.c @@ -40,11 +40,9 @@ static pgd_t kasan_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE); #define __pte_none(early, pte) (early ? 
pte_none(pte) : \ ((pte_val(pte) & _PFN_MASK) == (unsigned long)__pa(kasan_early_shadow_page))) -bool kasan_early_stage = true; - void *kasan_mem_to_shadow(const void *addr) { - if (!kasan_arch_is_ready()) { + if (!kasan_enabled()) { return (void *)(kasan_early_shadow_page); } else { unsigned long maddr = (unsigned long)addr; @@ -298,7 +296,8 @@ void __init kasan_init(void) kasan_populate_early_shadow(kasan_mem_to_shadow((void *)VMALLOC_START), kasan_mem_to_shadow((void *)KFENCE_AREA_END)); - kasan_early_stage = false; + /* Enable KASAN here before kasan_mem_to_shadow(). */ + kasan_init_generic(); /* Populate the linear mapping */ for_each_mem_range(i, &pa_start, &pa_end) { @@ -329,5 +328,4 @@ void __init kasan_init(void) /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; - pr_info("KernelAddressSanitizer initialized.\n"); } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 93402a1d9c9f..4730c676b6bf 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -122,6 +122,7 @@ config PPC # Please keep this list sorted alphabetically. # select ARCH_32BIT_OFF_T if PPC32 + select ARCH_NEEDS_DEFER_KASAN if PPC_RADIX_MMU select ARCH_DISABLE_KASAN_INLINE if PPC_RADIX_MMU select ARCH_DMA_DEFAULT_COHERENT if !NOT_COHERENT_CACHE select ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index b5bbb94c51f6..957a57c1db58 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -53,18 +53,6 @@ #endif #ifdef CONFIG_KASAN -#ifdef CONFIG_PPC_BOOK3S_64 -DECLARE_STATIC_KEY_FALSE(powerpc_kasan_enabled_key); - -static __always_inline bool kasan_arch_is_ready(void) -{ - if (static_branch_likely(&powerpc_kasan_enabled_key)) - return true; - return false; -} - -#define kasan_arch_is_ready kasan_arch_is_ready -#endif void kasan_early_init(void); void kasan_mmu_init(void); diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c index 03666d790a53..1d083597464f 100644 --- a/arch/powerpc/mm/kasan/init_32.c +++ b/arch/powerpc/mm/kasan/init_32.c @@ -165,7 +165,7 @@ void __init kasan_init(void) /* At this point kasan is fully initialized. 
Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_late_init(void) diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c index 60c78aac0f63..0d3a73d6d4b0 100644 --- a/arch/powerpc/mm/kasan/init_book3e_64.c +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -127,7 +127,7 @@ void __init kasan_init(void) /* Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_late_init(void) { } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c index 7d959544c077..dcafa641804c 100644 --- a/arch/powerpc/mm/kasan/init_book3s_64.c +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -19,8 +19,6 @@ #include #include -DEFINE_STATIC_KEY_FALSE(powerpc_kasan_enabled_key); - static void __init kasan_init_phys_region(void *start, void *end) { unsigned long k_start, k_end, k_cur; @@ -92,11 +90,9 @@ void __init kasan_init(void) */ memset(kasan_early_shadow_page, 0, PAGE_SIZE); - static_branch_inc(&powerpc_kasan_enabled_key); - /* Enable error messages */ init_task.kasan_depth = 0; - pr_info("KASAN init done\n"); + kasan_init_generic(); } void __init kasan_early_init(void) { } diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 9083bfdb7735..1d4def0db841 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -5,6 +5,7 @@ menu "UML-specific options" config UML bool default y + select ARCH_NEEDS_DEFER_KASAN if STATIC_LINK select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_CPU_FINALIZE_INIT diff --git a/arch/um/include/asm/kasan.h b/arch/um/include/asm/kasan.h index f97bb1f7b851..b54a4e937fd1 100644 --- a/arch/um/include/asm/kasan.h +++ b/arch/um/include/asm/kasan.h @@ -24,10 +24,9 @@ #ifdef CONFIG_KASAN void kasan_init(void); -extern int kasan_um_is_ready; -#ifdef CONFIG_STATIC_LINK -#define kasan_arch_is_ready() (kasan_um_is_ready) +#if defined(CONFIG_STATIC_LINK) && defined(CONFIG_KASAN_INLINE) +#error UML does not work in KASAN_INLINE mode with STATIC_LINK enabled! #endif #else static inline void kasan_init(void) { } diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 76bec7de81b5..32e3b1972dc1 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -21,10 +21,10 @@ #include #include #include +#include #ifdef CONFIG_KASAN -int kasan_um_is_ready; -void kasan_init(void) +void __init kasan_init(void) { /* * kasan_map_memory will map all of the required address space and @@ -32,7 +32,11 @@ void kasan_init(void) */ kasan_map_memory((void *)KASAN_SHADOW_START, KASAN_SHADOW_SIZE); init_task.kasan_depth = 0; - kasan_um_is_ready = true; + /* + * Since kasan_init() is called before main(), + * KASAN is initialized but the enablement is deferred after + * jump_label_init(). See arch_mm_preinit(). + */ } static void (*kasan_init_ptr)(void) @@ -58,6 +62,9 @@ static unsigned long brk_end; void __init arch_mm_preinit(void) { + /* Safe to call after jump_label_init(). Enables KASAN. */ + kasan_init_generic(); + /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); diff --git a/include/linux/kasan-enabled.h b/include/linux/kasan-enabled.h index 6f612d69ea0c..9eca967d8526 100644 --- a/include/linux/kasan-enabled.h +++ b/include/linux/kasan-enabled.h @@ -4,32 +4,46 @@ #include -#ifdef CONFIG_KASAN_HW_TAGS - +#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS) +/* + * Global runtime flag for KASAN modes that need runtime control. 
+ * Used by ARCH_DEFER_KASAN architectures and HW_TAGS mode. + */ DECLARE_STATIC_KEY_FALSE(kasan_flag_enabled); +/* + * Runtime control for shadow memory initialization or HW_TAGS mode. + * Uses static key for architectures that need deferred KASAN or HW_TAGS. + */ static __always_inline bool kasan_enabled(void) { return static_branch_likely(&kasan_flag_enabled); } -static inline bool kasan_hw_tags_enabled(void) +static inline void kasan_enable(void) { - return kasan_enabled(); + static_branch_enable(&kasan_flag_enabled); } - -#else /* CONFIG_KASAN_HW_TAGS */ - -static inline bool kasan_enabled(void) +#else +/* For architectures that can enable KASAN early, use compile-time check. */ +static __always_inline bool kasan_enabled(void) { return IS_ENABLED(CONFIG_KASAN); } +static inline void kasan_enable(void) {} +#endif /* CONFIG_ARCH_DEFER_KASAN || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS +static inline bool kasan_hw_tags_enabled(void) +{ + return kasan_enabled(); +} +#else static inline bool kasan_hw_tags_enabled(void) { return false; } - #endif /* CONFIG_KASAN_HW_TAGS */ #endif /* LINUX_KASAN_ENABLED_H */ diff --git a/include/linux/kasan.h b/include/linux/kasan.h index fe5ce9215821..b509a8d36949 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -543,6 +543,12 @@ void kasan_report_async(void); #endif /* CONFIG_KASAN_HW_TAGS */ +#ifdef CONFIG_KASAN_GENERIC +void __init kasan_init_generic(void); +#else +static inline void kasan_init_generic(void) { } +#endif + #ifdef CONFIG_KASAN_SW_TAGS void __init kasan_init_sw_tags(void); #else diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index f82889a830fa..a4bb610a7a6f 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -19,6 +19,18 @@ config ARCH_DISABLE_KASAN_INLINE Disables both inline and stack instrumentation. Selected by architectures that do not support these instrumentation types. +config ARCH_NEEDS_DEFER_KASAN + bool + +config ARCH_DEFER_KASAN + def_bool y + depends on KASAN && ARCH_NEEDS_DEFER_KASAN + help + Architectures should select this if they need to defer KASAN + initialization until shadow memory is properly set up. This + enables runtime control via static keys. Otherwise, KASAN uses + compile-time constants for better performance. + config CC_HAS_KASAN_GENERIC def_bool $(cc-option, -fsanitize=kernel-address) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 9142964ab9c9..e3765931a31f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -32,6 +32,15 @@ #include "kasan.h" #include "../slab.h" +#if defined(CONFIG_ARCH_DEFER_KASAN) || defined(CONFIG_KASAN_HW_TAGS) +/* + * Definition of the unified static key declared in kasan-enabled.h. + * This provides consistent runtime enable/disable across KASAN modes. 
+ */ +DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); +EXPORT_SYMBOL_GPL(kasan_flag_enabled); +#endif + struct slab *kasan_addr_to_slab(const void *addr) { if (virt_addr_valid(addr)) @@ -246,7 +255,7 @@ static inline void poison_slab_object(struct kmem_cache *cache, void *object, bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, unsigned long ip) { - if (!kasan_arch_is_ready() || is_kfence_address(object)) + if (is_kfence_address(object)) return false; return check_slab_allocation(cache, object, ip); } @@ -254,7 +263,7 @@ bool __kasan_slab_pre_free(struct kmem_cache *cache, void *object, bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, bool still_accessible) { - if (!kasan_arch_is_ready() || is_kfence_address(object)) + if (is_kfence_address(object)) return false; /* @@ -293,7 +302,7 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, bool init, static inline bool check_page_allocation(void *ptr, unsigned long ip) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return false; if (ptr != page_address(virt_to_head_page(ptr))) { @@ -522,7 +531,7 @@ bool __kasan_mempool_poison_object(void *ptr, unsigned long ip) return true; } - if (is_kfence_address(ptr) || !kasan_arch_is_ready()) + if (is_kfence_address(ptr)) return true; slab = folio_slab(folio); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d54e89f8c3e7..b413c46b3e04 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -36,6 +36,17 @@ #include "kasan.h" #include "../slab.h" +/* + * Initialize Generic KASAN and enable runtime checks. + * This should be called from arch kasan_init() once shadow memory is ready. + */ +void __init kasan_init_generic(void) +{ + kasan_enable(); + + pr_info("KernelAddressSanitizer initialized (generic)\n"); +} + /* * All functions below always inlined so compiler could * perform better optimizations in each of __asan_loadX/__assn_storeX @@ -165,7 +176,7 @@ static __always_inline bool check_region_inline(const void *addr, size_t size, bool write, unsigned long ret_ip) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return true; if (unlikely(size == 0)) @@ -193,7 +204,7 @@ bool kasan_byte_accessible(const void *addr) { s8 shadow_byte; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return true; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); @@ -495,7 +506,7 @@ static void release_alloc_meta(struct kasan_alloc_meta *meta) static void release_free_meta(const void *object, struct kasan_free_meta *meta) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; /* Check if free meta is valid. */ @@ -562,7 +573,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_save_track(&alloc_meta->alloc_track, flags); } -void kasan_save_free_info(struct kmem_cache *cache, void *object) +void __kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9a6927394b54..c8289a3feabf 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -45,13 +45,6 @@ static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; -/* - * Whether KASAN is enabled at all. - * The value remains false until KASAN is initialized by kasan_init_hw_tags(). 
- */ -DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); -EXPORT_SYMBOL(kasan_flag_enabled); - /* * Whether the selected mode is synchronous, asynchronous, or asymmetric. * Defaults to KASAN_MODE_SYNC. @@ -260,7 +253,7 @@ void __init kasan_init_hw_tags(void) kasan_init_tags(); /* KASAN is now initialized, enable it. */ - static_branch_enable(&kasan_flag_enabled); + kasan_enable(); pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", kasan_mode_info(), diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 129178be5e64..8a9d8a6ea717 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -398,7 +398,13 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, depot_flags_t depot_flags); void kasan_set_track(struct kasan_track *track, depot_stack_handle_t stack); void kasan_save_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -void kasan_save_free_info(struct kmem_cache *cache, void *object); + +void __kasan_save_free_info(struct kmem_cache *cache, void *object); +static inline void kasan_save_free_info(struct kmem_cache *cache, void *object) +{ + if (kasan_enabled()) + __kasan_save_free_info(cache, object); +} #ifdef CONFIG_KASAN_GENERIC bool kasan_quarantine_put(struct kmem_cache *cache, void *object); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 11d472a5c4e8..5d2a876035d6 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -125,7 +125,7 @@ void kasan_poison(const void *addr, size_t size, u8 value, bool init) { void *shadow_start, *shadow_end; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; /* @@ -150,7 +150,7 @@ EXPORT_SYMBOL_GPL(kasan_poison); #ifdef CONFIG_KASAN_GENERIC void kasan_poison_last_granule(const void *addr, size_t size) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; if (size & KASAN_GRANULE_MASK) { @@ -408,7 +408,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size, gfp_t gfp_mas unsigned long shadow_start, shadow_end; int ret; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return 0; if (!is_vmalloc_or_module_addr((void *)addr)) @@ -583,7 +583,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long region_start, region_end; unsigned long size; - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); @@ -634,7 +634,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, * with setting memory tags, so the KASAN_VMALLOC_INIT flag is ignored. 
*/ - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return (void *)start; if (!is_vmalloc_or_module_addr(start)) @@ -659,7 +659,7 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size, */ void __kasan_poison_vmalloc(const void *start, unsigned long size) { - if (!kasan_arch_is_ready()) + if (!kasan_enabled()) return; if (!is_vmalloc_or_module_addr(start)) diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index b9382b5b6a37..c75741a74602 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -44,6 +44,7 @@ void __init kasan_init_sw_tags(void) per_cpu(prng_state, cpu) = (u32)get_cycles(); kasan_init_tags(); + kasan_enable(); pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", str_on_off(kasan_stack_collection_enabled())); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index d65d48b85f90..b9f31293622b 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -142,7 +142,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) save_stack_info(cache, object, flags, false); } -void kasan_save_free_info(struct kmem_cache *cache, void *object) +void __kasan_save_free_info(struct kmem_cache *cache, void *object) { save_stack_info(cache, object, 0, true); } From e45085f2673b165687a3874d8e868437683fa8e4 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Sun, 10 Aug 2025 17:57:46 +0500 Subject: [PATCH 177/372] kasan: call kasan_init_generic in kasan_init Call kasan_init_generic() which handles Generic KASAN initialization. For architectures that do not select ARCH_DEFER_KASAN, this will be a no-op for the runtime flag but will print the initialization banner. For SW_TAGS and HW_TAGS modes, their respective init functions will handle the flag enabling, if they are enabled/implemented. Link: https://lkml.kernel.org/r/20250810125746.1105476-3-snovitoll@gmail.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217049 Signed-off-by: Sabyrzhan Tasbolatov Tested-by: Alexandre Ghiti [riscv] Acked-by: Alexander Gordeev [s390] Reviewed-by: Christophe Leroy Cc: Alexander Potapenko Cc: Alexandre Ghiti Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Baoquan He Cc: David Gow Cc: Dmitriy Vyukov Cc: Heiko Carstens Cc: Huacai Chen Cc: Marco Elver Cc: Qing Zhang Cc: Vincenzo Frascino Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- arch/arm/mm/kasan_init.c | 2 +- arch/arm64/mm/kasan_init.c | 4 +--- arch/riscv/mm/kasan_init.c | 1 + arch/s390/kernel/early.c | 3 ++- arch/x86/mm/kasan_init_64.c | 2 +- arch/xtensa/mm/kasan_init.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/arm/mm/kasan_init.c b/arch/arm/mm/kasan_init.c index 111d4f703136..c6625e808bf8 100644 --- a/arch/arm/mm/kasan_init.c +++ b/arch/arm/mm/kasan_init.c @@ -300,6 +300,6 @@ void __init kasan_init(void) local_flush_tlb_all(); memset(kasan_early_shadow_page, 0, PAGE_SIZE); - pr_info("Kernel address sanitizer initialized\n"); init_task.kasan_depth = 0; + kasan_init_generic(); } diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index d541ce45daeb..abeb81bf6ebd 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -399,14 +399,12 @@ void __init kasan_init(void) { kasan_init_shadow(); kasan_init_depth(); -#if defined(CONFIG_KASAN_GENERIC) + kasan_init_generic(); /* * Generic KASAN is now fully initialized. * Software and Hardware Tag-Based modes still require * kasan_init_sw_tags() and kasan_init_hw_tags() correspondingly. 
*/ - pr_info("KernelAddressSanitizer initialized (generic)\n"); -#endif } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 41c635d6aca4..c4a2a9e5586e 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -533,4 +533,5 @@ void __init kasan_init(void) csr_write(CSR_SATP, PFN_DOWN(__pa(swapper_pg_dir)) | satp_mode); local_flush_tlb_all(); + kasan_init_generic(); } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 9adfbdd377dc..544e5403dd91 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,7 @@ static void __init kasan_early_init(void) { #ifdef CONFIG_KASAN init_task.kasan_depth = 0; - pr_info("KernelAddressSanitizer initialized\n"); + kasan_init_generic(); #endif } diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0539efd0d216..998b6010d6d3 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -451,5 +451,5 @@ void __init kasan_init(void) __flush_tlb_all(); init_task.kasan_depth = 0; - pr_info("KernelAddressSanitizer initialized\n"); + kasan_init_generic(); } diff --git a/arch/xtensa/mm/kasan_init.c b/arch/xtensa/mm/kasan_init.c index f39c4d83173a..0524b9ed5e63 100644 --- a/arch/xtensa/mm/kasan_init.c +++ b/arch/xtensa/mm/kasan_init.c @@ -94,5 +94,5 @@ void __init kasan_init(void) /* At this point kasan is fully initialized. Enable error messages. */ current->kasan_depth = 0; - pr_info("KernelAddressSanitizer initialized\n"); + kasan_init_generic(); } From 5c3f8be0c6b1a7b145a5db012b427c25516564c4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 29 Aug 2025 17:15:26 +0100 Subject: [PATCH 178/372] mm: zswap: interact directly with zsmalloc Patch series "mm: remove zpool". zpool is an indirection layer for zswap to switch between multiple allocator backends at runtime. Since 6.15, zsmalloc is the only allocator left in-tree, so there is no point in keeping zpool around. This patch (of 3): zswap goes through the zpool layer to enable runtime-switching of allocator backends for compressed data. However, since zbud and z3fold were removed in 6.15, zsmalloc has been the only option available. As such, the zpool indirection is unnecessary. Make zswap deal with zsmalloc directly. This is comparable to zram, which also directly interacts with zsmalloc and has never supported a different backend. Note that this does not preclude future improvements and experiments with different allocation strategies. Should it become necessary, it's possible to provide an alternate implementation for the zsmalloc API, selectable at compile time. However, zsmalloc is also rather mature and feature rich, with years of widespread production exposure; it's encouraged to make incremental improvements rather than fork it. In any case, the complexity of runtime pluggability seems excessive and unjustified at this time. Switch zswap to zsmalloc to remove the last user of the zpool API. 
[hannes@cmpxchg.org: fix default compressor test] Link: https://lkml.kernel.org/r/20250915153640.GA828739@cmpxchg.org Link: https://lkml.kernel.org/r/20250829162212.208258-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20250829162212.208258-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Nacked-by: Vitaly Wool Acked-by: Nhat Pham Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/zswap.c | 198 ++++++++++++++--------------------------------- 1 file changed, 52 insertions(+), 146 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index e5e1f5687f5e..63045e3fb1f5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -25,7 +25,6 @@ #include #include #include -#include <linux/zpool.h> #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include <linux/zsmalloc.h> #include "swap.h" #include "internal.h" @@ -107,16 +107,6 @@ static const struct kernel_param_ops zswap_compressor_param_ops = { module_param_cb(compressor, &zswap_compressor_param_ops, &zswap_compressor, 0644); -/* Compressed storage zpool to use */ -static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; -static int zswap_zpool_param_set(const char *, const struct kernel_param *); -static const struct kernel_param_ops zswap_zpool_param_ops = { - .set = zswap_zpool_param_set, - .get = param_get_charp, - .free = param_free_charp, -}; -module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); - /* The maximum percentage of memory that the compressed pool can occupy */ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); @@ -161,7 +151,7 @@ struct crypto_acomp_ctx { * needs to be verified that it's still valid in the tree. */ struct zswap_pool { - struct zpool *zpool; + struct zs_pool *zs_pool; struct crypto_acomp_ctx __percpu *acomp_ctx; struct percpu_ref ref; struct list_head list; @@ -193,7 +183,7 @@ static struct shrinker *zswap_shrinker; * logic if referenced is unset. See comments in the shrinker * section for context. * pool - the zswap_pool the entry's data is in - * handle - zpool allocation handle that stores the compressed page data + * handle - zsmalloc allocation handle that stores the compressed page data * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages.
*/ @@ -214,7 +204,7 @@ static unsigned int nr_zswap_trees[MAX_SWAPFILES]; static LIST_HEAD(zswap_pools); /* protects zswap_pools list modification */ static DEFINE_SPINLOCK(zswap_pools_lock); -/* pool counter to provide unique names to zpool */ +/* pool counter to provide unique names to zsmalloc */ static atomic_t zswap_pools_count = ATOMIC_INIT(0); enum zswap_init_type { @@ -241,32 +231,22 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp) >> SWAP_ADDRESS_SPACE_SHIFT]; } -#define zswap_pool_debug(msg, p) \ - pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ - zpool_get_type((p)->zpool)) +#define zswap_pool_debug(msg, p) \ + pr_debug("%s pool %s\n", msg, (p)->tfm_name) /********************************* * pool functions **********************************/ static void __zswap_pool_empty(struct percpu_ref *ref); -static struct zswap_pool *zswap_pool_create(char *type, char *compressor) +static struct zswap_pool *zswap_pool_create(char *compressor) { struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ - gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; int ret, cpu; - if (!zswap_has_pool) { - /* if either are unset, pool initialization failed, and we - * need both params to be set correctly before trying to - * create a pool. - */ - if (!strcmp(type, ZSWAP_PARAM_UNSET)) - return NULL; - if (!strcmp(compressor, ZSWAP_PARAM_UNSET)) - return NULL; - } + if (!zswap_has_pool && !strcmp(compressor, ZSWAP_PARAM_UNSET)) + return NULL; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) @@ -274,12 +254,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) /* unique name for each pool specifically required by zsmalloc */ snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); - pool->zpool = zpool_create_pool(type, name, gfp); - if (!pool->zpool) { - pr_err("%s zpool not available\n", type); + pool->zs_pool = zs_create_pool(name); + if (!pool->zs_pool) goto error; - } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); @@ -315,52 +292,29 @@ ref_fail: error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); - if (pool->zpool) - zpool_destroy_pool(pool->zpool); + if (pool->zs_pool) + zs_destroy_pool(pool->zs_pool); kfree(pool); return NULL; } static struct zswap_pool *__zswap_pool_create_fallback(void) { - bool has_comp, has_zpool; - - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); - if (!has_comp && strcmp(zswap_compressor, - CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { + if (!crypto_has_acomp(zswap_compressor, 0, 0) && + strcmp(zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; - has_comp = crypto_has_acomp(zswap_compressor, 0, 0); } - if (!has_comp) { - pr_err("default compressor %s not available\n", - zswap_compressor); - param_free_charp(&zswap_compressor); + + /* Default compressor should be available. Kconfig bug? 
*/ + if (WARN_ON_ONCE(!crypto_has_acomp(zswap_compressor, 0, 0))) { zswap_compressor = ZSWAP_PARAM_UNSET; - } - - has_zpool = zpool_has_pool(zswap_zpool_type); - if (!has_zpool && strcmp(zswap_zpool_type, - CONFIG_ZSWAP_ZPOOL_DEFAULT)) { - pr_err("zpool %s not available, using default %s\n", - zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; - has_zpool = zpool_has_pool(zswap_zpool_type); - } - if (!has_zpool) { - pr_err("default zpool %s not available\n", - zswap_zpool_type); - param_free_charp(&zswap_zpool_type); - zswap_zpool_type = ZSWAP_PARAM_UNSET; - } - - if (!has_comp || !has_zpool) return NULL; + } - return zswap_pool_create(zswap_zpool_type, zswap_compressor); + return zswap_pool_create(zswap_compressor); } static void zswap_pool_destroy(struct zswap_pool *pool) @@ -370,7 +324,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - zpool_destroy_pool(pool->zpool); + zs_destroy_pool(pool->zs_pool); kfree(pool); } @@ -462,7 +416,7 @@ static struct zswap_pool *zswap_pool_current_get(void) } /* type and compressor must be null-terminated */ -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) +static struct zswap_pool *zswap_pool_find_get(char *compressor) { struct zswap_pool *pool; @@ -471,8 +425,6 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; - if (strcmp(zpool_get_type(pool->zpool), type)) - continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_tryget(pool)) continue; @@ -499,7 +451,7 @@ unsigned long zswap_total_pages(void) rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) - total += zpool_get_total_pages(pool->zpool); + total += zs_get_total_pages(pool->zs_pool); rcu_read_unlock(); return total; @@ -524,33 +476,22 @@ static bool zswap_check_limits(void) * param callbacks **********************************/ -static bool zswap_pool_changed(const char *s, const struct kernel_param *kp) -{ - /* no change required */ - if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool) - return false; - return true; -} - -/* val must be a null-terminated string */ -static int __zswap_param_set(const char *val, const struct kernel_param *kp, - char *type, char *compressor) +static int zswap_compressor_param_set(const char *val, const struct kernel_param *kp) { struct zswap_pool *pool, *put_pool = NULL; char *s = strstrip((char *)val); + bool create_pool = false; int ret = 0; - bool new_pool = false; mutex_lock(&zswap_init_lock); switch (zswap_init_state) { case ZSWAP_UNINIT: - /* if this is load-time (pre-init) param setting, - * don't create a pool; that's done during init. 
- */ + /* Handled in zswap_setup() */ ret = param_set_charp(s, kp); break; case ZSWAP_INIT_SUCCEED: - new_pool = zswap_pool_changed(s, kp); + if (!zswap_has_pool || strcmp(s, *(char **)kp->arg)) + create_pool = true; break; case ZSWAP_INIT_FAILED: pr_err("can't set param, initialization failed\n"); @@ -558,30 +499,17 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, } mutex_unlock(&zswap_init_lock); - /* no need to create a new pool, return directly */ - if (!new_pool) + if (!create_pool) return ret; - if (!type) { - if (!zpool_has_pool(s)) { - pr_err("zpool %s not available\n", s); - return -ENOENT; - } - type = s; - } else if (!compressor) { - if (!crypto_has_acomp(s, 0, 0)) { - pr_err("compressor %s not available\n", s); - return -ENOENT; - } - compressor = s; - } else { - WARN_ON(1); - return -EINVAL; + if (!crypto_has_acomp(s, 0, 0)) { + pr_err("compressor %s not available\n", s); + return -ENOENT; } spin_lock_bh(&zswap_pools_lock); - pool = zswap_pool_find_get(type, compressor); + pool = zswap_pool_find_get(s); if (pool) { zswap_pool_debug("using existing", pool); WARN_ON(pool == zswap_pool_current()); @@ -591,7 +519,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, spin_unlock_bh(&zswap_pools_lock); if (!pool) - pool = zswap_pool_create(type, compressor); + pool = zswap_pool_create(s); else { /* * Restore the initial ref dropped by percpu_ref_kill() @@ -616,7 +544,8 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, list_add_rcu(&pool->list, &zswap_pools); zswap_has_pool = true; } else if (pool) { - /* add the possibly pre-existing pool to the end of the pools + /* + * Add the possibly pre-existing pool to the end of the pools * list; if it's new (and empty) then it'll be removed and * destroyed by the put after we drop the lock */ @@ -626,18 +555,8 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, spin_unlock_bh(&zswap_pools_lock); - if (!zswap_has_pool && !pool) { - /* if initial pool creation failed, and this pool creation also - * failed, maybe both compressor and zpool params were bad. - * Allow changing this param, so pool creation will succeed - * when the other param is changed. We already verified this - * param is ok in the zpool_has_pool() or crypto_has_acomp() - * checks above. - */ - ret = param_set_charp(s, kp); - } - - /* drop the ref from either the old current pool, + /* + * Drop the ref from either the old current pool, * or the new pool we failed to add */ if (put_pool) @@ -646,18 +565,6 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, return ret; } -static int zswap_compressor_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, zswap_zpool_type, NULL); -} - -static int zswap_zpool_param_set(const char *val, - const struct kernel_param *kp) -{ - return __zswap_param_set(val, kp, NULL, zswap_compressor); -} - static int zswap_enabled_param_set(const char *val, const struct kernel_param *kp) { @@ -801,13 +708,13 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zpool allocation, + * Carries out the common pattern of freeing an entry's zsmalloc allocation, * freeing the entry itself, and decrementing the number of stored pages. 
*/ static void zswap_entry_free(struct zswap_entry *entry) { zswap_lru_del(&zswap_list_lru, entry); - zpool_free(entry->pool->zpool, entry->handle); + zs_free(entry->pool->zs_pool, entry->handle); zswap_pool_put(entry->pool); if (entry->objcg) { obj_cgroup_uncharge_zswap(entry->objcg, entry->length); @@ -949,7 +856,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, int comp_ret = 0, alloc_ret = 0; unsigned int dlen = PAGE_SIZE; unsigned long handle; - struct zpool *zpool; gfp_t gfp; u8 *dst; bool mapped = false; @@ -997,13 +903,14 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, mapped = true; } - zpool = pool->zpool; gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; - alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page)); - if (alloc_ret) + handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page)); + if (IS_ERR_VALUE(handle)) { + alloc_ret = PTR_ERR((void *)handle); goto unlock; + } - zpool_obj_write(zpool, handle, dst, dlen); + zs_obj_write(pool->zs_pool, handle, dst, dlen); entry->handle = handle; entry->length = dlen; @@ -1023,14 +930,14 @@ unlock: static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) { - struct zpool *zpool = entry->pool->zpool; + struct zswap_pool *pool = entry->pool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; int decomp_ret = 0, dlen = PAGE_SIZE; u8 *src, *obj; - acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); - obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer); + acomp_ctx = acomp_ctx_get_cpu_lock(pool); + obj = zs_obj_read_begin(pool->zs_pool, entry->handle, acomp_ctx->buffer); /* zswap entries of length PAGE_SIZE are not compressed. */ if (entry->length == PAGE_SIZE) { @@ -1039,7 +946,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) } /* - * zpool_obj_read_begin() might return a kmap address of highmem when + * zs_obj_read_begin() might return a kmap address of highmem when * acomp_ctx->buffer is not used. However, sg_init_one() does not * handle highmem addresses, so copy the object to acomp_ctx->buffer. */ @@ -1059,7 +966,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio) dlen = acomp_ctx->req->dlen; read_done: - zpool_obj_read_end(zpool, entry->handle, obj); + zs_obj_read_end(pool->zs_pool, entry->handle, obj); acomp_ctx_put_unlock(acomp_ctx); if (!decomp_ret && dlen == PAGE_SIZE) @@ -1576,7 +1483,7 @@ static bool zswap_store_page(struct page *page, return true; store_failed: - zpool_free(pool->zpool, entry->handle); + zs_free(pool->zs_pool, entry->handle); compress_failed: zswap_entry_cache_free(entry); return false; @@ -1906,8 +1813,7 @@ static int zswap_setup(void) pool = __zswap_pool_create_fallback(); if (pool) { - pr_info("loaded using pool %s/%s\n", pool->tfm_name, - zpool_get_type(pool->zpool)); + pr_info("loaded using pool %s\n", pool->tfm_name); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; static_branch_enable(&zswap_ever_enabled); From 2ccd9fecd9163f168761d4398564c81554f636ef Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 29 Aug 2025 17:15:27 +0100 Subject: [PATCH 179/372] mm: remove unused zpool layer With zswap using zsmalloc directly, there are no more in-tree users of this code. Remove it. With zpool gone, zsmalloc is now always a simple dependency and no longer something the user needs to configure. Hide CONFIG_ZSMALLOC from the user and have zswap and zram pull it in as needed. 
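For reference, the resulting dependency shape in Kconfig terms, condensed from the diff below (zram already selects ZSMALLOC, so only the zswap side changes here):

	config ZSWAP
		bool "Compressed cache for swap pages"
		depends on SWAP
		select CRYPTO
		select ZSMALLOC		# pulled in implicitly; no user prompt

	config ZSMALLOC
		tristate		# promptless: only reachable via select

The allocator stays tristate so a modular consumer can still build it as a module, but the standalone "N:1 compression allocator" prompt disappears from the user's view.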
Link: https://lkml.kernel.org/r/20250829162212.208258-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: SeongJae Park Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: Nhat Pham Cc: Vitaly Wool Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/zswap.rst | 33 +- Documentation/core-api/mm-api.rst | 1 - .../driver-api/crypto/iaa/iaa-crypto.rst | 2 - MAINTAINERS | 2 - arch/loongarch/configs/loongson3_defconfig | 1 - include/linux/zpool.h | 86 ----- mm/Kconfig | 51 +-- mm/Makefile | 1 - mm/zpool.c | 328 ------------------ mm/zsmalloc.c | 79 ----- tools/testing/selftests/zram/README | 1 - 11 files changed, 24 insertions(+), 561 deletions(-) delete mode 100644 include/linux/zpool.h delete mode 100644 mm/zpool.c diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index fd3370aa43fe..283d77217c6f 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -53,26 +53,17 @@ Zswap receives pages for compression from the swap subsystem and is able to evict pages from its own compressed pool on an LRU basis and write them back to the backing swap device in the case that the compressed pool is full. -Zswap makes use of zpool for the managing the compressed memory pool. Each -allocation in zpool is not directly accessible by address. Rather, a handle is +Zswap makes use of zsmalloc for the managing the compressed memory pool. Each +allocation in zsmalloc is not directly accessible by address. Rather, a handle is returned by the allocation routine and that handle must be mapped before being accessed. The compressed memory pool grows on demand and shrinks as compressed -pages are freed. The pool is not preallocated. By default, a zpool -of type selected in ``CONFIG_ZSWAP_ZPOOL_DEFAULT`` Kconfig option is created, -but it can be overridden at boot time by setting the ``zpool`` attribute, -e.g. ``zswap.zpool=zsmalloc``. It can also be changed at runtime using the sysfs -``zpool`` attribute, e.g.:: - - echo zsmalloc > /sys/module/zswap/parameters/zpool - -The zsmalloc type zpool has a complex compressed page storage method, and it -can achieve great storage densities. +pages are freed. The pool is not preallocated. When a swap page is passed from swapout to zswap, zswap maintains a mapping -of the swap entry, a combination of the swap type and swap offset, to the zpool -handle that references that compressed swap page. This mapping is achieved -with a red-black tree per swap type. The swap offset is the search key for the -tree nodes. +of the swap entry, a combination of the swap type and swap offset, to the +zsmalloc handle that references that compressed swap page. This mapping is +achieved with a red-black tree per swap type. The swap offset is the search +key for the tree nodes. During a page fault on a PTE that is a swap entry, the swapin code calls the zswap load function to decompress the page into the page allocated by the page @@ -96,11 +87,11 @@ attribute, e.g.:: echo lzo > /sys/module/zswap/parameters/compressor -When the zpool and/or compressor parameter is changed at runtime, any existing -compressed pages are not modified; they are left in their own zpool. When a -request is made for a page in an old zpool, it is uncompressed using its -original compressor. Once all pages are removed from an old zpool, the zpool -and its compressor are freed. +When the compressor parameter is changed at runtime, any existing compressed +pages are not modified; they are left in their own pool. 
When a request is +made for a page in an old pool, it is uncompressed using its original +compressor. Once all pages are removed from an old pool, the pool and its +compressor are freed. Some of the pages in zswap are same-value filled pages (i.e. contents of the page have same value or repetitive pattern). These pages include zero-filled diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 5063179cfc70..68193a4cfcf5 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -118,7 +118,6 @@ More Memory Management Functions .. kernel-doc:: mm/memremap.c .. kernel-doc:: mm/hugetlb.c .. kernel-doc:: mm/swap.c -.. kernel-doc:: mm/zpool.c .. kernel-doc:: mm/memcontrol.c .. #kernel-doc:: mm/memory-tiers.c (build warnings) .. kernel-doc:: mm/shmem.c diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst index 8e50b900d51c..f815d4fd8372 100644 --- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst +++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst @@ -476,7 +476,6 @@ Use the following commands to enable zswap:: # echo 0 > /sys/module/zswap/parameters/enabled # echo 50 > /sys/module/zswap/parameters/max_pool_percent # echo deflate-iaa > /sys/module/zswap/parameters/compressor - # echo zsmalloc > /sys/module/zswap/parameters/zpool # echo 1 > /sys/module/zswap/parameters/enabled # echo 100 > /proc/sys/vm/swappiness # echo never > /sys/kernel/mm/transparent_hugepage/enabled @@ -625,7 +624,6 @@ the 'fixed' compression mode:: echo 0 > /sys/module/zswap/parameters/enabled echo 50 > /sys/module/zswap/parameters/max_pool_percent echo deflate-iaa > /sys/module/zswap/parameters/compressor - echo zsmalloc > /sys/module/zswap/parameters/zpool echo 1 > /sys/module/zswap/parameters/enabled echo 100 > /proc/sys/vm/swappiness diff --git a/MAINTAINERS b/MAINTAINERS index 9344c33c52e1..a7e123ddf05a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27879,9 +27879,7 @@ R: Chengming Zhou L: linux-mm@kvack.org S: Maintained F: Documentation/admin-guide/mm/zswap.rst -F: include/linux/zpool.h F: include/linux/zswap.h -F: mm/zpool.c F: mm/zswap.c F: tools/testing/selftests/cgroup/test_zswap.c diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 34eaee0384c9..2b8df0e9e42a 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -106,7 +106,6 @@ CONFIG_CMDLINE_PARTITION=y CONFIG_IOSCHED_BFQ=y CONFIG_BFQ_GROUP_IOSCHED=y CONFIG_BINFMT_MISC=m -CONFIG_ZPOOL=y CONFIG_ZSWAP=y CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD=y CONFIG_ZSMALLOC=y diff --git a/include/linux/zpool.h b/include/linux/zpool.h deleted file mode 100644 index 369ef068fad8..000000000000 --- a/include/linux/zpool.h +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * zpool memory storage api - * - * Copyright (C) 2014 Dan Streetman - * - * This is a common frontend for the zswap compressed memory storage - * implementations. 
- */ - -#ifndef _ZPOOL_H_ -#define _ZPOOL_H_ - -struct zpool; - -bool zpool_has_pool(char *type); - -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); - -const char *zpool_get_type(struct zpool *pool); - -void zpool_destroy_pool(struct zpool *pool); - -int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid); - -void zpool_free(struct zpool *pool, unsigned long handle); - -void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, - void *local_copy); - -void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, - void *handle_mem); - -void zpool_obj_write(struct zpool *zpool, unsigned long handle, - void *handle_mem, size_t mem_len); - -u64 zpool_get_total_pages(struct zpool *pool); - - -/** - * struct zpool_driver - driver implementation for zpool - * @type: name of the driver. - * @list: entry in the list of zpool drivers. - * @create: create a new pool. - * @destroy: destroy a pool. - * @malloc: allocate mem from a pool. - * @free: free mem from a pool. - * @sleep_mapped: whether zpool driver can sleep during map. - * @map: map a handle. - * @unmap: unmap a handle. - * @total_size: get total size of a pool. - * - * This is created by a zpool implementation and registered - * with zpool. - */ -struct zpool_driver { - char *type; - struct module *owner; - atomic_t refcount; - struct list_head list; - - void *(*create)(const char *name, gfp_t gfp); - void (*destroy)(void *pool); - - int (*malloc)(void *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid); - void (*free)(void *pool, unsigned long handle); - - void *(*obj_read_begin)(void *pool, unsigned long handle, - void *local_copy); - void (*obj_read_end)(void *pool, unsigned long handle, - void *handle_mem); - void (*obj_write)(void *pool, unsigned long handle, - void *handle_mem, size_t mem_len); - - u64 (*total_pages)(void *pool); -}; - -void zpool_register_driver(struct zpool_driver *driver); - -int zpool_unregister_driver(struct zpool_driver *driver); - -bool zpool_can_sleep_mapped(struct zpool *pool); - -#endif diff --git a/mm/Kconfig b/mm/Kconfig index 4108bcd96784..b971d35c43c3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -9,9 +9,6 @@ menu "Memory Management options" config ARCH_NO_SWAP bool -config ZPOOL - bool - menuconfig SWAP bool "Support for paging of anonymous memory (swap)" depends on MMU && BLOCK && !ARCH_NO_SWAP @@ -26,7 +23,7 @@ config ZSWAP bool "Compressed cache for swap pages" depends on SWAP select CRYPTO - select ZPOOL + select ZSMALLOC help A lightweight compressed cache for swap pages. It takes pages that are in the process of being swapped out and attempts to @@ -125,45 +122,18 @@ config ZSWAP_COMPRESSOR_DEFAULT default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD default "" -choice - prompt "Default allocator" - depends on ZSWAP - default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU - help - Selects the default allocator for the compressed cache for - swap pages. - The default is 'zbud' for compatibility, however please do - read the description of each of the allocators below before - making a right choice. - - The selection made here can be overridden by using the kernel - command line 'zswap.zpool=' option. - -config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - bool "zsmalloc" - select ZSMALLOC - help - Use the zsmalloc allocator as the default allocator. 
-endchoice - -config ZSWAP_ZPOOL_DEFAULT - string - depends on ZSWAP - default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC - default "" - config ZSMALLOC tristate - prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM) - depends on MMU - help - zsmalloc is a slab-based memory allocator designed to store - pages of various compression levels efficiently. It achieves - the highest storage density with the least amount of fragmentation. + +if ZSMALLOC + +menu "Zsmalloc allocator options" + depends on ZSMALLOC + +comment "Zsmalloc is a common backend allocator for zswap & zram" config ZSMALLOC_STAT bool "Export zsmalloc statistics" - depends on ZSMALLOC select DEBUG_FS help This option enables code in the zsmalloc to collect various @@ -175,7 +145,6 @@ config ZSMALLOC_CHAIN_SIZE int "Maximum number of physical pages per-zspage" default 8 range 4 16 - depends on ZSMALLOC help This option sets the upper limit on the number of physical pages that a zmalloc page (zspage) can consist of. The optimal zspage @@ -190,6 +159,10 @@ config ZSMALLOC_CHAIN_SIZE For more information, see zsmalloc documentation. +endmenu + +endif + menu "Slab allocator options" config SLUB diff --git a/mm/Makefile b/mm/Makefile index ef54aa615d9d..21abb3353550 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -115,7 +115,6 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o obj-$(CONFIG_PAGE_OWNER) += page_owner.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o -obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o diff --git a/mm/zpool.c b/mm/zpool.c deleted file mode 100644 index 0a71d03369f1..000000000000 --- a/mm/zpool.c +++ /dev/null @@ -1,328 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * zpool memory storage api - * - * Copyright (C) 2014 Dan Streetman - * - * This is a common frontend for memory storage pool implementations. - * Typically, this is used to store compressed memory. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include - -struct zpool { - struct zpool_driver *driver; - void *pool; -}; - -static LIST_HEAD(drivers_head); -static DEFINE_SPINLOCK(drivers_lock); - -/** - * zpool_register_driver() - register a zpool implementation. - * @driver: driver to register - */ -void zpool_register_driver(struct zpool_driver *driver) -{ - spin_lock(&drivers_lock); - atomic_set(&driver->refcount, 0); - list_add(&driver->list, &drivers_head); - spin_unlock(&drivers_lock); -} -EXPORT_SYMBOL(zpool_register_driver); - -/** - * zpool_unregister_driver() - unregister a zpool implementation. - * @driver: driver to unregister. - * - * Module usage counting is used to prevent using a driver - * while/after unloading, so if this is called from module - * exit function, this should never fail; if called from - * other than the module exit function, and this returns - * failure, the driver is in use and must remain available. - */ -int zpool_unregister_driver(struct zpool_driver *driver) -{ - int ret = 0, refcount; - - spin_lock(&drivers_lock); - refcount = atomic_read(&driver->refcount); - WARN_ON(refcount < 0); - if (refcount > 0) - ret = -EBUSY; - else - list_del(&driver->list); - spin_unlock(&drivers_lock); - - return ret; -} -EXPORT_SYMBOL(zpool_unregister_driver); - -/* this assumes @type is null-terminated. 
*/ -static struct zpool_driver *zpool_get_driver(const char *type) -{ - struct zpool_driver *driver; - - spin_lock(&drivers_lock); - list_for_each_entry(driver, &drivers_head, list) { - if (!strcmp(driver->type, type)) { - bool got = try_module_get(driver->owner); - - if (got) - atomic_inc(&driver->refcount); - spin_unlock(&drivers_lock); - return got ? driver : NULL; - } - } - - spin_unlock(&drivers_lock); - return NULL; -} - -static void zpool_put_driver(struct zpool_driver *driver) -{ - atomic_dec(&driver->refcount); - module_put(driver->owner); -} - -/** - * zpool_has_pool() - Check if the pool driver is available - * @type: The type of the zpool to check (e.g. zsmalloc) - * - * This checks if the @type pool driver is available. This will try to load - * the requested module, if needed, but there is no guarantee the module will - * still be loaded and available immediately after calling. If this returns - * true, the caller should assume the pool is available, but must be prepared - * to handle the @zpool_create_pool() returning failure. However if this - * returns false, the caller should assume the requested pool type is not - * available; either the requested pool type module does not exist, or could - * not be loaded, and calling @zpool_create_pool() with the pool type will - * fail. - * - * The @type string must be null-terminated. - * - * Returns: true if @type pool is available, false if not - */ -bool zpool_has_pool(char *type) -{ - struct zpool_driver *driver = zpool_get_driver(type); - - if (!driver) { - request_module("zpool-%s", type); - driver = zpool_get_driver(type); - } - - if (!driver) - return false; - - zpool_put_driver(driver); - return true; -} -EXPORT_SYMBOL(zpool_has_pool); - -/** - * zpool_create_pool() - Create a new zpool - * @type: The type of the zpool to create (e.g. zsmalloc) - * @name: The name of the zpool (e.g. zram0, zswap) - * @gfp: The GFP flags to use when allocating the pool. - * - * This creates a new zpool of the specified type. The gfp flags will be - * used when allocating memory, if the implementation supports it. If the - * ops param is NULL, then the created zpool will not be evictable. - * - * Implementations must guarantee this to be thread-safe. - * - * The @type and @name strings must be null-terminated. - * - * Returns: New zpool on success, NULL on failure. - */ -struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp) -{ - struct zpool_driver *driver; - struct zpool *zpool; - - pr_debug("creating pool type %s\n", type); - - driver = zpool_get_driver(type); - - if (!driver) { - request_module("zpool-%s", type); - driver = zpool_get_driver(type); - } - - if (!driver) { - pr_err("no driver for type %s\n", type); - return NULL; - } - - zpool = kmalloc(sizeof(*zpool), gfp); - if (!zpool) { - pr_err("couldn't create zpool - out of memory\n"); - zpool_put_driver(driver); - return NULL; - } - - zpool->driver = driver; - zpool->pool = driver->create(name, gfp); - - if (!zpool->pool) { - pr_err("couldn't create %s pool\n", type); - zpool_put_driver(driver); - kfree(zpool); - return NULL; - } - - pr_debug("created pool type %s\n", type); - - return zpool; -} - -/** - * zpool_destroy_pool() - Destroy a zpool - * @zpool: The zpool to destroy. - * - * Implementations must guarantee this to be thread-safe, - * however only when destroying different pools. The same - * pool should only be destroyed once, and should not be used - * after it is destroyed. - * - * This destroys an existing zpool. The zpool should not be in use. 
- */ -void zpool_destroy_pool(struct zpool *zpool) -{ - pr_debug("destroying pool type %s\n", zpool->driver->type); - - zpool->driver->destroy(zpool->pool); - zpool_put_driver(zpool->driver); - kfree(zpool); -} - -/** - * zpool_get_type() - Get the type of the zpool - * @zpool: The zpool to check - * - * This returns the type of the pool. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: The type of zpool. - */ -const char *zpool_get_type(struct zpool *zpool) -{ - return zpool->driver->type; -} - -/** - * zpool_malloc() - Allocate memory - * @zpool: The zpool to allocate from. - * @size: The amount of memory to allocate. - * @gfp: The GFP flags to use when allocating memory. - * @handle: Pointer to the handle to set - * @nid: The preferred node id. - * - * This allocates the requested amount of memory from the pool. - * The gfp flags will be used when allocating memory, if the - * implementation supports it. The provided @handle will be - * set to the allocated object handle. The allocation will - * prefer the NUMA node specified by @nid. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: 0 on success, negative value on error. - */ -int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid) -{ - return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid); -} - -/** - * zpool_free() - Free previously allocated memory - * @zpool: The zpool that allocated the memory. - * @handle: The handle to the memory to free. - * - * This frees previously allocated memory. This does not guarantee - * that the pool will actually free memory, only that the memory - * in the pool will become available for use by the pool. - * - * Implementations must guarantee this to be thread-safe, - * however only when freeing different handles. The same - * handle should only be freed once, and should not be used - * after freeing. - */ -void zpool_free(struct zpool *zpool, unsigned long handle) -{ - zpool->driver->free(zpool->pool, handle); -} - -/** - * zpool_obj_read_begin() - Start reading from a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @local_copy: A local buffer to use if needed. - * - * This starts a read operation of a previously allocated handle. The passed - * @local_copy buffer may be used if needed by copying the memory into. - * zpool_obj_read_end() MUST be called after the read is completed to undo any - * actions taken (e.g. release locks). - * - * Returns: A pointer to the handle memory to be read, if @local_copy is used, - * the returned pointer is @local_copy. - */ -void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, - void *local_copy) -{ - return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy); -} - -/** - * zpool_obj_read_end() - Finish reading from a previously allocated handle. - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @handle_mem: The pointer returned by zpool_obj_read_begin() - * - * Finishes a read operation previously started by zpool_obj_read_begin(). - */ -void zpool_obj_read_end(struct zpool *zpool, unsigned long handle, - void *handle_mem) -{ - zpool->driver->obj_read_end(zpool->pool, handle, handle_mem); -} - -/** - * zpool_obj_write() - Write to a previously allocated handle. 
- * @zpool: The zpool that the handle was allocated from - * @handle: The handle to read from - * @handle_mem: The memory to copy from into the handle. - * @mem_len: The length of memory to be written. - * - */ -void zpool_obj_write(struct zpool *zpool, unsigned long handle, - void *handle_mem, size_t mem_len) -{ - zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len); -} - -/** - * zpool_get_total_pages() - The total size of the pool - * @zpool: The zpool to check - * - * This returns the total size in pages of the pool. - * - * Returns: Total size of the zpool in pages. - */ -u64 zpool_get_total_pages(struct zpool *zpool) -{ - return zpool->driver->total_pages(zpool->pool); -} - -MODULE_AUTHOR("Dan Streetman "); -MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 153783d49d34..5bf832f9c05c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include "zpdesc.h" @@ -433,78 +432,6 @@ static void record_obj(unsigned long handle, unsigned long obj) *(unsigned long *)handle = obj; } -/* zpool driver */ - -#ifdef CONFIG_ZPOOL - -static void *zs_zpool_create(const char *name, gfp_t gfp) -{ - /* - * Ignore global gfp flags: zs_malloc() may be invoked from - * different contexts and its caller must provide a valid - * gfp mask. - */ - return zs_create_pool(name); -} - -static void zs_zpool_destroy(void *pool) -{ - zs_destroy_pool(pool); -} - -static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle, const int nid) -{ - *handle = zs_malloc(pool, size, gfp, nid); - - if (IS_ERR_VALUE(*handle)) - return PTR_ERR((void *)*handle); - return 0; -} -static void zs_zpool_free(void *pool, unsigned long handle) -{ - zs_free(pool, handle); -} - -static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle, - void *local_copy) -{ - return zs_obj_read_begin(pool, handle, local_copy); -} - -static void zs_zpool_obj_read_end(void *pool, unsigned long handle, - void *handle_mem) -{ - zs_obj_read_end(pool, handle, handle_mem); -} - -static void zs_zpool_obj_write(void *pool, unsigned long handle, - void *handle_mem, size_t mem_len) -{ - zs_obj_write(pool, handle, handle_mem, mem_len); -} - -static u64 zs_zpool_total_pages(void *pool) -{ - return zs_get_total_pages(pool); -} - -static struct zpool_driver zs_zpool_driver = { - .type = "zsmalloc", - .owner = THIS_MODULE, - .create = zs_zpool_create, - .destroy = zs_zpool_destroy, - .malloc = zs_zpool_malloc, - .free = zs_zpool_free, - .obj_read_begin = zs_zpool_obj_read_begin, - .obj_read_end = zs_zpool_obj_read_end, - .obj_write = zs_zpool_obj_write, - .total_pages = zs_zpool_total_pages, -}; - -MODULE_ALIAS("zpool-zsmalloc"); -#endif /* CONFIG_ZPOOL */ - static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc) { return PagePrivate(zpdesc_page(zpdesc)); @@ -2248,9 +2175,6 @@ static int __init zs_init(void) { int rc __maybe_unused; -#ifdef CONFIG_ZPOOL - zpool_register_driver(&zs_zpool_driver); -#endif #ifdef CONFIG_COMPACTION rc = set_movable_ops(&zsmalloc_mops, PGTY_zsmalloc); if (rc) @@ -2262,9 +2186,6 @@ static int __init zs_init(void) static void __exit zs_exit(void) { -#ifdef CONFIG_ZPOOL - zpool_unregister_driver(&zs_zpool_driver); -#endif #ifdef CONFIG_COMPACTION set_movable_ops(NULL, PGTY_zsmalloc); #endif diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README index 110b34834a6f..82921c75681c 100644 --- 
a/tools/testing/selftests/zram/README +++ b/tools/testing/selftests/zram/README @@ -14,7 +14,6 @@ Statistics for individual zram devices are exported through sysfs nodes at Kconfig required: CONFIG_ZRAM=y CONFIG_CRYPTO_LZ4=y -CONFIG_ZPOOL=y CONFIG_ZSMALLOC=y ZRAM Testcases From 2f5bd89ba9e322a9c4677837411203890286b53a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 29 Aug 2025 17:15:28 +0100 Subject: [PATCH 180/372] mm: zpdesc: minor naming and comment corrections zpdesc is the page descriptor used by the zsmalloc backend allocator, which in turn is used by zswap and zram. The zpool layer is gone. Link: https://lkml.kernel.org/r/20250829162212.208258-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Chengming Zhou Cc: Nhat Pham Cc: SeongJae Park Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zpdesc.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/zpdesc.h b/mm/zpdesc.h index 25bf5ea0beb8..b8258dc78548 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* zpdesc.h: zswap.zpool memory descriptor +/* zpdesc.h: zsmalloc pool memory descriptor * * Written by Alex Shi * Hyeonggon Yoo <42.hyeyoo@gmail.com> @@ -11,14 +11,14 @@ #include /* - * struct zpdesc - Memory descriptor for zpool memory. + * struct zpdesc - Memory descriptor for zsmalloc pool memory. * @flags: Page flags, mostly unused by zsmalloc. * @lru: Indirectly used by page migration. * @movable_ops: Used by page migration. - * @next: Next zpdesc in a zspage in zsmalloc zpool. - * @handle: For huge zspage in zsmalloc zpool. + * @next: Next zpdesc in a zspage in zsmalloc pool. + * @handle: For huge zspage in zsmalloc pool. * @zspage: Points to the zspage this zpdesc is a part of. - * @first_obj_offset: First object offset in zsmalloc zpool. + * @first_obj_offset: First object offset in zsmalloc pool. * @_refcount: The number of references to this zpdesc. * * This struct overlays struct page for now. Do not modify without a good @@ -79,8 +79,8 @@ static_assert(sizeof(struct zpdesc) <= sizeof(struct page)); * zpdesc_folio - The folio allocated for a zpdesc * @zp: The zpdesc. * - * Zpdescs are descriptors for zpool memory. The zpool memory itself is - * allocated as folios that contain the zpool objects, and zpdesc uses specific + * Zpdescs are descriptors for zsmalloc memory. The memory itself is allocated + * as folios that contain the zsmalloc objects, and zpdesc uses specific * fields in the first struct page of the folio - those fields are now accessed * by struct zpdesc. * From 4dfd4bba85785c88365b27af859ddb01c6fcf44a Mon Sep 17 00:00:00 2001 From: Ujwal Kundur Date: Fri, 29 Aug 2025 21:26:00 +0530 Subject: [PATCH 181/372] selftests/mm/uffd: refactor non-composite global vars into struct Refactor macros and non-composite global variable definitions into a struct that is defined at the start of a test and is passed around instead of relying on global vars. 
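
To make the shape of the change concrete, here is a minimal, self-contained
sketch of the same pattern (all names below are hypothetical stand-ins, not
taken from the selftest code): state that used to live in file-scope globals
moves into a context struct that every helper receives explicitly.

	#include <stdio.h>

	/* Hypothetical stand-in for the real uffd_global_test_opts_t. */
	struct test_ctx {
		unsigned long page_size;
		unsigned long nr_pages;
		int uffd;		/* -1 while no userfaultfd is open */
	};

	/* Previously a helper like this would read globals; now the
	 * context is an explicit parameter. */
	static unsigned long test_mem_size(const struct test_ctx *ctx)
	{
		return ctx->nr_pages * ctx->page_size;
	}

	int main(void)
	{
		struct test_ctx base = { .page_size = 4096, .nr_pages = 8, .uffd = -1 };
		struct test_ctx huge = { .page_size = 1UL << 21, .nr_pages = 2, .uffd = -1 };

		/* Independent contexts coexist without clobbering each other. */
		printf("base: %lu bytes, huge: %lu bytes\n",
		       test_mem_size(&base), test_mem_size(&huge));
		return 0;
	}

Passing the context around explicitly keeps each test's setup
self-describing and allows contexts with different page sizes to coexist
in one process, which global variables could not.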
Link: https://lkml.kernel.org/r/20250829155600.2000-1-ujwal.kundur@gmail.com
Signed-off-by: Ujwal Kundur
Acked-by: Peter Xu
Reviewed-by: Brendan Jackman
Cc: David Hildenbrand
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: Shuah Khan
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/mm/uffd-common.c     | 271 ++++-----
 tools/testing/selftests/mm/uffd-common.h     |  78 +--
 tools/testing/selftests/mm/uffd-stress.c     | 228 ++++----
 tools/testing/selftests/mm/uffd-unit-tests.c | 559 ++++++++++---------
 tools/testing/selftests/mm/uffd-wp-mremap.c  |  20 +-
 5 files changed, 615 insertions(+), 541 deletions(-)

diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c
index a37088a23ffe..994fe8c03923 100644
--- a/tools/testing/selftests/mm/uffd-common.c
+++ b/tools/testing/selftests/mm/uffd-common.c
@@ -7,18 +7,29 @@
 #include "uffd-common.h"

-#define BASE_PMD_ADDR ((void *)(1UL << 30))
-
-volatile bool test_uffdio_copy_eexist = true;
-unsigned long nr_parallel, nr_pages, nr_pages_per_cpu, page_size;
-char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
-int uffd = -1, uffd_flags, finished, *pipefd, test_type;
-bool map_shared;
-bool test_uffdio_wp = true;
-unsigned long long *count_verify;
 uffd_test_ops_t *uffd_test_ops;
 uffd_test_case_ops_t *uffd_test_case_ops;
-atomic_bool ready_for_fork;
+
+#define BASE_PMD_ADDR ((void *)(1UL << 30))
+
+/* pthread_mutex_t starts at page offset 0 */
+pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts)
+{
+	return (pthread_mutex_t *) (area + nr * gopts->page_size);
+}
+
+/*
+ * count is placed in the page after pthread_mutex_t, naturally aligned
+ * to avoid misalignment faults on non-x86 archs.
+ */ +volatile unsigned long long *area_count(char *area, unsigned long nr, + uffd_global_test_opts_t *gopts) +{ + return (volatile unsigned long long *) + ((unsigned long)(area + nr * gopts->page_size + + sizeof(pthread_mutex_t) + sizeof(unsigned long long) - 1) & + ~(unsigned long)(sizeof(unsigned long long) - 1)); +} static int uffd_mem_fd_create(off_t mem_size, bool hugetlb) { @@ -40,15 +51,15 @@ static int uffd_mem_fd_create(off_t mem_size, bool hugetlb) return mem_fd; } -static void anon_release_pages(char *rel_area) +static void anon_release_pages(uffd_global_test_opts_t *gopts, char *rel_area) { - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + if (madvise(rel_area, gopts->nr_pages * gopts->page_size, MADV_DONTNEED)) err("madvise(MADV_DONTNEED) failed"); } -static int anon_allocate_area(void **alloc_area, bool is_src) +static int anon_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src) { - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + *alloc_area = mmap(NULL, gopts->nr_pages * gopts->page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (*alloc_area == MAP_FAILED) { *alloc_area = NULL; @@ -57,31 +68,32 @@ static int anon_allocate_area(void **alloc_area, bool is_src) return 0; } -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) +static void noop_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start, + size_t len, unsigned long offset) { } -static void hugetlb_release_pages(char *rel_area) +static void hugetlb_release_pages(uffd_global_test_opts_t *gopts, char *rel_area) { - if (!map_shared) { - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + if (!gopts->map_shared) { + if (madvise(rel_area, gopts->nr_pages * gopts->page_size, MADV_DONTNEED)) err("madvise(MADV_DONTNEED) failed"); } else { - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + if (madvise(rel_area, gopts->nr_pages * gopts->page_size, MADV_REMOVE)) err("madvise(MADV_REMOVE) failed"); } } -static int hugetlb_allocate_area(void **alloc_area, bool is_src) +static int hugetlb_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src) { - off_t size = nr_pages * page_size; + off_t size = gopts->nr_pages * gopts->page_size; off_t offset = is_src ? 0 : size; void *area_alias = NULL; char **alloc_area_alias; int mem_fd = uffd_mem_fd_create(size * 2, true); *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, - (map_shared ? MAP_SHARED : MAP_PRIVATE) | + (gopts->map_shared ? MAP_SHARED : MAP_PRIVATE) | (is_src ? 
0 : MAP_NORESERVE), mem_fd, offset); if (*alloc_area == MAP_FAILED) { @@ -89,7 +101,7 @@ static int hugetlb_allocate_area(void **alloc_area, bool is_src) return -errno; } - if (map_shared) { + if (gopts->map_shared) { area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, mem_fd, offset); if (area_alias == MAP_FAILED) @@ -97,9 +109,9 @@ static int hugetlb_allocate_area(void **alloc_area, bool is_src) } if (is_src) { - alloc_area_alias = &area_src_alias; + alloc_area_alias = &gopts->area_src_alias; } else { - alloc_area_alias = &area_dst_alias; + alloc_area_alias = &gopts->area_dst_alias; } if (area_alias) *alloc_area_alias = area_alias; @@ -108,24 +120,25 @@ static int hugetlb_allocate_area(void **alloc_area, bool is_src) return 0; } -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) +static void hugetlb_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start, + size_t len, unsigned long offset) { - if (!map_shared) + if (!gopts->map_shared) return; - *start = (unsigned long) area_dst_alias + offset; + *start = (unsigned long) gopts->area_dst_alias + offset; } -static void shmem_release_pages(char *rel_area) +static void shmem_release_pages(uffd_global_test_opts_t *gopts, char *rel_area) { - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + if (madvise(rel_area, gopts->nr_pages * gopts->page_size, MADV_REMOVE)) err("madvise(MADV_REMOVE) failed"); } -static int shmem_allocate_area(void **alloc_area, bool is_src) +static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src) { void *area_alias = NULL; - size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize(); + size_t bytes = gopts->nr_pages * gopts->page_size, hpage_size = read_pmd_pagesize(); unsigned long offset = is_src ? 
0 : bytes; char *p = NULL, *p_alias = NULL; int mem_fd = uffd_mem_fd_create(bytes * 2, false); @@ -159,22 +172,23 @@ static int shmem_allocate_area(void **alloc_area, bool is_src) err("mmap of anonymous memory failed at %p", p_alias); if (is_src) - area_src_alias = area_alias; + gopts->area_src_alias = area_alias; else - area_dst_alias = area_alias; + gopts->area_dst_alias = area_alias; close(mem_fd); return 0; } -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +static void shmem_alias_mapping(uffd_global_test_opts_t *gopts, __u64 *start, + size_t len, unsigned long offset) { - *start = (unsigned long)area_dst_alias + offset; + *start = (unsigned long)gopts->area_dst_alias + offset; } -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +static void shmem_check_pmd_mapping(uffd_global_test_opts_t *gopts, void *p, int expect_nr_hpages) { - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, + if (!check_huge_shmem(gopts->area_dst_alias, expect_nr_hpages, read_pmd_pagesize())) err("Did not find expected %d number of hugepages", expect_nr_hpages); @@ -234,18 +248,18 @@ void uffd_stats_report(struct uffd_args *args, int n_cpus) printf("\n"); } -int userfaultfd_open(uint64_t *features) +int userfaultfd_open(uffd_global_test_opts_t *gopts, uint64_t *features) { struct uffdio_api uffdio_api; - uffd = uffd_open(UFFD_FLAGS); - if (uffd < 0) + gopts->uffd = uffd_open(UFFD_FLAGS); + if (gopts->uffd < 0) return -1; - uffd_flags = fcntl(uffd, F_GETFD, NULL); + gopts->uffd_flags = fcntl(gopts->uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; uffdio_api.features = *features; - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + if (ioctl(gopts->uffd, UFFDIO_API, &uffdio_api)) /* Probably lack of CAP_PTRACE? */ return -1; if (uffdio_api.api != UFFD_API) @@ -255,59 +269,63 @@ int userfaultfd_open(uint64_t *features) return 0; } -static inline void munmap_area(void **area) +static inline void munmap_area(uffd_global_test_opts_t *gopts, void **area) { if (*area) - if (munmap(*area, nr_pages * page_size)) + if (munmap(*area, gopts->nr_pages * gopts->page_size)) err("munmap"); *area = NULL; } -void uffd_test_ctx_clear(void) +void uffd_test_ctx_clear(uffd_global_test_opts_t *gopts) { size_t i; - if (pipefd) { - for (i = 0; i < nr_parallel * 2; ++i) { - if (close(pipefd[i])) + if (gopts->pipefd) { + for (i = 0; i < gopts->nr_parallel * 2; ++i) { + if (close(gopts->pipefd[i])) err("close pipefd"); } - free(pipefd); - pipefd = NULL; + free(gopts->pipefd); + gopts->pipefd = NULL; } - if (count_verify) { - free(count_verify); - count_verify = NULL; + if (gopts->count_verify) { + free(gopts->count_verify); + gopts->count_verify = NULL; } - if (uffd != -1) { - if (close(uffd)) + if (gopts->uffd != -1) { + if (close(gopts->uffd)) err("close uffd"); - uffd = -1; + gopts->uffd = -1; } - munmap_area((void **)&area_src); - munmap_area((void **)&area_src_alias); - munmap_area((void **)&area_dst); - munmap_area((void **)&area_dst_alias); - munmap_area((void **)&area_remap); + munmap_area(gopts, (void **)&gopts->area_src); + munmap_area(gopts, (void **)&gopts->area_src_alias); + munmap_area(gopts, (void **)&gopts->area_dst); + munmap_area(gopts, (void **)&gopts->area_dst_alias); + munmap_area(gopts, (void **)&gopts->area_remap); } -int uffd_test_ctx_init(uint64_t features, const char **errmsg) +int uffd_test_ctx_init(uffd_global_test_opts_t *gopts, uint64_t features, const char **errmsg) { unsigned long nr, cpu; int ret; + gopts->area_src_alias = NULL; + gopts->area_dst_alias = NULL; 
+ gopts->area_remap = NULL; + if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) { - ret = uffd_test_case_ops->pre_alloc(errmsg); + ret = uffd_test_case_ops->pre_alloc(gopts, errmsg); if (ret) return ret; } - ret = uffd_test_ops->allocate_area((void **)&area_src, true); - ret |= uffd_test_ops->allocate_area((void **)&area_dst, false); + ret = uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_src, true); + ret |= uffd_test_ops->allocate_area(gopts, (void **) &gopts->area_dst, false); if (ret) { if (errmsg) *errmsg = "memory allocation failed"; @@ -315,26 +333,26 @@ int uffd_test_ctx_init(uint64_t features, const char **errmsg) } if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) { - ret = uffd_test_case_ops->post_alloc(errmsg); + ret = uffd_test_case_ops->post_alloc(gopts, errmsg); if (ret) return ret; } - ret = userfaultfd_open(&features); + ret = userfaultfd_open(gopts, &features); if (ret) { if (errmsg) *errmsg = "possible lack of privilege"; return ret; } - count_verify = malloc(nr_pages * sizeof(unsigned long long)); - if (!count_verify) + gopts->count_verify = malloc(gopts->nr_pages * sizeof(unsigned long long)); + if (!gopts->count_verify) err("count_verify"); - for (nr = 0; nr < nr_pages; nr++) { - *area_mutex(area_src, nr) = + for (nr = 0; nr < gopts->nr_pages; nr++) { + *area_mutex(gopts->area_src, nr, gopts) = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; - count_verify[nr] = *area_count(area_src, nr) = 1; + gopts->count_verify[nr] = *area_count(gopts->area_src, nr, gopts) = 1; /* * In the transition between 255 to 256, powerpc will * read out of order in my_bcmp and see both bytes as @@ -342,7 +360,7 @@ int uffd_test_ctx_init(uint64_t features, const char **errmsg) * after the count, to avoid my_bcmp to trigger false * positives. */ - *(area_count(area_src, nr) + 1) = 1; + *(area_count(gopts->area_src, nr, gopts) + 1) = 1; } /* @@ -363,13 +381,13 @@ int uffd_test_ctx_init(uint64_t features, const char **errmsg) * proactively split the thp and drop any accidentally initialized * pages within area_dst. 
*/ - uffd_test_ops->release_pages(area_dst); + uffd_test_ops->release_pages(gopts, gopts->area_dst); - pipefd = malloc(sizeof(int) * nr_parallel * 2); - if (!pipefd) + gopts->pipefd = malloc(sizeof(int) * gopts->nr_parallel * 2); + if (!gopts->pipefd) err("pipefd"); - for (cpu = 0; cpu < nr_parallel; cpu++) - if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) + for (cpu = 0; cpu < gopts->nr_parallel; cpu++) + if (pipe2(&gopts->pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) err("pipe"); return 0; @@ -416,9 +434,9 @@ static void continue_range(int ufd, __u64 start, __u64 len, bool wp) ret, (int64_t) req.mapped); } -int uffd_read_msg(int ufd, struct uffd_msg *msg) +int uffd_read_msg(uffd_global_test_opts_t *gopts, struct uffd_msg *msg) { - int ret = read(uffd, msg, sizeof(*msg)); + int ret = read(gopts->uffd, msg, sizeof(*msg)); if (ret != sizeof(*msg)) { if (ret < 0) { @@ -433,7 +451,8 @@ int uffd_read_msg(int ufd, struct uffd_msg *msg) return 0; } -void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) +void uffd_handle_page_fault(uffd_global_test_opts_t *gopts, struct uffd_msg *msg, + struct uffd_args *args) { unsigned long offset; @@ -442,7 +461,7 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { /* Write protect page faults */ - wp_range(uffd, msg->arg.pagefault.address, page_size, false); + wp_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size, false); args->wp_faults++; } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { uint8_t *area; @@ -460,12 +479,12 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) * (UFFD-registered). */ - area = (uint8_t *)(area_dst + - ((char *)msg->arg.pagefault.address - - area_dst_alias)); - for (b = 0; b < page_size; ++b) + area = (uint8_t *)(gopts->area_dst + + ((char *)msg->arg.pagefault.address - + gopts->area_dst_alias)); + for (b = 0; b < gopts->page_size; ++b) area[b] = ~area[b]; - continue_range(uffd, msg->arg.pagefault.address, page_size, + continue_range(gopts->uffd, msg->arg.pagefault.address, gopts->page_size, args->apply_wp); args->minor_faults++; } else { @@ -493,10 +512,10 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) err("unexpected write fault"); - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); + offset = (char *)(unsigned long)msg->arg.pagefault.address - gopts->area_dst; + offset &= ~(gopts->page_size-1); - if (copy_page(uffd, offset, args->apply_wp)) + if (copy_page(gopts, offset, args->apply_wp)) args->missing_faults++; } } @@ -504,6 +523,7 @@ void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args) void *uffd_poll_thread(void *arg) { struct uffd_args *args = (struct uffd_args *)arg; + uffd_global_test_opts_t *gopts = args->gopts; unsigned long cpu = args->cpu; struct pollfd pollfd[2]; struct uffd_msg msg; @@ -514,12 +534,12 @@ void *uffd_poll_thread(void *arg) if (!args->handle_fault) args->handle_fault = uffd_handle_page_fault; - pollfd[0].fd = uffd; + pollfd[0].fd = gopts->uffd; pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].fd = gopts->pipefd[cpu*2]; pollfd[1].events = POLLIN; - ready_for_fork = true; + gopts->ready_for_fork = true; for (;;) { ret = poll(pollfd, 2, -1); @@ -537,30 +557,30 @@ void *uffd_poll_thread(void *arg) } if (!(pollfd[0].revents & POLLIN)) err("pollfd[0].revents %d", 
pollfd[0].revents); - if (uffd_read_msg(uffd, &msg)) + if (uffd_read_msg(gopts, &msg)) continue; switch (msg.event) { default: err("unexpected msg event %u\n", msg.event); break; case UFFD_EVENT_PAGEFAULT: - args->handle_fault(&msg, args); + args->handle_fault(gopts, &msg, args); break; case UFFD_EVENT_FORK: - close(uffd); - uffd = msg.arg.fork.ufd; - pollfd[0].fd = uffd; + close(gopts->uffd); + gopts->uffd = msg.arg.fork.ufd; + pollfd[0].fd = gopts->uffd; break; case UFFD_EVENT_REMOVE: uffd_reg.range.start = msg.arg.remove.start; uffd_reg.range.len = msg.arg.remove.end - msg.arg.remove.start; - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + if (ioctl(gopts->uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) err("remove failure"); break; case UFFD_EVENT_REMAP: - area_remap = area_dst; /* save for later unmap */ - area_dst = (char *)(unsigned long)msg.arg.remap.to; + gopts->area_remap = gopts->area_dst; /* save for later unmap */ + gopts->area_dst = (char *)(unsigned long)msg.arg.remap.to; break; } } @@ -568,17 +588,18 @@ void *uffd_poll_thread(void *arg) return NULL; } -static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, +static void retry_copy_page(uffd_global_test_opts_t *gopts, struct uffdio_copy *uffdio_copy, unsigned long offset) { - uffd_test_ops->alias_mapping(&uffdio_copy->dst, + uffd_test_ops->alias_mapping(gopts, + &uffdio_copy->dst, uffdio_copy->len, offset); - if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { + if (ioctl(gopts->uffd, UFFDIO_COPY, uffdio_copy)) { /* real retval in ufdio_copy.copy */ if (uffdio_copy->copy != -EEXIST) err("UFFDIO_COPY retry error: %"PRId64, - (int64_t)uffdio_copy->copy); + (int64_t)uffdio_copy->copy); } else { err("UFFDIO_COPY retry unexpected: %"PRId64, (int64_t)uffdio_copy->copy); @@ -597,60 +618,60 @@ static void wake_range(int ufd, unsigned long addr, unsigned long len) addr), exit(1); } -int __copy_page(int ufd, unsigned long offset, bool retry, bool wp) +int __copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool retry, bool wp) { struct uffdio_copy uffdio_copy; - if (offset >= nr_pages * page_size) + if (offset >= gopts->nr_pages * gopts->page_size) err("unexpected offset %lu\n", offset); - uffdio_copy.dst = (unsigned long) area_dst + offset; - uffdio_copy.src = (unsigned long) area_src + offset; - uffdio_copy.len = page_size; + uffdio_copy.dst = (unsigned long) gopts->area_dst + offset; + uffdio_copy.src = (unsigned long) gopts->area_src + offset; + uffdio_copy.len = gopts->page_size; if (wp) uffdio_copy.mode = UFFDIO_COPY_MODE_WP; else uffdio_copy.mode = 0; uffdio_copy.copy = 0; - if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { + if (ioctl(gopts->uffd, UFFDIO_COPY, &uffdio_copy)) { /* real retval in ufdio_copy.copy */ if (uffdio_copy.copy != -EEXIST) err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); - wake_range(ufd, uffdio_copy.dst, page_size); - } else if (uffdio_copy.copy != page_size) { + wake_range(gopts->uffd, uffdio_copy.dst, gopts->page_size); + } else if (uffdio_copy.copy != gopts->page_size) { err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); } else { - if (test_uffdio_copy_eexist && retry) { - test_uffdio_copy_eexist = false; - retry_copy_page(ufd, &uffdio_copy, offset); + if (gopts->test_uffdio_copy_eexist && retry) { + gopts->test_uffdio_copy_eexist = false; + retry_copy_page(gopts, &uffdio_copy, offset); } return 1; } return 0; } -int copy_page(int ufd, unsigned long offset, bool wp) +int copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool wp) { - return 
__copy_page(ufd, offset, false, wp); + return __copy_page(gopts, offset, false, wp); } -int move_page(int ufd, unsigned long offset, unsigned long len) +int move_page(uffd_global_test_opts_t *gopts, unsigned long offset, unsigned long len) { struct uffdio_move uffdio_move; - if (offset + len > nr_pages * page_size) + if (offset + len > gopts->nr_pages * gopts->page_size) err("unexpected offset %lu and length %lu\n", offset, len); - uffdio_move.dst = (unsigned long) area_dst + offset; - uffdio_move.src = (unsigned long) area_src + offset; + uffdio_move.dst = (unsigned long) gopts->area_dst + offset; + uffdio_move.src = (unsigned long) gopts->area_src + offset; uffdio_move.len = len; uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES; uffdio_move.move = 0; - if (ioctl(ufd, UFFDIO_MOVE, &uffdio_move)) { + if (ioctl(gopts->uffd, UFFDIO_MOVE, &uffdio_move)) { /* real retval in uffdio_move.move */ if (uffdio_move.move != -EEXIST) err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move); - wake_range(ufd, uffdio_move.dst, len); + wake_range(gopts->uffd, uffdio_move.dst, len); } else if (uffdio_move.move != len) { err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move); } else diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 7700cbfa3975..37d3ca55905f 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -56,20 +56,17 @@ #define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) -/* pthread_mutex_t starts at page offset 0 */ -#define area_mutex(___area, ___nr) \ - ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) -/* - * count is placed in the page after pthread_mutex_t naturally aligned - * to avoid non alignment faults on non-x86 archs. - */ -#define area_count(___area, ___nr) \ - ((volatile unsigned long long *) ((unsigned long) \ - ((___area) + (___nr)*page_size + \ - sizeof(pthread_mutex_t) + \ - sizeof(unsigned long long) - 1) & \ - ~(unsigned long)(sizeof(unsigned long long) \ - - 1))) +struct uffd_global_test_opts { + unsigned long nr_parallel, nr_pages, nr_pages_per_cpu, page_size; + char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; + int uffd, uffd_flags, finished, *pipefd, test_type; + bool map_shared; + bool test_uffdio_wp; + unsigned long long *count_verify; + volatile bool test_uffdio_copy_eexist; + atomic_bool ready_for_fork; +}; +typedef struct uffd_global_test_opts uffd_global_test_opts_t; /* Userfaultfd test statistics */ struct uffd_args { @@ -79,50 +76,55 @@ struct uffd_args { unsigned long missing_faults; unsigned long wp_faults; unsigned long minor_faults; + struct uffd_global_test_opts *gopts; /* A custom fault handler; defaults to uffd_handle_page_fault. 
*/ - void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args); + void (*handle_fault)(struct uffd_global_test_opts *gopts, + struct uffd_msg *msg, + struct uffd_args *args); }; struct uffd_test_ops { - int (*allocate_area)(void **alloc_area, bool is_src); - void (*release_pages)(char *rel_area); - void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); - void (*check_pmd_mapping)(void *p, int expect_nr_hpages); + int (*allocate_area)(uffd_global_test_opts_t *gopts, void **alloc_area, bool is_src); + void (*release_pages)(uffd_global_test_opts_t *gopts, char *rel_area); + void (*alias_mapping)(uffd_global_test_opts_t *gopts, + __u64 *start, + size_t len, + unsigned long offset); + void (*check_pmd_mapping)(uffd_global_test_opts_t *gopts, void *p, int expect_nr_hpages); }; typedef struct uffd_test_ops uffd_test_ops_t; struct uffd_test_case_ops { - int (*pre_alloc)(const char **errmsg); - int (*post_alloc)(const char **errmsg); + int (*pre_alloc)(uffd_global_test_opts_t *gopts, const char **errmsg); + int (*post_alloc)(uffd_global_test_opts_t *gopts, const char **errmsg); }; typedef struct uffd_test_case_ops uffd_test_case_ops_t; -extern unsigned long nr_parallel, nr_pages, nr_pages_per_cpu, page_size; -extern char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; -extern int uffd, uffd_flags, finished, *pipefd, test_type; -extern bool map_shared; -extern bool test_uffdio_wp; -extern unsigned long long *count_verify; -extern volatile bool test_uffdio_copy_eexist; -extern atomic_bool ready_for_fork; - +extern uffd_global_test_opts_t *uffd_gtest_opts; extern uffd_test_ops_t anon_uffd_test_ops; extern uffd_test_ops_t shmem_uffd_test_ops; extern uffd_test_ops_t hugetlb_uffd_test_ops; extern uffd_test_ops_t *uffd_test_ops; extern uffd_test_case_ops_t *uffd_test_case_ops; +pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts); +volatile unsigned long long *area_count(char *area, + unsigned long nr, + uffd_global_test_opts_t *gopts); + void uffd_stats_report(struct uffd_args *args, int n_cpus); -int uffd_test_ctx_init(uint64_t features, const char **errmsg); -void uffd_test_ctx_clear(void); -int userfaultfd_open(uint64_t *features); -int uffd_read_msg(int ufd, struct uffd_msg *msg); +int uffd_test_ctx_init(uffd_global_test_opts_t *gopts, uint64_t features, const char **errmsg); +void uffd_test_ctx_clear(uffd_global_test_opts_t *gopts); +int userfaultfd_open(uffd_global_test_opts_t *gopts, uint64_t *features); +int uffd_read_msg(uffd_global_test_opts_t *gopts, struct uffd_msg *msg); void wp_range(int ufd, __u64 start, __u64 len, bool wp); -void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args); -int __copy_page(int ufd, unsigned long offset, bool retry, bool wp); -int copy_page(int ufd, unsigned long offset, bool wp); -int move_page(int ufd, unsigned long offset, unsigned long len); +void uffd_handle_page_fault(uffd_global_test_opts_t *gopts, + struct uffd_msg *msg, + struct uffd_args *args); +int __copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool retry, bool wp); +int copy_page(uffd_global_test_opts_t *gopts, unsigned long offset, bool wp); +int move_page(uffd_global_test_opts_t *gopts, unsigned long offset, unsigned long len); void *uffd_poll_thread(void *arg); int uffd_open_dev(unsigned int flags); diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index ecd016329935..b51c89e1cd1a 100644 --- a/tools/testing/selftests/mm/uffd-stress.c 
+++ b/tools/testing/selftests/mm/uffd-stress.c @@ -44,6 +44,12 @@ uint64_t features; #define BOUNCE_VERIFY (1<<2) #define BOUNCE_POLL (1<<3) static int bounces; +/* defined globally for this particular test as the sigalrm handler + * depends on test_uffdio_*_eexist. + * XXX: define gopts in main() when we figure out a way to deal with + * test_uffdio_*_eexist. + */ +static uffd_global_test_opts_t *gopts; /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ #define ALARM_INTERVAL_SECS 10 @@ -76,54 +82,58 @@ static void usage(void) exit(1); } -static void uffd_stats_reset(struct uffd_args *args, unsigned long n_cpus) +static void uffd_stats_reset(uffd_global_test_opts_t *gopts, struct uffd_args *args, + unsigned long n_cpus) { int i; for (i = 0; i < n_cpus; i++) { args[i].cpu = i; - args[i].apply_wp = test_uffdio_wp; + args[i].apply_wp = gopts->test_uffdio_wp; args[i].missing_faults = 0; args[i].wp_faults = 0; args[i].minor_faults = 0; + args[i].gopts = gopts; } } static void *locking_thread(void *arg) { - unsigned long cpu = (unsigned long) arg; + struct uffd_args *args = (struct uffd_args *) arg; + uffd_global_test_opts_t *gopts = args->gopts; + unsigned long cpu = (unsigned long) args->cpu; unsigned long page_nr; unsigned long long count; if (!(bounces & BOUNCE_RANDOM)) { page_nr = -bounces; if (!(bounces & BOUNCE_RACINGFAULTS)) - page_nr += cpu * nr_pages_per_cpu; + page_nr += cpu * gopts->nr_pages_per_cpu; } - while (!finished) { + while (!gopts->finished) { if (bounces & BOUNCE_RANDOM) { if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) err("getrandom failed"); } else page_nr += 1; - page_nr %= nr_pages; - pthread_mutex_lock(area_mutex(area_dst, page_nr)); - count = *area_count(area_dst, page_nr); - if (count != count_verify[page_nr]) + page_nr %= gopts->nr_pages; + pthread_mutex_lock(area_mutex(gopts->area_dst, page_nr, gopts)); + count = *area_count(gopts->area_dst, page_nr, gopts); + if (count != gopts->count_verify[page_nr]) err("page_nr %lu memory corruption %llu %llu", - page_nr, count, count_verify[page_nr]); + page_nr, count, gopts->count_verify[page_nr]); count++; - *area_count(area_dst, page_nr) = count_verify[page_nr] = count; - pthread_mutex_unlock(area_mutex(area_dst, page_nr)); + *area_count(gopts->area_dst, page_nr, gopts) = gopts->count_verify[page_nr] = count; + pthread_mutex_unlock(area_mutex(gopts->area_dst, page_nr, gopts)); } return NULL; } -static int copy_page_retry(int ufd, unsigned long offset) +static int copy_page_retry(uffd_global_test_opts_t *gopts, unsigned long offset) { - return __copy_page(ufd, offset, true, test_uffdio_wp); + return __copy_page(gopts, offset, true, gopts->test_uffdio_wp); } pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -131,15 +141,16 @@ pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; static void *uffd_read_thread(void *arg) { struct uffd_args *args = (struct uffd_args *)arg; + uffd_global_test_opts_t *gopts = args->gopts; struct uffd_msg msg; pthread_mutex_unlock(&uffd_read_mutex); /* from here cancellation is ok */ for (;;) { - if (uffd_read_msg(uffd, &msg)) + if (uffd_read_msg(gopts, &msg)) continue; - uffd_handle_page_fault(&msg, args); + uffd_handle_page_fault(gopts, &msg, args); } return NULL; @@ -147,32 +158,34 @@ static void *uffd_read_thread(void *arg) static void *background_thread(void *arg) { - unsigned long cpu = (unsigned long) arg; + struct uffd_args *args = (struct uffd_args *) arg; + uffd_global_test_opts_t *gopts = args->gopts; + unsigned long cpu = (unsigned 
long) args->cpu; unsigned long page_nr, start_nr, mid_nr, end_nr; - start_nr = cpu * nr_pages_per_cpu; - end_nr = (cpu+1) * nr_pages_per_cpu; + start_nr = cpu * gopts->nr_pages_per_cpu; + end_nr = (cpu+1) * gopts->nr_pages_per_cpu; mid_nr = (start_nr + end_nr) / 2; /* Copy the first half of the pages */ for (page_nr = start_nr; page_nr < mid_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); + copy_page_retry(gopts, page_nr * gopts->page_size); /* * If we need to test uffd-wp, set it up now. Then we'll have * at least the first half of the pages mapped already which * can be write-protected for testing */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, - nr_pages_per_cpu * page_size, true); + if (gopts->test_uffdio_wp) + wp_range(gopts->uffd, (unsigned long)gopts->area_dst + start_nr * gopts->page_size, + gopts->nr_pages_per_cpu * gopts->page_size, true); /* * Continue the 2nd half of the page copying, handling write * protection faults if any */ for (page_nr = mid_nr; page_nr < end_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); + copy_page_retry(gopts, page_nr * gopts->page_size); return NULL; } @@ -180,17 +193,21 @@ static void *background_thread(void *arg) static int stress(struct uffd_args *args) { unsigned long cpu; - pthread_t locking_threads[nr_parallel]; - pthread_t uffd_threads[nr_parallel]; - pthread_t background_threads[nr_parallel]; + uffd_global_test_opts_t *gopts = args->gopts; + pthread_t locking_threads[gopts->nr_parallel]; + pthread_t uffd_threads[gopts->nr_parallel]; + pthread_t background_threads[gopts->nr_parallel]; - finished = 0; - for (cpu = 0; cpu < nr_parallel; cpu++) { + gopts->finished = 0; + for (cpu = 0; cpu < gopts->nr_parallel; cpu++) { if (pthread_create(&locking_threads[cpu], &attr, - locking_thread, (void *)cpu)) + locking_thread, (void *)&args[cpu])) return 1; if (bounces & BOUNCE_POLL) { - if (pthread_create(&uffd_threads[cpu], &attr, uffd_poll_thread, &args[cpu])) + if (pthread_create(&uffd_threads[cpu], + &attr, + uffd_poll_thread, + (void *) &args[cpu])) err("uffd_poll_thread create"); } else { if (pthread_create(&uffd_threads[cpu], &attr, @@ -200,10 +217,10 @@ static int stress(struct uffd_args *args) pthread_mutex_lock(&uffd_read_mutex); } if (pthread_create(&background_threads[cpu], &attr, - background_thread, (void *)cpu)) + background_thread, (void *)&args[cpu])) return 1; } - for (cpu = 0; cpu < nr_parallel; cpu++) + for (cpu = 0; cpu < gopts->nr_parallel; cpu++) if (pthread_join(background_threads[cpu], NULL)) return 1; @@ -216,17 +233,17 @@ static int stress(struct uffd_args *args) * UFFDIO_COPY without writing zero pages into area_dst * because the background threads already completed). 
*/ - uffd_test_ops->release_pages(area_src); + uffd_test_ops->release_pages(gopts, gopts->area_src); - finished = 1; - for (cpu = 0; cpu < nr_parallel; cpu++) + gopts->finished = 1; + for (cpu = 0; cpu < gopts->nr_parallel; cpu++) if (pthread_join(locking_threads[cpu], NULL)) return 1; - for (cpu = 0; cpu < nr_parallel; cpu++) { + for (cpu = 0; cpu < gopts->nr_parallel; cpu++) { char c; if (bounces & BOUNCE_POLL) { - if (write(pipefd[cpu*2+1], &c, 1) != 1) + if (write(gopts->pipefd[cpu*2+1], &c, 1) != 1) err("pipefd write error"); if (pthread_join(uffd_threads[cpu], (void *)&args[cpu])) @@ -242,26 +259,26 @@ static int stress(struct uffd_args *args) return 0; } -static int userfaultfd_stress(void) +static int userfaultfd_stress(uffd_global_test_opts_t *gopts) { void *area; unsigned long nr; - struct uffd_args args[nr_parallel]; - uint64_t mem_size = nr_pages * page_size; + struct uffd_args args[gopts->nr_parallel]; + uint64_t mem_size = gopts->nr_pages * gopts->page_size; int flags = 0; - memset(args, 0, sizeof(struct uffd_args) * nr_parallel); + memset(args, 0, sizeof(struct uffd_args) * gopts->nr_parallel); - if (features & UFFD_FEATURE_WP_UNPOPULATED && test_type == TEST_ANON) + if (features & UFFD_FEATURE_WP_UNPOPULATED && gopts->test_type == TEST_ANON) flags = UFFD_FEATURE_WP_UNPOPULATED; - if (uffd_test_ctx_init(flags, NULL)) + if (uffd_test_ctx_init(gopts, flags, NULL)) err("context init failed"); - if (posix_memalign(&area, page_size, page_size)) + if (posix_memalign(&area, gopts->page_size, gopts->page_size)) err("out of memory"); zeropage = area; - bzero(zeropage, page_size); + bzero(zeropage, gopts->page_size); pthread_mutex_lock(&uffd_read_mutex); @@ -284,18 +301,18 @@ static int userfaultfd_stress(void) fflush(stdout); if (bounces & BOUNCE_POLL) - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + fcntl(gopts->uffd, F_SETFL, gopts->uffd_flags | O_NONBLOCK); else - fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); + fcntl(gopts->uffd, F_SETFL, gopts->uffd_flags & ~O_NONBLOCK); /* register */ - if (uffd_register(uffd, area_dst, mem_size, - true, test_uffdio_wp, false)) + if (uffd_register(gopts->uffd, gopts->area_dst, mem_size, + true, gopts->test_uffdio_wp, false)) err("register failure"); - if (area_dst_alias) { - if (uffd_register(uffd, area_dst_alias, mem_size, - true, test_uffdio_wp, false)) + if (gopts->area_dst_alias) { + if (uffd_register(gopts->uffd, gopts->area_dst_alias, mem_size, + true, gopts->test_uffdio_wp, false)) err("register failure alias"); } @@ -323,87 +340,88 @@ static int userfaultfd_stress(void) * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's * required to MADV_DONTNEED here. 
*/ - uffd_test_ops->release_pages(area_dst); + uffd_test_ops->release_pages(gopts, gopts->area_dst); - uffd_stats_reset(args, nr_parallel); + uffd_stats_reset(gopts, args, gopts->nr_parallel); /* bounce pass */ if (stress(args)) { - uffd_test_ctx_clear(); + uffd_test_ctx_clear(gopts); return 1; } /* Clear all the write protections if there is any */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst, - nr_pages * page_size, false); + if (gopts->test_uffdio_wp) + wp_range(gopts->uffd, (unsigned long)gopts->area_dst, + gopts->nr_pages * gopts->page_size, false); /* unregister */ - if (uffd_unregister(uffd, area_dst, mem_size)) + if (uffd_unregister(gopts->uffd, gopts->area_dst, mem_size)) err("unregister failure"); - if (area_dst_alias) { - if (uffd_unregister(uffd, area_dst_alias, mem_size)) + if (gopts->area_dst_alias) { + if (uffd_unregister(gopts->uffd, gopts->area_dst_alias, mem_size)) err("unregister failure alias"); } /* verification */ if (bounces & BOUNCE_VERIFY) - for (nr = 0; nr < nr_pages; nr++) - if (*area_count(area_dst, nr) != count_verify[nr]) + for (nr = 0; nr < gopts->nr_pages; nr++) + if (*area_count(gopts->area_dst, nr, gopts) != + gopts->count_verify[nr]) err("error area_count %llu %llu %lu\n", - *area_count(area_src, nr), - count_verify[nr], nr); + *area_count(gopts->area_src, nr, gopts), + gopts->count_verify[nr], nr); /* prepare next bounce */ - swap(area_src, area_dst); + swap(gopts->area_src, gopts->area_dst); - swap(area_src_alias, area_dst_alias); + swap(gopts->area_src_alias, gopts->area_dst_alias); - uffd_stats_report(args, nr_parallel); + uffd_stats_report(args, gopts->nr_parallel); } - uffd_test_ctx_clear(); + uffd_test_ctx_clear(gopts); return 0; } -static void set_test_type(const char *type) +static void set_test_type(uffd_global_test_opts_t *gopts, const char *type) { if (!strcmp(type, "anon")) { - test_type = TEST_ANON; + gopts->test_type = TEST_ANON; uffd_test_ops = &anon_uffd_test_ops; } else if (!strcmp(type, "hugetlb")) { - test_type = TEST_HUGETLB; + gopts->test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; - map_shared = true; + gopts->map_shared = true; } else if (!strcmp(type, "hugetlb-private")) { - test_type = TEST_HUGETLB; + gopts->test_type = TEST_HUGETLB; uffd_test_ops = &hugetlb_uffd_test_ops; } else if (!strcmp(type, "shmem")) { - map_shared = true; - test_type = TEST_SHMEM; + gopts->map_shared = true; + gopts->test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; } else if (!strcmp(type, "shmem-private")) { - test_type = TEST_SHMEM; + gopts->test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; } } -static void parse_test_type_arg(const char *raw_type) +static void parse_test_type_arg(uffd_global_test_opts_t *gopts, const char *raw_type) { - set_test_type(raw_type); + set_test_type(gopts, raw_type); - if (!test_type) + if (!gopts->test_type) err("failed to parse test type argument: '%s'", raw_type); - if (test_type == TEST_HUGETLB) - page_size = default_huge_page_size(); + if (gopts->test_type == TEST_HUGETLB) + gopts->page_size = default_huge_page_size(); else - page_size = sysconf(_SC_PAGE_SIZE); + gopts->page_size = sysconf(_SC_PAGE_SIZE); - if (!page_size) + if (!gopts->page_size) err("Unable to determine page size"); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) + if ((unsigned long) area_count(NULL, 0, gopts) + sizeof(unsigned long long) * 2 + > gopts->page_size) err("Impossible to run this test"); /* @@ -415,21 +433,21 @@ static void 
parse_test_type_arg(const char *raw_type) if (uffd_get_features(&features) && errno == ENOENT) ksft_exit_skip("failed to get available features (%d)\n", errno); - test_uffdio_wp = test_uffdio_wp && + gopts->test_uffdio_wp = gopts->test_uffdio_wp && (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); - if (test_type != TEST_ANON && !(features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM)) - test_uffdio_wp = false; + if (gopts->test_type != TEST_ANON && !(features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM)) + gopts->test_uffdio_wp = false; - close(uffd); - uffd = -1; + close(gopts->uffd); + gopts->uffd = -1; } static void sigalrm(int sig) { if (sig != SIGALRM) abort(); - test_uffdio_copy_eexist = true; + gopts->test_uffdio_copy_eexist = true; alarm(ALARM_INTERVAL_SECS); } @@ -438,6 +456,8 @@ int main(int argc, char **argv) unsigned long nr_cpus; size_t bytes; + gopts = (uffd_global_test_opts_t *) malloc(sizeof(uffd_global_test_opts_t)); + if (argc < 4) usage(); @@ -445,7 +465,7 @@ int main(int argc, char **argv) err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); - parse_test_type_arg(argv[1]); + parse_test_type_arg(gopts, argv[1]); bytes = atol(argv[2]) * 1024 * 1024; nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); @@ -453,9 +473,9 @@ int main(int argc, char **argv) /* Don't let calculation below go to zero. */ ksft_print_msg("_SC_NPROCESSORS_ONLN (%lu) too large, capping nr_threads to 32\n", nr_cpus); - nr_parallel = 32; + gopts->nr_parallel = 32; } else { - nr_parallel = nr_cpus; + gopts->nr_parallel = nr_cpus; } /* @@ -463,16 +483,16 @@ int main(int argc, char **argv) * Ensure nr_parallel - 1 hugepages on top of that to account * for racy extra reservation of hugepages. */ - if (test_type == TEST_HUGETLB && - get_free_hugepages() < 2 * (bytes / page_size) + nr_parallel - 1) { + if (gopts->test_type == TEST_HUGETLB && + get_free_hugepages() < 2 * (bytes / gopts->page_size) + gopts->nr_parallel - 1) { printf("skip: Skipping userfaultfd... not enough hugepages\n"); return KSFT_SKIP; } - nr_pages_per_cpu = bytes / page_size / nr_parallel; - if (!nr_pages_per_cpu) { + gopts->nr_pages_per_cpu = bytes / gopts->page_size / gopts->nr_parallel; + if (!gopts->nr_pages_per_cpu) { _err("pages_per_cpu = 0, cannot test (%lu / %lu / %lu)", - bytes, page_size, nr_parallel); + bytes, gopts->page_size, gopts->nr_parallel); usage(); } @@ -481,11 +501,11 @@ int main(int argc, char **argv) _err("invalid bounces"); usage(); } - nr_pages = nr_pages_per_cpu * nr_parallel; + gopts->nr_pages = gopts->nr_pages_per_cpu * gopts->nr_parallel; printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - nr_pages, nr_pages_per_cpu); - return userfaultfd_stress(); + gopts->nr_pages, gopts->nr_pages_per_cpu); + return userfaultfd_stress(gopts); } #else /* __NR_userfaultfd */ diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 50501b38e34e..9e3be2ee7f1b 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -76,7 +76,7 @@ struct uffd_test_args { typedef struct uffd_test_args uffd_test_args_t; /* Returns: UFFD_TEST_* */ -typedef void (*uffd_test_fn)(uffd_test_args_t *); +typedef void (*uffd_test_fn)(uffd_global_test_opts_t *, uffd_test_args_t *); typedef struct { const char *name; @@ -181,33 +181,6 @@ out: return 1; } -/* - * This function initializes the global variables. TODO: remove global - * vars and then remove this. 
- */ -static int -uffd_setup_environment(uffd_test_args_t *args, uffd_test_case_t *test, - mem_type_t *mem_type, const char **errmsg) -{ - map_shared = mem_type->shared; - uffd_test_ops = mem_type->mem_ops; - uffd_test_case_ops = test->test_case_ops; - - if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) - page_size = default_huge_page_size(); - else - page_size = psize(); - - /* Ensure we have at least 2 pages */ - nr_pages = MAX(UFFD_TEST_MEM_SIZE, page_size * 2) / page_size; - /* TODO: remove this global var.. it's so ugly */ - nr_parallel = 1; - - /* Initialize test arguments */ - args->mem_type = mem_type; - - return uffd_test_ctx_init(test->uffd_feature_required, errmsg); -} static bool uffd_feature_supported(uffd_test_case_t *test) { @@ -237,7 +210,8 @@ static int pagemap_open(void) } while (0) typedef struct { - int parent_uffd, child_uffd; + uffd_global_test_opts_t *gopts; + int child_uffd; } fork_event_args; static void *fork_event_consumer(void *data) @@ -245,10 +219,10 @@ static void *fork_event_consumer(void *data) fork_event_args *args = data; struct uffd_msg msg = { 0 }; - ready_for_fork = true; + args->gopts->ready_for_fork = true; /* Read until a full msg received */ - while (uffd_read_msg(args->parent_uffd, &msg)); + while (uffd_read_msg(args->gopts, &msg)); if (msg.event != UFFD_EVENT_FORK) err("wrong message: %u\n", msg.event); @@ -304,9 +278,9 @@ static void unpin_pages(pin_args *args) args->pinned = false; } -static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) +static int pagemap_test_fork(uffd_global_test_opts_t *gopts, bool with_event, bool test_pin) { - fork_event_args args = { .parent_uffd = uffd, .child_uffd = -1 }; + fork_event_args args = { .gopts = gopts, .child_uffd = -1 }; pthread_t thread; pid_t child; uint64_t value; @@ -314,10 +288,10 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) /* Prepare a thread to resolve EVENT_FORK */ if (with_event) { - ready_for_fork = false; + gopts->ready_for_fork = false; if (pthread_create(&thread, NULL, fork_event_consumer, &args)) err("pthread_create()"); - while (!ready_for_fork) + while (!gopts->ready_for_fork) ; /* Wait for the poll_thread to start executing before forking */ } @@ -328,14 +302,14 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) fd = pagemap_open(); - if (test_pin && pin_pages(&args, area_dst, page_size)) + if (test_pin && pin_pages(&args, gopts->area_dst, gopts->page_size)) /* * Normally when reach here we have pinned in * previous tests, so shouldn't fail anymore */ err("pin page failed in child"); - value = pagemap_get_entry(fd, area_dst); + value = pagemap_get_entry(fd, gopts->area_dst); /* * After fork(), we should handle uffd-wp bit differently: * @@ -361,70 +335,70 @@ static int pagemap_test_fork(int uffd, bool with_event, bool test_pin) return result; } -static void uffd_wp_unpopulated_test(uffd_test_args_t *args) +static void uffd_wp_unpopulated_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { uint64_t value; int pagemap_fd; - if (uffd_register(uffd, area_dst, nr_pages * page_size, + if (uffd_register(gopts->uffd, gopts->area_dst, gopts->nr_pages * gopts->page_size, false, true, false)) err("register failed"); pagemap_fd = pagemap_open(); /* Test applying pte marker to anon unpopulated */ - wp_range(uffd, (uint64_t)area_dst, page_size, true); - value = pagemap_get_entry(pagemap_fd, area_dst); + wp_range(gopts->uffd, (uint64_t)gopts->area_dst, gopts->page_size, true); + value = 
pagemap_get_entry(pagemap_fd, gopts->area_dst); pagemap_check_wp(value, true); /* Test unprotect on anon pte marker */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_get_entry(pagemap_fd, area_dst); + wp_range(gopts->uffd, (uint64_t)gopts->area_dst, gopts->page_size, false); + value = pagemap_get_entry(pagemap_fd, gopts->area_dst); pagemap_check_wp(value, false); /* Test zap on anon marker */ - wp_range(uffd, (uint64_t)area_dst, page_size, true); - if (madvise(area_dst, page_size, MADV_DONTNEED)) + wp_range(gopts->uffd, (uint64_t)gopts->area_dst, gopts->page_size, true); + if (madvise(gopts->area_dst, gopts->page_size, MADV_DONTNEED)) err("madvise(MADV_DONTNEED) failed"); - value = pagemap_get_entry(pagemap_fd, area_dst); + value = pagemap_get_entry(pagemap_fd, gopts->area_dst); pagemap_check_wp(value, false); /* Test fault in after marker removed */ - *area_dst = 1; - value = pagemap_get_entry(pagemap_fd, area_dst); + *gopts->area_dst = 1; + value = pagemap_get_entry(pagemap_fd, gopts->area_dst); pagemap_check_wp(value, false); /* Drop it to make pte none again */ - if (madvise(area_dst, page_size, MADV_DONTNEED)) + if (madvise(gopts->area_dst, gopts->page_size, MADV_DONTNEED)) err("madvise(MADV_DONTNEED) failed"); /* Test read-zero-page upon pte marker */ - wp_range(uffd, (uint64_t)area_dst, page_size, true); - *(volatile char *)area_dst; + wp_range(gopts->uffd, (uint64_t)gopts->area_dst, gopts->page_size, true); + *(volatile char *)gopts->area_dst; /* Drop it to make pte none again */ - if (madvise(area_dst, page_size, MADV_DONTNEED)) + if (madvise(gopts->area_dst, gopts->page_size, MADV_DONTNEED)) err("madvise(MADV_DONTNEED) failed"); uffd_test_pass(); } -static void uffd_wp_fork_test_common(uffd_test_args_t *args, +static void uffd_wp_fork_test_common(uffd_global_test_opts_t *gopts, uffd_test_args_t *args, bool with_event) { int pagemap_fd; uint64_t value; - if (uffd_register(uffd, area_dst, nr_pages * page_size, + if (uffd_register(gopts->uffd, gopts->area_dst, gopts->nr_pages * gopts->page_size, false, true, false)) err("register failed"); pagemap_fd = pagemap_open(); /* Touch the page */ - *area_dst = 1; - wp_range(uffd, (uint64_t)area_dst, page_size, true); - value = pagemap_get_entry(pagemap_fd, area_dst); + *gopts->area_dst = 1; + wp_range(gopts->uffd, (uint64_t)gopts->area_dst, gopts->page_size, true); + value = pagemap_get_entry(pagemap_fd, gopts->area_dst); pagemap_check_wp(value, true); - if (pagemap_test_fork(uffd, with_event, false)) { + if (pagemap_test_fork(gopts, with_event, false)) { uffd_test_fail("Detected %s uffd-wp bit in child in present pte", with_event ? "missing" : "stall"); goto out; @@ -442,79 +416,80 @@ static void uffd_wp_fork_test_common(uffd_test_args_t *args, * to expose pte markers. */ if (args->mem_type->shared) { - if (madvise(area_dst, page_size, MADV_DONTNEED)) + if (madvise(gopts->area_dst, gopts->page_size, MADV_DONTNEED)) err("MADV_DONTNEED"); } else { /* * NOTE: ignore retval because private-hugetlb doesn't yet * support swapping, so it could fail. */ - madvise(area_dst, page_size, MADV_PAGEOUT); + madvise(gopts->area_dst, gopts->page_size, MADV_PAGEOUT); } /* Uffd-wp should persist even swapped out */ - value = pagemap_get_entry(pagemap_fd, area_dst); + value = pagemap_get_entry(pagemap_fd, gopts->area_dst); pagemap_check_wp(value, true); - if (pagemap_test_fork(uffd, with_event, false)) { + if (pagemap_test_fork(gopts, with_event, false)) { uffd_test_fail("Detected %s uffd-wp bit in child in zapped pte", with_event ? 
"missing" : "stall"); goto out; } /* Unprotect; this tests swap pte modifications */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_get_entry(pagemap_fd, area_dst); + wp_range(gopts->uffd, (uint64_t)gopts->area_dst, gopts->page_size, false); + value = pagemap_get_entry(pagemap_fd, gopts->area_dst); pagemap_check_wp(value, false); /* Fault in the page from disk */ - *area_dst = 2; - value = pagemap_get_entry(pagemap_fd, area_dst); + *gopts->area_dst = 2; + value = pagemap_get_entry(pagemap_fd, gopts->area_dst); pagemap_check_wp(value, false); uffd_test_pass(); out: - if (uffd_unregister(uffd, area_dst, nr_pages * page_size)) + if (uffd_unregister(gopts->uffd, gopts->area_dst, gopts->nr_pages * gopts->page_size)) err("unregister failed"); close(pagemap_fd); } -static void uffd_wp_fork_test(uffd_test_args_t *args) +static void uffd_wp_fork_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_wp_fork_test_common(args, false); + uffd_wp_fork_test_common(gopts, args, false); } -static void uffd_wp_fork_with_event_test(uffd_test_args_t *args) +static void uffd_wp_fork_with_event_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_wp_fork_test_common(args, true); + uffd_wp_fork_test_common(gopts, args, true); } -static void uffd_wp_fork_pin_test_common(uffd_test_args_t *args, +static void uffd_wp_fork_pin_test_common(uffd_global_test_opts_t *gopts, + uffd_test_args_t *args, bool with_event) { int pagemap_fd; pin_args pin_args = {}; - if (uffd_register(uffd, area_dst, page_size, false, true, false)) + if (uffd_register(gopts->uffd, gopts->area_dst, gopts->page_size, false, true, false)) err("register failed"); pagemap_fd = pagemap_open(); /* Touch the page */ - *area_dst = 1; - wp_range(uffd, (uint64_t)area_dst, page_size, true); + *gopts->area_dst = 1; + wp_range(gopts->uffd, (uint64_t)gopts->area_dst, gopts->page_size, true); /* * 1. First pin, then fork(). This tests fork() special path when * doing early CoW if the page is private. */ - if (pin_pages(&pin_args, area_dst, page_size)) { + if (pin_pages(&pin_args, gopts->area_dst, gopts->page_size)) { uffd_test_skip("Possibly CONFIG_GUP_TEST missing " "or unprivileged"); close(pagemap_fd); - uffd_unregister(uffd, area_dst, page_size); + uffd_unregister(gopts->uffd, gopts->area_dst, gopts->page_size); return; } - if (pagemap_test_fork(uffd, with_event, false)) { + if (pagemap_test_fork(gopts, with_event, false)) { uffd_test_fail("Detected %s uffd-wp bit in early CoW of fork()", with_event ? "missing" : "stall"); unpin_pages(&pin_args); @@ -527,49 +502,50 @@ static void uffd_wp_fork_pin_test_common(uffd_test_args_t *args, * 2. First fork(), then pin (in the child, where test_pin==true). * This tests COR, aka, page unsharing on private memories. */ - if (pagemap_test_fork(uffd, with_event, true)) { + if (pagemap_test_fork(gopts, with_event, true)) { uffd_test_fail("Detected %s uffd-wp bit when RO pin", with_event ? 
"missing" : "stall"); goto out; } uffd_test_pass(); out: - if (uffd_unregister(uffd, area_dst, page_size)) + if (uffd_unregister(gopts->uffd, gopts->area_dst, gopts->page_size)) err("register failed"); close(pagemap_fd); } -static void uffd_wp_fork_pin_test(uffd_test_args_t *args) +static void uffd_wp_fork_pin_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_wp_fork_pin_test_common(args, false); + uffd_wp_fork_pin_test_common(gopts, args, false); } -static void uffd_wp_fork_pin_with_event_test(uffd_test_args_t *args) +static void uffd_wp_fork_pin_with_event_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_wp_fork_pin_test_common(args, true); + uffd_wp_fork_pin_test_common(gopts, args, true); } -static void check_memory_contents(char *p) +static void check_memory_contents(uffd_global_test_opts_t *gopts, char *p) { unsigned long i, j; uint8_t expected_byte; - for (i = 0; i < nr_pages; ++i) { + for (i = 0; i < gopts->nr_pages; ++i) { expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); - for (j = 0; j < page_size; j++) { - uint8_t v = *(uint8_t *)(p + (i * page_size) + j); + for (j = 0; j < gopts->page_size; j++) { + uint8_t v = *(uint8_t *)(p + (i * gopts->page_size) + j); if (v != expected_byte) err("unexpected page contents"); } } } -static void uffd_minor_test_common(bool test_collapse, bool test_wp) +static void uffd_minor_test_common(uffd_global_test_opts_t *gopts, bool test_collapse, bool test_wp) { unsigned long p; pthread_t uffd_mon; char c; struct uffd_args args = { 0 }; + args.gopts = gopts; /* * NOTE: MADV_COLLAPSE is not yet compatible with WP, so testing @@ -577,7 +553,7 @@ static void uffd_minor_test_common(bool test_collapse, bool test_wp) */ assert(!(test_collapse && test_wp)); - if (uffd_register(uffd, area_dst_alias, nr_pages * page_size, + if (uffd_register(gopts->uffd, gopts->area_dst_alias, gopts->nr_pages * gopts->page_size, /* NOTE! MADV_COLLAPSE may not work with uffd-wp */ false, test_wp, true)) err("register failure"); @@ -586,9 +562,9 @@ static void uffd_minor_test_common(bool test_collapse, bool test_wp) * After registering with UFFD, populate the non-UFFD-registered side of * the shared mapping. This should *not* trigger any UFFD minor faults. */ - for (p = 0; p < nr_pages; ++p) - memset(area_dst + (p * page_size), p % ((uint8_t)-1), - page_size); + for (p = 0; p < gopts->nr_pages; ++p) + memset(gopts->area_dst + (p * gopts->page_size), p % ((uint8_t)-1), + gopts->page_size); args.apply_wp = test_wp; if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) @@ -600,50 +576,51 @@ static void uffd_minor_test_common(bool test_collapse, bool test_wp) * fault. uffd_poll_thread will resolve the fault by bit-flipping the * page's contents, and then issuing a CONTINUE ioctl. */ - check_memory_contents(area_dst_alias); + check_memory_contents(gopts, gopts->area_dst_alias); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); if (pthread_join(uffd_mon, NULL)) err("join() failed"); if (test_collapse) { - if (madvise(area_dst_alias, nr_pages * page_size, + if (madvise(gopts->area_dst_alias, gopts->nr_pages * gopts->page_size, MADV_COLLAPSE)) { /* It's fine to fail for this one... 
*/ uffd_test_skip("MADV_COLLAPSE failed"); return; } - uffd_test_ops->check_pmd_mapping(area_dst, - nr_pages * page_size / + uffd_test_ops->check_pmd_mapping(gopts, + gopts->area_dst, + gopts->nr_pages * gopts->page_size / read_pmd_pagesize()); /* * This won't cause uffd-fault - it purely just makes sure there * was no corruption. */ - check_memory_contents(area_dst_alias); + check_memory_contents(gopts, gopts->area_dst_alias); } - if (args.missing_faults != 0 || args.minor_faults != nr_pages) + if (args.missing_faults != 0 || args.minor_faults != gopts->nr_pages) uffd_test_fail("stats check error"); else uffd_test_pass(); } -void uffd_minor_test(uffd_test_args_t *args) +void uffd_minor_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_minor_test_common(false, false); + uffd_minor_test_common(gopts, false, false); } -void uffd_minor_wp_test(uffd_test_args_t *args) +void uffd_minor_wp_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_minor_test_common(false, true); + uffd_minor_test_common(gopts, false, true); } -void uffd_minor_collapse_test(uffd_test_args_t *args) +void uffd_minor_collapse_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_minor_test_common(true, false); + uffd_minor_test_common(gopts, true, false); } static sigjmp_buf jbuf, *sigbuf; @@ -678,7 +655,7 @@ static void sighndl(int sig, siginfo_t *siginfo, void *ptr) * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal * feature. Using monitor thread, verify no userfault events are generated. */ -static int faulting_process(int signal_test, bool wp) +static int faulting_process(uffd_global_test_opts_t *gopts, int signal_test, bool wp) { unsigned long nr, i; unsigned long long count; @@ -687,7 +664,7 @@ static int faulting_process(int signal_test, bool wp) struct sigaction act; volatile unsigned long signalled = 0; - split_nr_pages = (nr_pages + 1) / 2; + split_nr_pages = (gopts->nr_pages + 1) / 2; if (signal_test) { sigbuf = &jbuf; @@ -701,7 +678,7 @@ static int faulting_process(int signal_test, bool wp) for (nr = 0; nr < split_nr_pages; nr++) { volatile int steps = 1; - unsigned long offset = nr * page_size; + unsigned long offset = nr * gopts->page_size; if (signal_test) { if (sigsetjmp(*sigbuf, 1) != 0) { @@ -713,15 +690,15 @@ static int faulting_process(int signal_test, bool wp) if (steps == 1) { /* This is a MISSING request */ steps++; - if (copy_page(uffd, offset, wp)) + if (copy_page(gopts, offset, wp)) signalled++; } else { /* This is a WP request */ assert(steps == 2); - wp_range(uffd, - (__u64)area_dst + + wp_range(gopts->uffd, + (__u64)gopts->area_dst + offset, - page_size, false); + gopts->page_size, false); } } else { signalled++; @@ -730,51 +707,53 @@ static int faulting_process(int signal_test, bool wp) } } - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) + count = *area_count(gopts->area_dst, nr, gopts); + if (count != gopts->count_verify[nr]) err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); + nr, count, gopts->count_verify[nr]); /* * Trigger write protection if there is by writing * the same value back. 
*/ - *area_count(area_dst, nr) = count; + *area_count(gopts->area_dst, nr, gopts) = count; } if (signal_test) return signalled != split_nr_pages; - area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, - MREMAP_MAYMOVE | MREMAP_FIXED, area_src); - if (area_dst == MAP_FAILED) + gopts->area_dst = mremap(gopts->area_dst, gopts->nr_pages * gopts->page_size, + gopts->nr_pages * gopts->page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + gopts->area_src); + if (gopts->area_dst == MAP_FAILED) err("mremap"); /* Reset area_src since we just clobbered it */ - area_src = NULL; + gopts->area_src = NULL; - for (; nr < nr_pages; nr++) { - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) { + for (; nr < gopts->nr_pages; nr++) { + count = *area_count(gopts->area_dst, nr, gopts); + if (count != gopts->count_verify[nr]) { err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); + nr, count, gopts->count_verify[nr]); } /* * Trigger write protection if there is by writing * the same value back. */ - *area_count(area_dst, nr) = count; + *area_count(gopts->area_dst, nr, gopts) = count; } - uffd_test_ops->release_pages(area_dst); + uffd_test_ops->release_pages(gopts, gopts->area_dst); - for (nr = 0; nr < nr_pages; nr++) - for (i = 0; i < page_size; i++) - if (*(area_dst + nr * page_size + i) != 0) + for (nr = 0; nr < gopts->nr_pages; nr++) + for (i = 0; i < gopts->page_size; i++) + if (*(gopts->area_dst + nr * gopts->page_size + i) != 0) err("page %lu offset %lu is not zero", nr, i); return 0; } -static void uffd_sigbus_test_common(bool wp) +static void uffd_sigbus_test_common(uffd_global_test_opts_t *gopts, bool wp) { unsigned long userfaults; pthread_t uffd_mon; @@ -782,25 +761,26 @@ static void uffd_sigbus_test_common(bool wp) int err; char c; struct uffd_args args = { 0 }; + args.gopts = gopts; - ready_for_fork = false; + gopts->ready_for_fork = false; - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + fcntl(gopts->uffd, F_SETFL, gopts->uffd_flags | O_NONBLOCK); - if (uffd_register(uffd, area_dst, nr_pages * page_size, + if (uffd_register(gopts->uffd, gopts->area_dst, gopts->nr_pages * gopts->page_size, true, wp, false)) err("register failure"); - if (faulting_process(1, wp)) + if (faulting_process(gopts, 1, wp)) err("faulting process failed"); - uffd_test_ops->release_pages(area_dst); + uffd_test_ops->release_pages(gopts, gopts->area_dst); args.apply_wp = wp; if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) err("uffd_poll_thread create"); - while (!ready_for_fork) + while (!gopts->ready_for_fork) ; /* Wait for the poll_thread to start executing before forking */ pid = fork(); @@ -808,12 +788,12 @@ static void uffd_sigbus_test_common(bool wp) err("fork"); if (!pid) - exit(faulting_process(2, wp)); + exit(faulting_process(gopts, 2, wp)); waitpid(pid, &err, 0); if (err) err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); if (pthread_join(uffd_mon, (void **)&userfaults)) err("pthread_join()"); @@ -824,28 +804,29 @@ static void uffd_sigbus_test_common(bool wp) uffd_test_pass(); } -static void uffd_sigbus_test(uffd_test_args_t *args) +static void uffd_sigbus_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_sigbus_test_common(false); + uffd_sigbus_test_common(gopts, false); } -static void uffd_sigbus_wp_test(uffd_test_args_t *args) +static void uffd_sigbus_wp_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - 
uffd_sigbus_test_common(true); + uffd_sigbus_test_common(gopts, true); } -static void uffd_events_test_common(bool wp) +static void uffd_events_test_common(uffd_global_test_opts_t *gopts, bool wp) { pthread_t uffd_mon; pid_t pid; int err; char c; struct uffd_args args = { 0 }; + args.gopts = gopts; - ready_for_fork = false; + gopts->ready_for_fork = false; - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - if (uffd_register(uffd, area_dst, nr_pages * page_size, + fcntl(gopts->uffd, F_SETFL, gopts->uffd_flags | O_NONBLOCK); + if (uffd_register(gopts->uffd, gopts->area_dst, gopts->nr_pages * gopts->page_size, true, wp, false)) err("register failure"); @@ -853,7 +834,7 @@ static void uffd_events_test_common(bool wp) if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) err("uffd_poll_thread create"); - while (!ready_for_fork) + while (!gopts->ready_for_fork) ; /* Wait for the poll_thread to start executing before forking */ pid = fork(); @@ -861,39 +842,39 @@ static void uffd_events_test_common(bool wp) err("fork"); if (!pid) - exit(faulting_process(0, wp)); + exit(faulting_process(gopts, 0, wp)); waitpid(pid, &err, 0); if (err) err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); if (pthread_join(uffd_mon, NULL)) err("pthread_join()"); - if (args.missing_faults != nr_pages) + if (args.missing_faults != gopts->nr_pages) uffd_test_fail("Fault counts wrong"); else uffd_test_pass(); } -static void uffd_events_test(uffd_test_args_t *args) +static void uffd_events_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_events_test_common(false); + uffd_events_test_common(gopts, false); } -static void uffd_events_wp_test(uffd_test_args_t *args) +static void uffd_events_wp_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { - uffd_events_test_common(true); + uffd_events_test_common(gopts, true); } -static void retry_uffdio_zeropage(int ufd, +static void retry_uffdio_zeropage(uffd_global_test_opts_t *gopts, struct uffdio_zeropage *uffdio_zeropage) { - uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, + uffd_test_ops->alias_mapping(gopts, &uffdio_zeropage->range.start, uffdio_zeropage->range.len, 0); - if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { + if (ioctl(gopts->uffd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { if (uffdio_zeropage->zeropage != -EEXIST) err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)uffdio_zeropage->zeropage); @@ -903,16 +884,16 @@ static void retry_uffdio_zeropage(int ufd, } } -static bool do_uffdio_zeropage(int ufd, bool has_zeropage) +static bool do_uffdio_zeropage(uffd_global_test_opts_t *gopts, bool has_zeropage) { struct uffdio_zeropage uffdio_zeropage = { 0 }; int ret; __s64 res; - uffdio_zeropage.range.start = (unsigned long) area_dst; - uffdio_zeropage.range.len = page_size; + uffdio_zeropage.range.start = (unsigned long) gopts->area_dst; + uffdio_zeropage.range.len = gopts->page_size; uffdio_zeropage.mode = 0; - ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + ret = ioctl(gopts->uffd, UFFDIO_ZEROPAGE, &uffdio_zeropage); res = uffdio_zeropage.zeropage; if (ret) { /* real retval in ufdio_zeropage.zeropage */ @@ -921,10 +902,10 @@ static bool do_uffdio_zeropage(int ufd, bool has_zeropage) else if (res != -EINVAL) err("UFFDIO_ZEROPAGE not -EINVAL"); } else if (has_zeropage) { - if (res != page_size) + if (res != gopts->page_size) err("UFFDIO_ZEROPAGE unexpected size"); else - retry_uffdio_zeropage(ufd, &uffdio_zeropage); 
+ retry_uffdio_zeropage(gopts, &uffdio_zeropage); return true; } else err("UFFDIO_ZEROPAGE succeeded"); @@ -950,25 +931,29 @@ uffd_register_detect_zeropage(int uffd, void *addr, uint64_t len) } /* exercise UFFDIO_ZEROPAGE */ -static void uffd_zeropage_test(uffd_test_args_t *args) +static void uffd_zeropage_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { bool has_zeropage; int i; - has_zeropage = uffd_register_detect_zeropage(uffd, area_dst, page_size); - if (area_dst_alias) + has_zeropage = uffd_register_detect_zeropage(gopts->uffd, + gopts->area_dst, + gopts->page_size); + if (gopts->area_dst_alias) /* Ignore the retval; we already have it */ - uffd_register_detect_zeropage(uffd, area_dst_alias, page_size); + uffd_register_detect_zeropage(gopts->uffd, gopts->area_dst_alias, gopts->page_size); - if (do_uffdio_zeropage(uffd, has_zeropage)) - for (i = 0; i < page_size; i++) - if (area_dst[i] != 0) + if (do_uffdio_zeropage(gopts, has_zeropage)) + for (i = 0; i < gopts->page_size; i++) + if (gopts->area_dst[i] != 0) err("data non-zero at offset %d\n", i); - if (uffd_unregister(uffd, area_dst, page_size)) + if (uffd_unregister(gopts->uffd, gopts->area_dst, gopts->page_size)) err("unregister"); - if (area_dst_alias && uffd_unregister(uffd, area_dst_alias, page_size)) + if (gopts->area_dst_alias && uffd_unregister(gopts->uffd, + gopts->area_dst_alias, + gopts->page_size)) err("unregister"); uffd_test_pass(); @@ -987,26 +972,27 @@ static void uffd_register_poison(int uffd, void *addr, uint64_t len) err("registered area doesn't support COPY and POISON ioctls"); } -static void do_uffdio_poison(int uffd, unsigned long offset) +static void do_uffdio_poison(uffd_global_test_opts_t *gopts, unsigned long offset) { struct uffdio_poison uffdio_poison = { 0 }; int ret; __s64 res; - uffdio_poison.range.start = (unsigned long) area_dst + offset; - uffdio_poison.range.len = page_size; + uffdio_poison.range.start = (unsigned long) gopts->area_dst + offset; + uffdio_poison.range.len = gopts->page_size; uffdio_poison.mode = 0; - ret = ioctl(uffd, UFFDIO_POISON, &uffdio_poison); + ret = ioctl(gopts->uffd, UFFDIO_POISON, &uffdio_poison); res = uffdio_poison.updated; if (ret) err("UFFDIO_POISON error: %"PRId64, (int64_t)res); - else if (res != page_size) + else if (res != gopts->page_size) err("UFFDIO_POISON unexpected size: %"PRId64, (int64_t)res); } -static void uffd_poison_handle_fault( - struct uffd_msg *msg, struct uffd_args *args) +static void uffd_poison_handle_fault(uffd_global_test_opts_t *gopts, + struct uffd_msg *msg, + struct uffd_args *args) { unsigned long offset; @@ -1017,20 +1003,20 @@ static void uffd_poison_handle_fault( (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR)) err("unexpected fault type %llu", msg->arg.pagefault.flags); - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); + offset = (char *)(unsigned long)msg->arg.pagefault.address - gopts->area_dst; + offset &= ~(gopts->page_size-1); /* Odd pages -> copy zeroed page; even pages -> poison. 
*/ - if (offset & page_size) - copy_page(uffd, offset, false); + if (offset & gopts->page_size) + copy_page(gopts, offset, false); else - do_uffdio_poison(uffd, offset); + do_uffdio_poison(gopts, offset); } /* Make sure to cover odd/even, and minimum duplications */ #define UFFD_POISON_TEST_NPAGES 4 -static void uffd_poison_test(uffd_test_args_t *targs) +static void uffd_poison_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *targs) { pthread_t uffd_mon; char c; @@ -1039,15 +1025,17 @@ static void uffd_poison_test(uffd_test_args_t *targs) unsigned long nr_sigbus = 0; unsigned long nr, poison_pages = UFFD_POISON_TEST_NPAGES; - if (nr_pages < poison_pages) { - uffd_test_skip("Too few pages for POISON test"); + if (gopts->nr_pages < poison_pages) { + uffd_test_skip("Too few pages for POISON test"); return; } - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + args.gopts = gopts; - uffd_register_poison(uffd, area_dst, poison_pages * page_size); - memset(area_src, 0, poison_pages * page_size); + fcntl(gopts->uffd, F_SETFL, gopts->uffd_flags | O_NONBLOCK); + + uffd_register_poison(gopts->uffd, gopts->area_dst, poison_pages * gopts->page_size); + memset(gopts->area_src, 0, poison_pages * gopts->page_size); args.handle_fault = uffd_poison_handle_fault; if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) @@ -1060,8 +1048,8 @@ static void uffd_poison_test(uffd_test_args_t *targs) err("sigaction"); for (nr = 0; nr < poison_pages; ++nr) { - unsigned long offset = nr * page_size; - const char *bytes = (const char *) area_dst + offset; + unsigned long offset = nr * gopts->page_size; + const char *bytes = (const char *) gopts->area_dst + offset; const char *i; if (sigsetjmp(*sigbuf, 1)) { @@ -1074,14 +1062,14 @@ static void uffd_poison_test(uffd_test_args_t *targs) continue; } - for (i = bytes; i < bytes + page_size; ++i) { + for (i = bytes; i < bytes + gopts->page_size; ++i) { if (*i) err("nonzero byte in area_dst (%p) at %p: %u", - area_dst, i, *i); + gopts->area_dst, i, *i); } } - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); if (pthread_join(uffd_mon, NULL)) err("pthread_join()"); @@ -1094,7 +1082,9 @@ static void uffd_poison_test(uffd_test_args_t *targs) } static void -uffd_move_handle_fault_common(struct uffd_msg *msg, struct uffd_args *args, +uffd_move_handle_fault_common(uffd_global_test_opts_t *gopts, + struct uffd_msg *msg, + struct uffd_args *args, unsigned long len) { unsigned long offset; @@ -1106,28 +1096,32 @@ uffd_move_handle_fault_common(struct uffd_msg *msg, struct uffd_args *args, (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR | UFFD_PAGEFAULT_FLAG_WRITE)) err("unexpected fault type %llu", msg->arg.pagefault.flags); - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset = (char *)(unsigned long)msg->arg.pagefault.address - gopts->area_dst; offset &= ~(len-1); - if (move_page(uffd, offset, len)) + if (move_page(gopts, offset, len)) args->missing_faults++; } -static void uffd_move_handle_fault(struct uffd_msg *msg, +static void uffd_move_handle_fault(uffd_global_test_opts_t *gopts, struct uffd_msg *msg, struct uffd_args *args) { - uffd_move_handle_fault_common(msg, args, page_size); + uffd_move_handle_fault_common(gopts, msg, args, gopts->page_size); } -static void uffd_move_pmd_handle_fault(struct uffd_msg *msg, +static void uffd_move_pmd_handle_fault(uffd_global_test_opts_t *gopts, struct uffd_msg *msg, struct uffd_args *args) { -
uffd_move_handle_fault_common(msg, args, read_pmd_pagesize()); + uffd_move_handle_fault_common(gopts, msg, args, read_pmd_pagesize()); } static void -uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, - void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args)) +uffd_move_test_common(uffd_global_test_opts_t *gopts, + uffd_test_args_t *targs, + unsigned long chunk_size, + void (*handle_fault)(struct uffd_global_test_opts *gopts, + struct uffd_msg *msg, struct uffd_args *args) +) { unsigned long nr; pthread_t uffd_mon; @@ -1139,11 +1133,13 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, unsigned long src_offs = 0; unsigned long dst_offs = 0; + args.gopts = gopts; + /* Prevent source pages from being mapped more than once */ - if (madvise(area_src, nr_pages * page_size, MADV_DONTFORK)) + if (madvise(gopts->area_src, gopts->nr_pages * gopts->page_size, MADV_DONTFORK)) err("madvise(MADV_DONTFORK) failure"); - if (uffd_register(uffd, area_dst, nr_pages * page_size, + if (uffd_register(gopts->uffd, gopts->area_dst, gopts->nr_pages * gopts->page_size, true, false, false)) err("register failure"); @@ -1151,22 +1147,22 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) err("uffd_poll_thread create"); - step_size = chunk_size / page_size; - step_count = nr_pages / step_size; + step_size = chunk_size / gopts->page_size; + step_count = gopts->nr_pages / step_size; - if (chunk_size > page_size) { - char *aligned_src = ALIGN_UP(area_src, chunk_size); - char *aligned_dst = ALIGN_UP(area_dst, chunk_size); + if (chunk_size > gopts->page_size) { + char *aligned_src = ALIGN_UP(gopts->area_src, chunk_size); + char *aligned_dst = ALIGN_UP(gopts->area_dst, chunk_size); - if (aligned_src != area_src || aligned_dst != area_dst) { - src_offs = (aligned_src - area_src) / page_size; - dst_offs = (aligned_dst - area_dst) / page_size; + if (aligned_src != gopts->area_src || aligned_dst != gopts->area_dst) { + src_offs = (aligned_src - gopts->area_src) / gopts->page_size; + dst_offs = (aligned_dst - gopts->area_dst) / gopts->page_size; step_count--; } - orig_area_src = area_src; - orig_area_dst = area_dst; - area_src = aligned_src; - area_dst = aligned_dst; + orig_area_src = gopts->area_src; + orig_area_dst = gopts->area_dst; + gopts->area_src = aligned_src; + gopts->area_dst = aligned_dst; } /* @@ -1180,34 +1176,34 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, /* Check area_src content */ for (i = 0; i < step_size; i++) { - count = *area_count(area_src, nr + i); - if (count != count_verify[src_offs + nr + i]) + count = *area_count(gopts->area_src, nr + i, gopts); + if (count != gopts->count_verify[src_offs + nr + i]) err("nr %lu source memory invalid %llu %llu\n", - nr + i, count, count_verify[src_offs + nr + i]); + nr + i, count, gopts->count_verify[src_offs + nr + i]); } /* Faulting into area_dst should move the page or the huge page */ for (i = 0; i < step_size; i++) { - count = *area_count(area_dst, nr + i); - if (count != count_verify[dst_offs + nr + i]) + count = *area_count(gopts->area_dst, nr + i, gopts); + if (count != gopts->count_verify[dst_offs + nr + i]) err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[dst_offs + nr + i]); + nr, count, gopts->count_verify[dst_offs + nr + i]); } /* Re-check area_src content which should be empty */ for (i = 0; i < step_size; i++) { - count = *area_count(area_src, nr + i); + count 
= *area_count(gopts->area_src, nr + i, gopts); if (count != 0) err("nr %lu move failed %llu %llu\n", - nr, count, count_verify[src_offs + nr + i]); + nr, count, gopts->count_verify[src_offs + nr + i]); } } - if (chunk_size > page_size) { - area_src = orig_area_src; - area_dst = orig_area_dst; + if (chunk_size > gopts->page_size) { + gopts->area_src = orig_area_src; + gopts->area_dst = orig_area_dst; } - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); if (pthread_join(uffd_mon, NULL)) err("join() failed"); @@ -1218,24 +1214,24 @@ uffd_move_test_common(uffd_test_args_t *targs, unsigned long chunk_size, uffd_test_pass(); } -static void uffd_move_test(uffd_test_args_t *targs) +static void uffd_move_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *targs) { - uffd_move_test_common(targs, page_size, uffd_move_handle_fault); + uffd_move_test_common(gopts, targs, gopts->page_size, uffd_move_handle_fault); } -static void uffd_move_pmd_test(uffd_test_args_t *targs) +static void uffd_move_pmd_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *targs) { - if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) + if (madvise(gopts->area_dst, gopts->nr_pages * gopts->page_size, MADV_HUGEPAGE)) err("madvise(MADV_HUGEPAGE) failure"); - uffd_move_test_common(targs, read_pmd_pagesize(), + uffd_move_test_common(gopts, targs, read_pmd_pagesize(), uffd_move_pmd_handle_fault); } -static void uffd_move_pmd_split_test(uffd_test_args_t *targs) +static void uffd_move_pmd_split_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *targs) { - if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) + if (madvise(gopts->area_dst, gopts->nr_pages * gopts->page_size, MADV_NOHUGEPAGE)) err("madvise(MADV_NOHUGEPAGE) failure"); - uffd_move_test_common(targs, read_pmd_pagesize(), + uffd_move_test_common(gopts, targs, read_pmd_pagesize(), uffd_move_pmd_handle_fault); } @@ -1295,6 +1291,11 @@ typedef enum { THR_STATE_UNINTERRUPTIBLE, } thread_state; +typedef struct { + uffd_global_test_opts_t *gopts; + volatile pid_t *pid; +} mmap_changing_thread_args; + static void sleep_short(void) { usleep(1000); @@ -1337,7 +1338,9 @@ static void thread_state_until(pid_t tid, thread_state state) static void *uffd_mmap_changing_thread(void *opaque) { - volatile pid_t *pid = opaque; + mmap_changing_thread_args *args = opaque; + uffd_global_test_opts_t *gopts = args->gopts; + volatile pid_t *pid = args->pid; int ret; /* Unfortunately, it's only fetch-able from the thread itself.. 
*/ @@ -1345,21 +1348,21 @@ static void *uffd_mmap_changing_thread(void *opaque) *pid = syscall(SYS_gettid); /* Inject an event, this will hang solid until the event read */ - ret = madvise(area_dst, page_size, MADV_REMOVE); + ret = madvise(gopts->area_dst, gopts->page_size, MADV_REMOVE); if (ret) err("madvise(MADV_REMOVE) failed"); return NULL; } -static void uffd_consume_message(int fd) +static void uffd_consume_message(uffd_global_test_opts_t *gopts) { struct uffd_msg msg = { 0 }; - while (uffd_read_msg(fd, &msg)); + while (uffd_read_msg(gopts, &msg)); } -static void uffd_mmap_changing_test(uffd_test_args_t *targs) +static void uffd_mmap_changing_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *targs) { /* * This stores the real PID (which can be different from how tid is @@ -1368,13 +1371,14 @@ static void uffd_mmap_changing_test(uffd_test_args_t *targs) pid_t pid = 0; pthread_t tid; int ret; + mmap_changing_thread_args args = { gopts, &pid }; - if (uffd_register(uffd, area_dst, nr_pages * page_size, + if (uffd_register(gopts->uffd, gopts->area_dst, gopts->nr_pages * gopts->page_size, true, false, false)) err("uffd_register() failed"); /* Create a thread to generate the racy event */ - ret = pthread_create(&tid, NULL, uffd_mmap_changing_thread, &pid); + ret = pthread_create(&tid, NULL, uffd_mmap_changing_thread, &args); if (ret) err("pthread_create() failed"); @@ -1388,26 +1392,26 @@ static void uffd_mmap_changing_test(uffd_test_args_t *targs) /* Wait until the thread hangs at REMOVE event */ thread_state_until(pid, THR_STATE_UNINTERRUPTIBLE); - if (!uffdio_mmap_changing_test_copy(uffd)) + if (!uffdio_mmap_changing_test_copy(gopts->uffd)) return; - if (!uffdio_mmap_changing_test_zeropage(uffd)) + if (!uffdio_mmap_changing_test_zeropage(gopts->uffd)) return; - if (!uffdio_mmap_changing_test_move(uffd)) + if (!uffdio_mmap_changing_test_move(gopts->uffd)) return; - if (!uffdio_mmap_changing_test_poison(uffd)) + if (!uffdio_mmap_changing_test_poison(gopts->uffd)) return; - if (!uffdio_mmap_changing_test_continue(uffd)) + if (!uffdio_mmap_changing_test_continue(gopts->uffd)) return; /* * All succeeded above! Recycle everything. Start by reading the * event so as to kick the thread roll again.. */ - uffd_consume_message(uffd); + uffd_consume_message(gopts); ret = pthread_join(tid, NULL); assert(ret == 0); @@ -1415,10 +1419,10 @@ static void uffd_mmap_changing_test(uffd_test_args_t *targs) uffd_test_pass(); } -static int prevent_hugepages(const char **errmsg) +static int prevent_hugepages(uffd_global_test_opts_t *gopts, const char **errmsg) { /* This should be done before source area is populated */ - if (madvise(area_src, nr_pages * page_size, MADV_NOHUGEPAGE)) { + if (madvise(gopts->area_src, gopts->nr_pages * gopts->page_size, MADV_NOHUGEPAGE)) { /* Ignore only if CONFIG_TRANSPARENT_HUGEPAGE=n */ if (errno != EINVAL) { if (errmsg) @@ -1429,10 +1433,10 @@ static int prevent_hugepages(const char **errmsg) return 0; } -static int request_hugepages(const char **errmsg) +static int request_hugepages(uffd_global_test_opts_t *gopts, const char **errmsg) { /* This should be done before source area is populated */ - if (madvise(area_src, nr_pages * page_size, MADV_HUGEPAGE)) { + if (madvise(gopts->area_src, gopts->nr_pages * gopts->page_size, MADV_HUGEPAGE)) { if (errmsg) { *errmsg = (errno == EINVAL) ? 
"CONFIG_TRANSPARENT_HUGEPAGE is not set" : @@ -1456,13 +1460,17 @@ struct uffd_test_case_ops uffd_move_test_pmd_case_ops = { * Note that _UFFDIO_ZEROPAGE is tested separately in the zeropage test. */ static void -do_register_ioctls_test(uffd_test_args_t *args, bool miss, bool wp, bool minor) +do_register_ioctls_test(uffd_global_test_opts_t *gopts, + uffd_test_args_t *args, + bool miss, + bool wp, + bool minor) { uint64_t ioctls = 0, expected = BIT_ULL(_UFFDIO_WAKE); mem_type_t *mem_type = args->mem_type; int ret; - ret = uffd_register_with_ioctls(uffd, area_dst, page_size, + ret = uffd_register_with_ioctls(gopts->uffd, gopts->area_dst, gopts->page_size, miss, wp, minor, &ioctls); /* @@ -1493,18 +1501,18 @@ do_register_ioctls_test(uffd_test_args_t *args, bool miss, bool wp, bool minor) "(miss=%d, wp=%d, minor=%d): expected=0x%"PRIx64", " "returned=0x%"PRIx64, miss, wp, minor, expected, ioctls); - if (uffd_unregister(uffd, area_dst, page_size)) + if (uffd_unregister(gopts->uffd, gopts->area_dst, gopts->page_size)) err("unregister"); } -static void uffd_register_ioctls_test(uffd_test_args_t *args) +static void uffd_register_ioctls_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *args) { int miss, wp, minor; for (miss = 0; miss <= 1; miss++) for (wp = 0; wp <= 1; wp++) for (minor = 0; minor <= 1; minor++) - do_register_ioctls_test(args, miss, wp, minor); + do_register_ioctls_test(gopts, args, miss, wp, minor); uffd_test_pass(); } @@ -1742,6 +1750,28 @@ int main(int argc, char *argv[]) } for (j = 0; j < n_mems; j++) { mem_type = &mem_types[j]; + + /* Initialize global test options */ + uffd_global_test_opts_t gopts = { 0 }; + + gopts.map_shared = mem_type->shared; + uffd_test_ops = mem_type->mem_ops; + uffd_test_case_ops = test->test_case_ops; + + if (mem_type->mem_flag & (MEM_HUGETLB_PRIVATE | MEM_HUGETLB)) + gopts.page_size = default_huge_page_size(); + else + gopts.page_size = psize(); + + /* Ensure we have at least 2 pages */ + gopts.nr_pages = MAX(UFFD_TEST_MEM_SIZE, gopts.page_size * 2) + / gopts.page_size; + + gopts.nr_parallel = 1; + + /* Initialize test arguments */ + args.mem_type = mem_type; + if (!(test->mem_targets & mem_type->mem_flag)) continue; @@ -1756,13 +1786,12 @@ int main(int argc, char *argv[]) uffd_test_skip("feature missing"); continue; } - if (uffd_setup_environment(&args, test, mem_type, - &errmsg)) { + if (uffd_test_ctx_init(&gopts, test->uffd_feature_required, &errmsg)) { uffd_test_skip(errmsg); continue; } - test->uffd_fn(&args); - uffd_test_ctx_clear(); + test->uffd_fn(&gopts, &args); + uffd_test_ctx_clear(&gopts); } } diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c index 78038c40aaaf..4e4a591cf527 100644 --- a/tools/testing/selftests/mm/uffd-wp-mremap.c +++ b/tools/testing/selftests/mm/uffd-wp-mremap.c @@ -152,7 +152,8 @@ static bool range_is_swapped(void *addr, size_t size) return true; } -static void test_one_folio(size_t size, bool private, bool swapout, bool hugetlb) +static void test_one_folio(uffd_global_test_opts_t *gopts, size_t size, bool private, + bool swapout, bool hugetlb) { struct uffdio_writeprotect wp_prms; uint64_t features = 0; @@ -176,21 +177,21 @@ static void test_one_folio(size_t size, bool private, bool swapout, bool hugetlb } /* Register range for uffd-wp. 
*/ - if (userfaultfd_open(&features)) { + if (userfaultfd_open(gopts, &features)) { if (errno == ENOENT) ksft_test_result_skip("userfaultfd not available\n"); else ksft_test_result_fail("userfaultfd_open() failed\n"); goto out; } - if (uffd_register(uffd, mem, size, false, true, false)) { + if (uffd_register(gopts->uffd, mem, size, false, true, false)) { ksft_test_result_fail("uffd_register() failed\n"); goto out; } wp_prms.mode = UFFDIO_WRITEPROTECT_MODE_WP; wp_prms.range.start = (uintptr_t)mem; wp_prms.range.len = size; - if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp_prms)) { + if (ioctl(gopts->uffd, UFFDIO_WRITEPROTECT, &wp_prms)) { ksft_test_result_fail("ioctl(UFFDIO_WRITEPROTECT) failed\n"); goto out; } @@ -237,9 +238,9 @@ static void test_one_folio(size_t size, bool private, bool swapout, bool hugetlb out: if (mem) munmap(mem, size); - if (uffd >= 0) { - close(uffd); - uffd = -1; + if (gopts->uffd >= 0) { + close(gopts->uffd); + gopts->uffd = -1; } } @@ -331,6 +332,7 @@ static const struct testcase testcases[] = { int main(int argc, char **argv) { + uffd_global_test_opts_t gopts = { 0 }; struct thp_settings settings; int i, j, plan = 0; @@ -362,8 +364,8 @@ int main(int argc, char **argv) const struct testcase *tc = &testcases[i]; for (j = 0; j < *tc->nr_sizes; j++) - test_one_folio(tc->sizes[j], tc->private, tc->swapout, - tc->hugetlb); + test_one_folio(&gopts, tc->sizes[j], tc->private, + tc->swapout, tc->hugetlb); } /* If THP is supported, restore original THP settings. */ From 1bca7359d72f759f2b119fc324c138372d3a2cc0 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 29 Aug 2025 13:44:40 +0200 Subject: [PATCH 182/372] fork: check charging success before zeroing stack Patch series "mm: task_stack: Stack handling cleanups". These are some small cleanups for the fork code that were split off from Pasha's dynamic stack patch series; they are generally nice on their own, so let's propose them for merging. This patch (of 2): No need to zero the cached stack if the memcg charge fails, so move the charging attempt before the memset operation. Link: https://lkml.kernel.org/r/20250829-fork-cleanups-for-dynstack-v1-0-3bbaadce1f00@linaro.org Link: https://lkml.kernel.org/r/20250829-fork-cleanups-for-dynstack-v1-1-3bbaadce1f00@linaro.org Signed-off-by: Pasha Tatashin Link: https://lore.kernel.org/20240311164638.2015063-6-pasha.tatashin@soleen.com Signed-off-by: Linus Walleij Reviewed-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Cc: Ben Segall Cc: David Hildenbrand Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/fork.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 5115be549234..157612fd669a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -290,6 +290,11 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) if (!vm_area) continue; + if (memcg_charge_kernel_stack(vm_area)) { + vfree(vm_area->addr); + return -ENOMEM; + } + /* Reset stack metadata. */ kasan_unpoison_range(vm_area->addr, THREAD_SIZE); @@ -298,11 +303,6 @@ static int alloc_thread_stack_node(struct task_struct *tsk, int node) /* Clear stale pointers from reused stack.
*/ memset(stack, 0, THREAD_SIZE); - if (memcg_charge_kernel_stack(vm_area)) { - vfree(vm_area->addr); - return -ENOMEM; - } - tsk->stack_vm_area = vm_area; tsk->stack = stack; return 0; From 783dbe472d4af1704febf7c52b0ae9262220bf1b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 29 Aug 2025 13:44:41 +0200 Subject: [PATCH 183/372] task_stack.h: clean-up stack_not_used() implementation Inside the small stack_not_used() function there are several ifdefs for stack growing-up vs. regular versions. Instead, just implement the function twice: one version for growing-up stacks and one for the regular case. Add comments like /* !CONFIG_DEBUG_STACK_USAGE */ to clarify what the ifdefs are doing. [linus.walleij@linaro.org: rebased, function moved elsewhere in the kernel] Link: https://lkml.kernel.org/r/20250829-fork-cleanups-for-dynstack-v1-2-3bbaadce1f00@linaro.org Signed-off-by: Pasha Tatashin Link: https://lore.kernel.org/20240311164638.2015063-13-pasha.tatashin@soleen.com Signed-off-by: Linus Walleij Reviewed-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Cc: Ben Segall Cc: David Hildenbrand Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Kees Cook Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Suren Baghdasaryan Cc: Valentin Schneider Cc: Vincent Guittot Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/exit.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 343eb97543d5..9f74e8f1c431 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -780,24 +780,29 @@ static void exit_notify(struct task_struct *tsk, int group_dead) } #ifdef CONFIG_DEBUG_STACK_USAGE +#ifdef CONFIG_STACK_GROWSUP unsigned long stack_not_used(struct task_struct *p) { unsigned long *n = end_of_stack(p); do { /* Skip over canary */ -# ifdef CONFIG_STACK_GROWSUP n--; -# else - n++; -# endif } while (!*n); -# ifdef CONFIG_STACK_GROWSUP return (unsigned long)end_of_stack(p) - (unsigned long)n; -# else - return (unsigned long)n - (unsigned long)end_of_stack(p); -# endif } +#else /* !CONFIG_STACK_GROWSUP */ +unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ + n++; + } while (!*n); + + return (unsigned long)n - (unsigned long)end_of_stack(p); +} +#endif /* CONFIG_STACK_GROWSUP */ /* Count the maximum pages reached in kernel stacks */ static inline void kstack_histogram(unsigned long used_stack) @@ -856,9 +861,9 @@ static void check_stack_usage(void) } spin_unlock(&low_water_lock); } -#else +#else /* !CONFIG_DEBUG_STACK_USAGE */ static inline void check_stack_usage(void) {} -#endif +#endif /* CONFIG_DEBUG_STACK_USAGE */ static void synchronize_group_exit(struct task_struct *tsk, long code) { From 5a00878f78cf4ef12bd5dcea88ed4dd1ef0c7433 Mon Sep 17 00:00:00 2001 From: Joey Pabalinas Date: Sun, 31 Aug 2025 01:47:48 -1000 Subject: [PATCH 184/372] mm/memfd: remove redundant casts MFD_ALL_FLAGS is already an unsigned int. Remove redundant casts to unsigned int.
Link: https://lkml.kernel.org/r/efbbe6093b64a5b19f974871d5262d6e75dff2c0.1756639225.git.joeypabalinas@gmail.com Signed-off-by: Joey Pabalinas Cc: Baolin Wang Cc: Hugh Dickins Cc: Joey Pabalinas Signed-off-by: Andrew Morton --- mm/memfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index bbe679895ef6..1d109c1acf21 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -385,11 +385,11 @@ static int sanitize_flags(unsigned int *flags_ptr) unsigned int flags = *flags_ptr; if (!(flags & MFD_HUGETLB)) { - if (flags & ~(unsigned int)MFD_ALL_FLAGS) + if (flags & ~MFD_ALL_FLAGS) return -EINVAL; } else { /* Allow huge page size encoding in flags. */ - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + if (flags & ~(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) return -EINVAL; } From 4fa5b88e772372e7ea3faccd0bbab03cb32104ed Mon Sep 17 00:00:00 2001 From: Kaushlendra Kumar Date: Sat, 30 Aug 2025 22:50:22 +0530 Subject: [PATCH 185/372] tools/mm/slabinfo: fix access to null terminator in string boundary The current code incorrectly accesses buffer[strlen(buffer)], which points to the null terminator ('\0') at the end of the string. Since the valid string content ends at index strlen(buffer)-1, the old comparison always tested the terminator, could never match '\n', and therefore never stripped the trailing newline. Fix by: 1. Declaring strlen() result variable at function scope 2. Adding bounds check (len > 0) to handle empty strings 3. Using buffer[len-1] to correctly access the last character before the null terminator (For example, if fgets() stored "42\n", len is 3 and the newline at buffer[2] is now correctly replaced with '\0'.) [kaushlendra.kumar@intel.com: remove unnecessary blank line] Link: https://lkml.kernel.org/r/20250901044955.3902815-1-kaushlendra.kumar@intel.com Link: https://lkml.kernel.org/r/20250830172022.1927448-1-kaushlendra.kumar@intel.com Signed-off-by: Kaushlendra Kumar Acked-by: SeongJae Park Signed-off-by: Andrew Morton --- tools/mm/slabinfo.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c index 1433eff99feb..80cdbd3db82d 100644 --- a/tools/mm/slabinfo.c +++ b/tools/mm/slabinfo.c @@ -155,6 +155,7 @@ static void usage(void) static unsigned long read_obj(const char *name) { + size_t len; FILE *f = fopen(name, "r"); if (!f) { @@ -165,8 +166,10 @@ static unsigned long read_obj(const char *name) if (!fgets(buffer, sizeof(buffer), f)) buffer[0] = 0; fclose(f); - if (buffer[strlen(buffer)] == '\n') - buffer[strlen(buffer)] = 0; + len = strlen(buffer); + + if (len > 0 && buffer[len - 1] == '\n') + buffer[len - 1] = 0; } return strlen(buffer); } From f8f03eb5f0f91fddc9bb8563c7e82bd7d3ba1dd0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:22 +0200 Subject: [PATCH 186/372] mm: stop making SPARSEMEM_VMEMMAP user-selectable Patch series "mm: remove nth_page()", v2. As discussed recently with Linus, nth_page() is just nasty and we would like to remove it. To recap, the reason we currently need nth_page() within a folio is because on some kernel configs (SPARSEMEM without SPARSEMEM_VMEMMAP), the memmap is allocated per memory section. While buddy allocations cannot cross memory section boundaries, hugetlb and dax folios can. So crossing a memory section means that "page++" could do the wrong thing. Instead, nth_page() on these problematic configs always goes from page->pfn, to then go from (++pfn)->page, which is rather nasty. Likely, many people have no idea when nth_page() is required and when it might be dropped. We refer to such problematic PFN ranges as "non-contiguous pages". If we only deal with "contiguous pages", there is no need for nth_page().
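For reference, a minimal sketch of the two flavors being contrasted here, modeled on the nth_page() definition in include/linux/mm.h (treat the exact spelling as illustrative):

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
/* The memmap is only contiguous within a memory section: detour through the PFN. */
#define nth_page(page, n)	pfn_to_page(page_to_pfn(page) + (n))
#else
/* The memmap is virtually contiguous: plain pointer arithmetic is enough. */
#define nth_page(page, n)	((page) + (n))
#endif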
Besides that "obvious" folio case, we might end up using nth_page() within CMA allocations (again, could span memory sections), and in one corner case (kfence) when processing memblock allocations (again, could span memory sections). So let's handle all that, add sanity checks, and remove nth_page(). Patch #1 -> #5 : stop making SPARSEMEM_VMEMMAP user-selectable + cleanups Patch #6 -> #13 : disallow folios to have non-contiguous pages Patch #14 -> #20 : remove nth_page() usage within folios Patch #22 : disallow CMA allocations of non-contiguous pages Patch #23 -> #33 : sanity-check + remove nth_page() usage within SG entry Patch #34 : sanity-check + remove nth_page() usage in unpin_user_page_range_dirty_lock() Patch #35 : remove nth_page() in kfence Patch #36 : adjust stale comment regarding nth_page Patch #37 : mm: remove nth_page() A lot of this is inspired by the discussion at [1] between Linus, Jason and me, so kudos to them. This patch (of 37): In an ideal world, we wouldn't have to deal with SPARSEMEM without SPARSEMEM_VMEMMAP, but in particular for 32bit SPARSEMEM_VMEMMAP is considered too costly and consequently not supported. However, if an architecture does support SPARSEMEM with SPARSEMEM_VMEMMAP, let's forbid the user from disabling VMEMMAP: just like we already do for arm64, s390 and x86. So if SPARSEMEM_VMEMMAP is supported, don't allow using SPARSEMEM without SPARSEMEM_VMEMMAP. This implies that the option to not use SPARSEMEM_VMEMMAP will now be gone for loongarch, powerpc, riscv and sparc. All architectures only enable SPARSEMEM_VMEMMAP with 64bit support, so there should not really be a big downside to using the VMEMMAP (quite the contrary). This is a preparation for not supporting (1) folio sizes that exceed a single memory section and (2) CMA allocations of non-contiguous page ranges in SPARSEMEM without SPARSEMEM_VMEMMAP configs, whereby we want to limit the possible impact as much as possible (e.g., gigantic hugetlb page allocations suddenly fail). Link: https://lkml.kernel.org/r/20250901150359.867252-1-david@redhat.com Link: https://lkml.kernel.org/r/20250901150359.867252-2-david@redhat.com Link: https://lore.kernel.org/all/CAHk-=wiCYfNp4AJLBORU-c7ZyRBUp66W2-Et6cdQ4REx-GyQ_A@mail.gmail.com/T/#u [1] Signed-off-by: David Hildenbrand Acked-by: Zi Yan Acked-by: Mike Rapoport (Microsoft) Acked-by: SeongJae Park Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Huacai Chen Cc: WANG Xuerui Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Alexandre Ghiti Cc: "David S. Miller" Cc: Andreas Larsson Cc: Alexander Gordeev Cc: Alexander Potapenko Cc: Alexandru Elisei Cc: Alex Dubov Cc: Alex Williamson Cc: Bart van Assche Cc: Borislav Petkov Cc: Brendan Jackman Cc: Brett Creeley Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter (Ampere) Cc: Damien Le Moal Cc: Dave Airlie Cc: Dennis Zhou Cc: Dmitry Vyukov Cc: Doug Gilbert Cc: Heiko Carstens Cc: Herbert Xu Cc: Ingo Molnar Cc: Inki Dae Cc: James Bottomley Cc: Jani Nikula Cc: Jason A. Donenfeld Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Jesper Nilsson Cc: Johannes Weiner Cc: John Hubbard Cc: Joonas Lahtinen Cc: Kevin Tian Cc: Lars Persson Cc: Linus Torvalds Cc: Marco Elver Cc: "Martin K.
Petersen" Cc: Maxim Levitky Cc: Michal Hocko Cc: Muchun Song Cc: Niklas Cassel Cc: Oscar Salvador Cc: Pavel Begunkov Cc: Peter Xu Cc: Robin Murohy Cc: Rodrigo Vivi Cc: Shameerali Kolothum Thodi Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Tejun Heo Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Tvrtko Ursulin Cc: Ulf Hansson Cc: Vasily Gorbik Cc: Vlastimil Babka Cc: Will Deacon Cc: Yishai Hadas Signed-off-by: Andrew Morton --- mm/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index b971d35c43c3..d1ed839ca710 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -412,9 +412,8 @@ config SPARSEMEM_VMEMMAP_ENABLE bool config SPARSEMEM_VMEMMAP - bool "Sparse Memory virtual memmap" + def_bool y depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE - default y help SPARSEMEM_VMEMMAP uses a virtually mapped memmap to optimise pfn_to_page and page_to_pfn operations. This is the most From 84188a3ab578eb5822c668ba6980f3e68e0f8244 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:23 +0200 Subject: [PATCH 187/372] arm64: Kconfig: drop superfluous "select SPARSEMEM_VMEMMAP" Now handled by the core automatically once SPARSEMEM_VMEMMAP_ENABLE is selected. Link: https://lkml.kernel.org/r/20250901150359.867252-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport (Microsoft) Acked-by: Catalin Marinas Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e9bbfacc35a6..b1d1f2ff2493 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1570,7 +1570,6 @@ source "kernel/Kconfig.hz" config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_VMEMMAP_ENABLE - select SPARSEMEM_VMEMMAP config HW_PERF_EVENTS def_bool y From a2f0cbea29d34c8b581ee335b4e59d213bb324fc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:24 +0200 Subject: [PATCH 188/372] s390/Kconfig: drop superfluous "select SPARSEMEM_VMEMMAP" Now handled by the core automatically once SPARSEMEM_VMEMMAP_ENABLE is selected. Link: https://lkml.kernel.org/r/20250901150359.867252-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Signed-off-by: Andrew Morton --- arch/s390/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bf680c26a33c..145ca23c2fff 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -710,7 +710,6 @@ menu "Memory setup" config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_VMEMMAP_ENABLE - select SPARSEMEM_VMEMMAP config ARCH_SPARSEMEM_DEFAULT def_bool y From 016496e3e3cc33421a7b0f4971141ffeec44f04b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:25 +0200 Subject: [PATCH 189/372] x86/Kconfig: drop superfluous "select SPARSEMEM_VMEMMAP" Now handled by the core automatically once SPARSEMEM_VMEMMAP_ENABLE is selected. Link: https://lkml.kernel.org/r/20250901150359.867252-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 58d890fe2100..e431d1c06fec 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1552,7 +1552,6 @@ config ARCH_SPARSEMEM_ENABLE def_bool y select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 - select SPARSEMEM_VMEMMAP if X86_64 config ARCH_SPARSEMEM_DEFAULT def_bool X86_64 || (NUMA && X86_32) From 3b864c8f557a52c142905575157c69be20899f09 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:26 +0200 Subject: [PATCH 190/372] wireguard: selftests: remove CONFIG_SPARSEMEM_VMEMMAP=y from qemu kernel config It's no longer user-selectable (and the default was already "y"), so let's just drop it. It was never really relevant to the wireguard selftests either way. Link: https://lkml.kernel.org/r/20250901150359.867252-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: "Jason A. Donenfeld" Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/wireguard/qemu/kernel.config | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config index 0a5381717e9f..1149289f4b30 100644 --- a/tools/testing/selftests/wireguard/qemu/kernel.config +++ b/tools/testing/selftests/wireguard/qemu/kernel.config @@ -48,7 +48,6 @@ CONFIG_JUMP_LABEL=y CONFIG_FUTEX=y CONFIG_SHMEM=y CONFIG_SLUB=y -CONFIG_SPARSEMEM_VMEMMAP=y CONFIG_SMP=y CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y From 0bf2edf041dcb0b304a8dbda8c699771d5a245d2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:27 +0200 Subject: [PATCH 191/372] mm/page_alloc: reject unreasonable folio/compound page sizes in alloc_contig_range_noprof() Let's reject them early, which in turn makes folio_alloc_gigantic() reject them properly. To avoid converting from order to nr_pages, let's just add MAX_FOLIO_ORDER and calculate MAX_FOLIO_NR_PAGES based on that. While at it, let's just make the order a "const unsigned order". Link: https://lkml.kernel.org/r/20250901150359.867252-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: SeongJae Park Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- mm/page_alloc.c | 10 +++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 00c8a54127d3..77737cbf2216 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2055,11 +2055,13 @@ static inline long folio_nr_pages(const struct folio *folio) /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) +#define MAX_FOLIO_ORDER PUD_ORDER #else -#define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES +#define MAX_FOLIO_ORDER MAX_PAGE_ORDER #endif +#define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) + /* * compound_nr() returns the number of pages in this potentially compound * page. 
compound_nr() can be called on a tail page, and is defined to diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0873d640f26c..54dbb6f0d14e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6838,6 +6838,7 @@ static int __alloc_contig_verify_gfp_mask(gfp_t gfp_mask, gfp_t *gfp_cc_mask) int alloc_contig_range_noprof(unsigned long start, unsigned long end, acr_flags_t alloc_flags, gfp_t gfp_mask) { + const unsigned int order = ilog2(end - start); unsigned long outer_start, outer_end; int ret = 0; @@ -6855,6 +6856,14 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, PB_ISOLATE_MODE_CMA_ALLOC : PB_ISOLATE_MODE_OTHER; + /* + * In contrast to the buddy, we allow for orders here that exceed + * MAX_PAGE_ORDER, so we must manually make sure that we are not + * exceeding the maximum folio order. + */ + if (WARN_ON_ONCE((gfp_mask & __GFP_COMP) && order > MAX_FOLIO_ORDER)) + return -EINVAL; + gfp_mask = current_gfp_context(gfp_mask); if (__alloc_contig_verify_gfp_mask(gfp_mask, (gfp_t *)&cc.gfp_mask)) return -EINVAL; @@ -6952,7 +6961,6 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, free_contig_range(end, outer_end - end); } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) { struct page *head = pfn_to_page(start); - int order = ilog2(end - start); check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); From 646b67d575897dc656b07446e23e756d54e49828 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:28 +0200 Subject: [PATCH 192/372] mm/memremap: reject unreasonable folio/compound page sizes in memremap_pages() Let's reject unreasonable folio sizes early, where we can still fail. We'll add sanity checks to prep_compound_head()/prep_compound_page() next. Is there a way to configure a system such that unreasonable folio sizes would be possible? Such a setup would already be rather questionable. If so, we'd probably want to bail out earlier, where we can avoid a WARN and just report a proper error message that indicates what went wrong. Link: https://lkml.kernel.org/r/20250901150359.867252-8-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: SeongJae Park Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/memremap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memremap.c b/mm/memremap.c index b0ce0d8254bd..a2d4bb88f64b 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -275,6 +275,9 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) if (WARN_ONCE(!nr_range, "nr_range must be specified\n")) return ERR_PTR(-EINVAL); + if (WARN_ONCE(pgmap->vmemmap_shift > MAX_FOLIO_ORDER, + "requested folio size unsupported\n")) + return ERR_PTR(-EINVAL); switch (pgmap->type) { case MEMORY_DEVICE_PRIVATE: From 7b4f21f5e0386dfe02c68c009294d8f26e3c1bad Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:29 +0200 Subject: [PATCH 193/372] mm/hugetlb: check for unreasonable folio sizes when registering hstate Let's check that no hstate that corresponds to an unreasonable folio size is registered by an architecture. If we were to succeed in registering one, we could later try allocating an unsupported gigantic folio size. Further, let's add a BUILD_BUG_ON() for checking that HUGETLB_PAGE_ORDER is sane at build time. As HUGETLB_PAGE_ORDER is dynamic on powerpc, we have to use a BUILD_BUG_ON_INVALID() to make it compile.
No existing kernel configuration should be able to trigger this check: either SPARSEMEM without SPARSEMEM_VMEMMAP cannot be configured or gigantic folios will not exceed a memory section (the case on sparc). Link: https://lkml.kernel.org/r/20250901150359.867252-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1e777cc51ad0..d3542e92a712 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4657,6 +4657,7 @@ static int __init hugetlb_init(void) BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < __NR_HPAGEFLAGS); + BUILD_BUG_ON_INVALID(HUGETLB_PAGE_ORDER > MAX_FOLIO_ORDER); if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) @@ -4740,6 +4741,7 @@ void __init hugetlb_add_hstate(unsigned int order) } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); + WARN_ON(order > MAX_FOLIO_ORDER); h = &hstates[hugetlb_max_hstate++]; __mutex_init(&h->resize_lock, "resize mutex", &h->resize_key); h->order = order; From 50765b46ab44544f661fb70923871f8d06381015 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:30 +0200 Subject: [PATCH 194/372] mm/mm_init: make memmap_init_compound() look more like prep_compound_page() Grepping for "prep_compound_page" leaves one clueless as to how devdax gets its compound pages initialized. Let's add a comment that might help find this open-coded prep_compound_page() initialization more easily. Further, let's be less smart about the ordering of initialization and just perform the prep_compound_head() call after all tail pages have been initialized: just like prep_compound_page() does. No need for a comment to describe the initialization order: again, just like prep_compound_page(). Link: https://lkml.kernel.org/r/20250901150359.867252-10-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mm_init.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 5c21b3af216b..df614556741a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1091,6 +1091,12 @@ static void __ref memmap_init_compound(struct page *head, unsigned long pfn, end_pfn = head_pfn + nr_pages; unsigned int order = pgmap->vmemmap_shift; + /* + * We have to initialize the pages, including setting up page links. + * prep_compound_page() does not take care of that, so instead we + * open-code prep_compound_page() so we can take care of initializing + * the pages in the same go. + */ __SetPageHead(head); for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); @@ -1098,15 +1104,8 @@ static void __ref memmap_init_compound(struct page *head, __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); prep_compound_tail(head, pfn - head_pfn); set_page_count(page, 0); - - /* - * The first tail page stores important compound page info. - * Call prep_compound_head() after the first tail page has - * been initialized, to not have the data overwritten.
- */ - if (pfn == head_pfn + 1) - prep_compound_head(head, order); } + prep_compound_head(head, order); } void __ref memmap_init_zone_device(struct zone *zone, From 99132d24d76511f0f6ded2e25448765b632794aa Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:31 +0200 Subject: [PATCH 195/372] mm: sanity-check maximum folio size in folio_set_order() Let's sanity-check in folio_set_order() whether we would be trying to create a folio with an order that would make it exceed MAX_FOLIO_ORDER. This will enable the check whenever a folio/compound page is initialized through prep_compound_head() / prep_compound_page() with CONFIG_DEBUG_VM set. Link: https://lkml.kernel.org/r/20250901150359.867252-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/internal.h b/mm/internal.h index 45da9ff5694f..9b0129531d00 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -755,6 +755,7 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) { if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; + VM_WARN_ON_ONCE(order > MAX_FOLIO_ORDER); folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef NR_PAGES_IN_LARGE_FOLIO From 4751c39eee0c3fcc742aa7d7242ce2b78faa3606 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:32 +0200 Subject: [PATCH 196/372] mm: limit folio/compound page sizes in problematic kernel configs Let's limit the maximum folio size in problematic kernel configs where the memmap is allocated per memory section (SPARSEMEM without SPARSEMEM_VMEMMAP) to a single memory section. Currently, only a single architecture supports ARCH_HAS_GIGANTIC_PAGE but not SPARSEMEM_VMEMMAP: sh. Fortunately, the biggest hugetlb size sh supports is 64 MiB (HUGETLB_PAGE_SIZE_64MB) and the section size is at least 64 MiB (SECTION_SIZE_BITS == 26), so their use case is not degraded. As folios and memory sections are naturally aligned to their power-of-two size in memory, a single folio can consequently no longer span multiple memory sections on these problematic kernel configs. nth_page() is no longer required when operating within a single compound page / folio. Link: https://lkml.kernel.org/r/20250901150359.867252-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 77737cbf2216..2dee79fa2efc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2053,11 +2053,25 @@ static inline long folio_nr_pages(const struct folio *folio) return folio_large_nr_pages(folio); } -/* Only hugetlbfs can allocate folios larger than MAX_ORDER */ -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -#define MAX_FOLIO_ORDER PUD_ORDER -#else +#if !defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) +/* + * We don't expect any folios that exceed buddy sizes (and consequently + * memory sections). + */ #define MAX_FOLIO_ORDER MAX_PAGE_ORDER +#elif defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/* + * Only pages within a single memory section are guaranteed to be + * contiguous.
By limiting folios to a single memory section, all folio + * pages are guaranteed to be contiguous. + */ +#define MAX_FOLIO_ORDER PFN_SECTION_SHIFT +#else +/* + * There is no real limit on the folio size. We limit them to the maximum we + * currently expect (e.g., hugetlb, dax). + */ +#define MAX_FOLIO_ORDER PUD_ORDER #endif #define MAX_FOLIO_NR_PAGES (1UL << MAX_FOLIO_ORDER) From 73b3294b1152e94c1971a735b8db8c7503fd97a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:33 +0200 Subject: [PATCH 197/372] mm: simplify folio_page() and folio_page_idx() Now that a single folio/compound page can no longer span memory sections in problematic kernel configurations, we can stop using nth_page() in folio_page() and folio_page_idx(). While at it, turn both macros into static inline functions and add kernel doc for folio_page_idx(). Link: https://lkml.kernel.org/r/20250901150359.867252-13-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Wei Yang Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ++++++++++++++-- include/linux/page-flags.h | 5 ++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2dee79fa2efc..f6880e3225c5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -210,10 +210,8 @@ extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) -#define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) -#define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ @@ -225,6 +223,20 @@ extern unsigned long sysctl_admin_reserve_kbytes; /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) +/** + * folio_page_idx - Return the number of a page in a folio. + * @folio: The folio. + * @page: The folio page. + * + * This function expects that the page is actually part of the folio. + * The returned number is relative to the start of the folio. + */ +static inline unsigned long folio_page_idx(const struct folio *folio, + const struct page *page) +{ + return page - &folio->page; +} + static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d53a86e68c89..a88b61eec3f8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,7 +316,10 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -#define folio_page(folio, n) nth_page(&(folio)->page, n) +static inline struct page *folio_page(struct folio *folio, unsigned long n) +{ + return &folio->page + n; +} static __always_inline int PageTail(const struct page *page) { From 372c9b5491d2d8e85a8e1b8b74d4116654f5d816 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:34 +0200 Subject: [PATCH 198/372] mm/hugetlb: cleanup hugetlb_folio_init_tail_vmemmap() We can now safely iterate over all pages in a folio, so no need for the pfn_to_page(). 
Also, as we already force the refcount in __init_single_page() to 1 through init_page_count(), we can just set the refcount to 0 and avoid page_ref_freeze() + VM_BUG_ON. Likely, in the future, we will just want to tell __init_single_page() which value to initialize the refcount to. Further, adjust the comments to highlight that we are dealing with an open-coded prep_compound_page() variant, and add another comment explaining why we really need the __init_single_page() only on the tail pages. Note that the current code was likely problematic, but we never ran into it: prep_compound_tail() would have been called with an offset that might exceed a memory section, and prep_compound_tail() would have simply added that offset to the page pointer -- which would not have done the right thing on sparsemem without vmemmap. Link: https://lkml.kernel.org/r/20250901150359.867252-14-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mike Rapoport (Microsoft) Reviewed-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/hugetlb.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d3542e92a712..56e6d2af0843 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3237,17 +3237,18 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, { enum zone_type zone = zone_idx(folio_zone(folio)); int nid = folio_nid(folio); + struct page *page = folio_page(folio, start_page_number); unsigned long head_pfn = folio_pfn(folio); unsigned long pfn, end_pfn = head_pfn + end_page_number; - int ret; - - for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { - struct page *page = pfn_to_page(pfn); + /* + * As we marked all tail pages with memblock_reserved_mark_noinit(), + * we must initialize them ourselves here. + */ + for (pfn = head_pfn + start_page_number; pfn < end_pfn; page++, pfn++) { __init_single_page(page, pfn, zone, nid); prep_compound_tail((struct page *)folio, pfn - head_pfn); - ret = page_ref_freeze(page, 1); - VM_BUG_ON(!ret); + set_page_count(page, 0); } } @@ -3257,12 +3258,15 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, { int ret; - /* Prepare folio head */ + /* + * This is an open-coded prep_compound_page() whereby we avoid + * walking pages twice by initializing/preparing+freezing them in the + * same go. + */ __folio_clear_reserved(folio); __folio_set_head(folio); ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); - /* Initialize the necessary tail struct pages */ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); prep_compound_head((struct page *)folio, huge_page_order(h)); } From cb77aa60a0a45db626b7b9db864be38a150a06f7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:35 +0200 Subject: [PATCH 199/372] mm/percpu-km: drop nth_page() usage within single allocation We're allocating a higher-order page from the buddy. For these pages (that are guaranteed to not exceed a single memory section) there is no need to use nth_page(). Link: https://lkml.kernel.org/r/20250901150359.867252-15-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Acked-by: Liam R.
Howlett Signed-off-by: Andrew Morton --- mm/percpu-km.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/percpu-km.c b/mm/percpu-km.c index fe31aa19db81..4efa74a495cb 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -69,7 +69,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) } for (i = 0; i < nr_pages; i++) - pcpu_set_page_chunk(nth_page(pages, i), chunk); + pcpu_set_page_chunk(pages + i, chunk); chunk->data = pages; chunk->base_addr = page_address(pages); From 06d42cf49eb740da7dbc4c5fc138a0fdce67417c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:36 +0200 Subject: [PATCH 200/372] fs: hugetlbfs: remove nth_page() usage within folio in adjust_range_hwpoison() nth_page() is not really required anymore, so let's remove it. Link: https://lkml.kernel.org/r/20250901150359.867252-16-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 34d496a2b7de..c5a46d10afaa 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -217,7 +217,7 @@ static size_t adjust_range_hwpoison(struct folio *folio, size_t offset, break; offset += n; if (offset == PAGE_SIZE) { - page = nth_page(page, 1); + page++; offset = 0; } } From a638ee7f197f4596adeb4b15e5369d055e042635 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:37 +0200 Subject: [PATCH 201/372] fs: hugetlbfs: cleanup folio in adjust_range_hwpoison() Let's clean up and simplify the function a bit. Link: https://lkml.kernel.org/r/20250901150359.867252-17-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5a46d10afaa..3cfdf4091001 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -192,37 +192,25 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset. * Returns the maximum number of bytes one can read without touching the 1st raw * HWPOISON page. - * - * The implementation borrows the iteration logic from copy_page_to_iter*. */ static size_t adjust_range_hwpoison(struct folio *folio, size_t offset, size_t bytes) { - struct page *page; - size_t n = 0; - size_t res = 0; + struct page *page = folio_page(folio, offset / PAGE_SIZE); + size_t safe_bytes; - /* First page to start the loop. */ - page = folio_page(folio, offset / PAGE_SIZE); - offset %= PAGE_SIZE; - while (1) { + if (is_raw_hwpoison_page_in_hugepage(page)) + return 0; + /* Safe to read the remaining bytes in this page. */ + safe_bytes = PAGE_SIZE - (offset % PAGE_SIZE); + page++; + + /* Check each remaining page as long as we are not done yet. */ + for (; safe_bytes < bytes; safe_bytes += PAGE_SIZE, page++) if (is_raw_hwpoison_page_in_hugepage(page)) break; - /* Safe to read n bytes without touching HWPOISON subpage.
*/ - n = min(bytes, (size_t)PAGE_SIZE - offset); - res += n; - bytes -= n; - if (!bytes || !n) - break; - offset += n; - if (offset == PAGE_SIZE) { - page++; - offset = 0; - } - } - - return res; + return min(safe_bytes, bytes); } /* From 1a55ac6068ae4ca2024164c6d484d681a3f98299 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:38 +0200 Subject: [PATCH 202/372] mm/pagewalk: drop nth_page() usage within folio in folio_walk_start() It's no longer required to use nth_page() within a folio, so let's just drop the nth_page() in folio_walk_start(). Link: https://lkml.kernel.org/r/20250901150359.867252-18-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/pagewalk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c6753d370ff4..9e4225e5fcf5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1004,7 +1004,7 @@ not_found: found: if (expose_page) /* Note: Offset from the mapped page, not the folio start. */ - fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT); + fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT); else fw->page = NULL; fw->ptl = ptl; From 541541dbfeb84f0995ad1ec54a643c72081d46fc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:39 +0200 Subject: [PATCH 203/372] mm/gup: drop nth_page() usage within folio when recording subpages nth_page() is no longer required when iterating over pages within a single folio, so let's just drop it when recording subpages. Link: https://lkml.kernel.org/r/20250901150359.867252-19-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/gup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index ed02d65f9c72..e1bd2b2e9145 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -488,12 +488,11 @@ static int record_subpages(struct page *page, unsigned long sz, unsigned long addr, unsigned long end, struct page **pages) { - struct page *start_page; int nr; - start_page = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT); + page += (addr & (sz - 1)) >> PAGE_SHIFT; for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) - pages[nr] = nth_page(start_page, nr); + pages[nr] = page++; return nr; } @@ -1512,7 +1511,7 @@ next_page: } for (j = 0; j < page_increm; j++) { - subpage = nth_page(page, j); + subpage = page + j; pages[i + j] = subpage; flush_anon_page(vma, subpage, start + j * PAGE_SIZE); flush_dcache_page(subpage); From e3c05b6e370c22451f87f2d230d131ad1bb49a60 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:40 +0200 Subject: [PATCH 204/372] mm/gup: remove record_subpages() We can clean up the code by calculating the number of refs earlier, so we can just inline what remains of record_subpages(). Calculate the number of references/pages ahead of time, and record them only once all our tests have passed.
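As an aside (not part of the patch): the index arithmetic that replaces record_subpages() can be sanity-checked in plain userspace C. The constants below mimic 4 KiB pages with 2 MiB PMDs, and all values are made up for illustration:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PMD_SHIFT	21
#define PMD_MASK	(~((1UL << PMD_SHIFT) - 1))

int main(void)
{
	/* a GUP-fast walk covering pages 5..8 of one PMD-sized page */
	unsigned long pmd_base = 0x40000000UL;
	unsigned long addr = pmd_base + 5 * PAGE_SIZE;
	unsigned long end = pmd_base + 9 * PAGE_SIZE;

	/* number of refs, computed before any page pointer is recorded */
	unsigned long refs = (end - addr) >> PAGE_SHIFT;
	/* first subpage index, as in gup_fast_pmd_leaf() after the patch */
	unsigned long idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;

	printf("refs=%lu first_subpage=%lu\n", refs, idx);	/* 4 and 5 */
	return 0;
}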
[david@redhat.com: fix `pages' adjustment] Link: https://lkml.kernel.org/r/cc7f03f8-da8b-407e-a03a-e8e5a9ec5462@redhat.com Link: https://lkml.kernel.org/r/20250901150359.867252-20-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Eric Biggers Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/gup.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index e1bd2b2e9145..b8aee964421c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -484,19 +484,6 @@ static inline void mm_set_has_pinned_flag(struct mm_struct *mm) #ifdef CONFIG_MMU #ifdef CONFIG_HAVE_GUP_FAST -static int record_subpages(struct page *page, unsigned long sz, - unsigned long addr, unsigned long end, - struct page **pages) -{ - int nr; - - page += (addr & (sz - 1)) >> PAGE_SHIFT; - for (nr = 0; addr != end; nr++, addr += PAGE_SIZE) - pages[nr] = page++; - - return nr; -} - /** * try_grab_folio_fast() - Attempt to get or pin a folio in fast path. * @page: pointer to page to be grabbed @@ -2971,8 +2958,8 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (pmd_special(orig)) return 0; - page = pmd_page(orig); - refs = record_subpages(page, PMD_SIZE, addr, end, pages + *nr); + refs = (end - addr) >> PAGE_SHIFT; + page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) @@ -2992,7 +2979,10 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } + pages += *nr; *nr += refs; + for (; refs; refs--) + *(pages++) = page++; folio_set_referenced(folio); return 1; } @@ -3011,8 +3001,8 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, if (pud_special(orig)) return 0; - page = pud_page(orig); - refs = record_subpages(page, PUD_SIZE, addr, end, pages + *nr); + refs = (end - addr) >> PAGE_SHIFT; + page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); folio = try_grab_folio_fast(page, refs, flags); if (!folio) @@ -3033,7 +3023,10 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } + pages += *nr; *nr += refs; + for (; refs; refs--) + *(pages++) = page++; folio_set_referenced(folio); return 1; } From d99c57546d8f8ae4bcd2b5a83d77e248756f0cd1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:41 +0200 Subject: [PATCH 205/372] io_uring/zcrx: remove nth_page() usage within folio Within a folio/compound page, nth_page() is no longer required. Given that we call folio_test_partial_kmap()+kmap_local_page(), the code would already be problematic if the pages spanned multiple folios. So let's just assume that all src pages belong to a single folio/compound page and can be iterated ordinarily. The dst page is currently always a single page, so we're not actually iterating anything.
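Purely illustrative (not from the patch): the offset normalization relied on here can be checked in plain C. kmap_local_page() maps exactly one page, so the copy loop has to land on a precise page plus an in-page offset, and each chunk must stop at the nearer page boundary; values are made up:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define offset_in_page(o)	((o) & (PAGE_SIZE - 1))
#define MIN(a, b)		((a) < (b) ? (a) : (b))

int main(void)
{
	size_t dst_offset = 5000, src_offset = 300, len = 9000;

	/* step the page index, keep only the in-page remainder */
	size_t dst_pg = dst_offset / PAGE_SIZE, d = offset_in_page(dst_offset);
	size_t src_pg = src_offset / PAGE_SIZE, s = offset_in_page(src_offset);
	/* one chunk may not cross a page boundary on either side */
	size_t n = MIN(MIN(PAGE_SIZE - s, PAGE_SIZE - d), len);

	printf("dst page %zu+%zu, src page %zu+%zu, copy %zu bytes\n",
	       dst_pg, d, src_pg, s, n);	/* dst 1+904, src 0+300, 3192 */
	return 0;
}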
Link: https://lkml.kernel.org/r/20250901150359.867252-21-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Pavel Begunkov Reviewed-by: Lorenzo Stoakes Cc: Jens Axboe Cc: Pavel Begunkov Signed-off-by: Andrew Morton --- io_uring/zcrx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index e5ff49f3425e..18c12f4b56b6 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -975,9 +975,9 @@ static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page, if (folio_test_partial_kmap(page_folio(dst_page)) || folio_test_partial_kmap(page_folio(src_page))) { - dst_page = nth_page(dst_page, dst_offset / PAGE_SIZE); + dst_page += dst_offset / PAGE_SIZE; dst_offset = offset_in_page(dst_offset); - src_page = nth_page(src_page, src_offset / PAGE_SIZE); + src_page += src_offset / PAGE_SIZE; src_offset = offset_in_page(src_offset); n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset); n = min(n, len); From b71ddc9ecc4d142465617d60e16a8a6ff154fdea Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:42 +0200 Subject: [PATCH 206/372] mips: mm: convert __flush_dcache_pages() to __flush_dcache_folio_pages() Let's make it clearer that we are operating within a single folio by providing both the folio and the page. This implies that for flush_dcache_folio() we'll now avoid one more page->folio lookup, and that we can safely drop the "nth_page" usage. While at it, drop the "extern" from the function declaration. Link: https://lkml.kernel.org/r/20250901150359.867252-22-david@redhat.com Signed-off-by: David Hildenbrand Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/include/asm/cacheflush.h | 11 +++++++---- arch/mips/mm/cache.c | 8 ++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index 5d283ef89d90..5099c1b65a58 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -50,13 +50,14 @@ extern void (*flush_cache_mm)(struct mm_struct *mm); extern void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn); -extern void __flush_dcache_pages(struct page *page, unsigned int nr); +void __flush_dcache_folio_pages(struct folio *folio, struct page *page, unsigned int nr); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 static inline void flush_dcache_folio(struct folio *folio) { if (cpu_has_dc_aliases) - __flush_dcache_pages(&folio->page, folio_nr_pages(folio)); + __flush_dcache_folio_pages(folio, folio_page(folio, 0), + folio_nr_pages(folio)); else if (!cpu_has_ic_fills_f_dc) folio_set_dcache_dirty(folio); } @@ -64,10 +65,12 @@ static inline void flush_dcache_folio(struct folio *folio) static inline void flush_dcache_page(struct page *page) { + struct folio *folio = page_folio(page); + if (cpu_has_dc_aliases) - __flush_dcache_pages(page, 1); + __flush_dcache_folio_pages(folio, page, 1); else if (!cpu_has_ic_fills_f_dc) - folio_set_dcache_dirty(page_folio(page)); + folio_set_dcache_dirty(folio); } #define flush_dcache_mmap_lock(mapping) do { } while (0) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index bf9a37c60e9f..e3b4224c9a40 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -99,9 +99,9 @@ SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, bytes, return 0; } -void __flush_dcache_pages(struct page *page, 
unsigned int nr) +void __flush_dcache_folio_pages(struct folio *folio, struct page *page, + unsigned int nr) { - struct folio *folio = page_folio(page); struct address_space *mapping = folio_flush_mapping(folio); unsigned long addr; unsigned int i; @@ -117,12 +117,12 @@ void __flush_dcache_pages(struct page *page, unsigned int nr) * get faulted into the tlb (and thus flushed) anyways. */ for (i = 0; i < nr; i++) { - addr = (unsigned long)kmap_local_page(nth_page(page, i)); + addr = (unsigned long)kmap_local_page(page + i); flush_data_cache_page(addr); kunmap_local((void *)addr); } } -EXPORT_SYMBOL(__flush_dcache_pages); +EXPORT_SYMBOL(__flush_dcache_folio_pages); void __flush_anon_page(struct page *page, unsigned long vmaddr) { From 6972706f95926838f9bd3ec2b2393c034bdb85ba Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:43 +0200 Subject: [PATCH 207/372] mm/cma: refuse handing out non-contiguous page ranges Let's disallow handing out PFN ranges with non-contiguous pages, so we can remove the nth_page() usage in __cma_alloc(), and so any callers don't have to worry about that either when wanting to blindly iterate pages. This is really only a problem in configs with SPARSEMEM but without SPARSEMEM_VMEMMAP, and only when we would cross memory sections in some cases. Will this cause harm? Probably not, because it's mostly 32bit that does not support SPARSEMEM_VMEMMAP. If this ever becomes a problem, we could look into allocating the memmap for the memory sections spanned by a single CMA region in one go from memblock. [david@redhat.com: we can have NOMMU configs with SPARSEMEM enabled] Link: https://lkml.kernel.org/r/6ec933b1-b3f7-41c0-95d8-e518bb87375e@redhat.com Link: https://lkml.kernel.org/r/20250901150359.867252-23-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Alexandru Elisei Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++++ mm/cma.c | 39 ++++++++++++++++++++++++--------------- mm/util.c | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f6880e3225c5..2ca1eb2db63e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -209,9 +209,15 @@ extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +bool page_range_contiguous(const struct page *page, unsigned long nr_pages); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #else #define nth_page(page,n) ((page) + (n)) +static inline bool page_range_contiguous(const struct page *page, + unsigned long nr_pages) +{ + return true; +} #endif /* to align the pointer to the (next) page boundary */ diff --git a/mm/cma.c b/mm/cma.c index e56ec64d0567..813e6dc7b095 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -780,10 +780,8 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, unsigned long count, unsigned int align, struct page **pagep, gfp_t gfp) { - unsigned long mask, offset; - unsigned long pfn = -1; - unsigned long start = 0; unsigned long bitmap_maxno, bitmap_no, bitmap_count; + unsigned long start, pfn, mask, offset; int ret = -EBUSY; struct page *page = NULL; @@ -795,7 +793,7 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, if (bitmap_count > bitmap_maxno) goto out; - for (;;) { + for (start = 0; ; start = bitmap_no + mask + 1) { spin_lock_irq(&cma->lock); /* * If the request is larger than the
available number @@ -812,6 +810,22 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, spin_unlock_irq(&cma->lock); break; } + + pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); + page = pfn_to_page(pfn); + + /* + * Do not hand out page ranges that are not contiguous, so + * callers can just iterate the pages without having to worry + * about these corner cases. + */ + if (!page_range_contiguous(page, count)) { + spin_unlock_irq(&cma->lock); + pr_warn_ratelimited("%s: %s: skipping incompatible area [0x%lx-0x%lx]", + __func__, cma->name, pfn, pfn + count - 1); + continue; + } + bitmap_set(cmr->bitmap, bitmap_no, bitmap_count); cma->available_count -= count; /* @@ -821,29 +835,24 @@ static int cma_range_alloc(struct cma *cma, struct cma_memrange *cmr, */ spin_unlock_irq(&cma->lock); - pfn = cmr->base_pfn + (bitmap_no << cma->order_per_bit); mutex_lock(&cma->alloc_mutex); ret = alloc_contig_range(pfn, pfn + count, ACR_FLAGS_CMA, gfp); mutex_unlock(&cma->alloc_mutex); - if (ret == 0) { - page = pfn_to_page(pfn); + if (!ret) break; - } cma_clear_bitmap(cma, cmr, pfn, count); if (ret != -EBUSY) break; pr_debug("%s(): memory range at pfn 0x%lx %p is busy, retrying\n", - __func__, pfn, pfn_to_page(pfn)); + __func__, pfn, page); - trace_cma_alloc_busy_retry(cma->name, pfn, pfn_to_page(pfn), - count, align); - /* try again with a bit different memory target */ - start = bitmap_no + mask + 1; + trace_cma_alloc_busy_retry(cma->name, pfn, page, count, align); } out: - *pagep = page; + if (!ret) + *pagep = page; return ret; } @@ -882,7 +891,7 @@ static struct page *__cma_alloc(struct cma *cma, unsigned long count, */ if (page) { for (i = 0; i < count; i++) - page_kasan_tag_reset(nth_page(page, i)); + page_kasan_tag_reset(page + i); } if (ret && !(gfp & __GFP_NOWARN)) { diff --git a/mm/util.c b/mm/util.c index d235b74f7aff..4b9d40c71286 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1281,3 +1281,38 @@ unsigned int folio_pte_batch(struct folio *folio, pte_t *ptep, pte_t pte, return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, 0); } #endif /* CONFIG_MMU */ + +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) +/** + * page_range_contiguous - test whether the page range is contiguous + * @page: the start of the page range. + * @nr_pages: the number of pages in the range. + * + * Test whether the page range is contiguous, such that they can be iterated + * naively, corresponding to iterating a contiguous PFN range. + * + * This function should primarily only be used for debug checks, or when + * working with page ranges that are not naturally contiguous (e.g., pages + * within a folio are). + * + * Returns true if contiguous, otherwise false. + */ +bool page_range_contiguous(const struct page *page, unsigned long nr_pages) +{ + const unsigned long start_pfn = page_to_pfn(page); + const unsigned long end_pfn = start_pfn + nr_pages; + unsigned long pfn; + + /* + * The memmap is allocated per memory section, so no need to check + * within the first section. However, we need to check each other + * spanned memory section once, making sure the first page in a + * section could similarly be reached by just iterating pages. 
+ */ + for (pfn = ALIGN(start_pfn, PAGES_PER_SECTION); + pfn < end_pfn; pfn += PAGES_PER_SECTION) + if (unlikely(page + (pfn - start_pfn) != pfn_to_page(pfn))) + return false; + return true; +} +#endif From a16c46c2402026162111ed9fd1fc28d25223443e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:44 +0200 Subject: [PATCH 208/372] dma-remap: drop nth_page() in dma_common_contiguous_remap() dma_common_contiguous_remap() is used to remap an "allocated contiguous region". Within a single allocation, there is no need to use nth_page() anymore. Neither the buddy, nor hugetlb, nor CMA will hand out problematic page ranges. Link: https://lkml.kernel.org/r/20250901150359.867252-24-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Marek Szyprowski Reviewed-by: Lorenzo Stoakes Cc: Robin Murphy Signed-off-by: Andrew Morton --- kernel/dma/remap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 9e2afad1c615..b7c1c0c92d0c 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c @@ -49,7 +49,7 @@ void *dma_common_contiguous_remap(struct page *page, size_t size, if (!pages) return NULL; for (i = 0; i < count; i++) - pages[i] = nth_page(page, i); + pages[i] = page++; vaddr = vmap(pages, count, VM_DMA_COHERENT, prot); kvfree(pages); From 80e7bb74d4ff24725f0ddb1c72d8de45a3d975f6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:45 +0200 Subject: [PATCH 209/372] scatterlist: disallow non-contiguous page ranges in a single SG entry The expectation is that there is currently no user that would pass in non-contiguous page ranges: no allocator, not even VMA, will hand these out. The only problematic case would be if someone provided a range obtained directly from memblock, or manually merged problematic ranges. If we find such cases, we should fix them to create separate SG entries. Let's check in sg_set_page() that this is really the case. No need to check in sg_set_folio(), as pages in a folio are guaranteed to be contiguous. As sg_set_page() gets inlined into modules, we have to export the page_range_contiguous() helper -- use EXPORT_SYMBOL, there is nothing special about this helper such that we would want to enforce GPL-only modules. We can now drop the nth_page() usage in sg_page_iter_page().
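To make the suggested fix concrete, here is a hypothetical kernel-style sketch (sg_set_page_range() does not exist in the tree): a caller holding a PFN-contiguous range could emit one SG entry per memmap-contiguous chunk using the page_range_contiguous() helper introduced earlier in this series. It assumes page-aligned data, chunk lengths that fit in an SG entry, and an @sgl with enough entries:

/* hypothetical helper, sketch only */
static void sg_set_page_range(struct scatterlist *sgl, struct page *page,
			      unsigned long nr_pages)
{
	struct scatterlist *sg = sgl;

	while (nr_pages) {
		unsigned long chunk = nr_pages;

		/*
		 * Shrink the chunk until its memmap is contiguous; a
		 * single page always is, so this terminates.
		 */
		while (!page_range_contiguous(page, chunk))
			chunk = DIV_ROUND_UP(chunk, 2);

		sg_set_page(sg, page, chunk * PAGE_SIZE, 0);
		page = pfn_to_page(page_to_pfn(page) + chunk);
		nr_pages -= chunk;
		if (nr_pages)
			sg = sg_next(sg);
	}
	sg_mark_end(sg);
}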
Link: https://lkml.kernel.org/r/20250901150359.867252-25-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Marek Szyprowski Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/scatterlist.h | 3 ++- mm/util.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 6f8a4965f9b9..29f6ceb98d74 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -158,6 +158,7 @@ static inline void sg_assign_page(struct scatterlist *sg, struct page *page) static inline void sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, unsigned int offset) { + VM_WARN_ON_ONCE(!page_range_contiguous(page, ALIGN(len + offset, PAGE_SIZE) / PAGE_SIZE)); sg_assign_page(sg, page); sg->offset = offset; sg->length = len; @@ -600,7 +601,7 @@ void __sg_page_iter_start(struct sg_page_iter *piter, */ static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) { - return nth_page(sg_page(piter->sg), piter->sg_pgoffset); + return sg_page(piter->sg) + piter->sg_pgoffset; } /** diff --git a/mm/util.c b/mm/util.c index 4b9d40c71286..e29d3310e26b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1315,4 +1315,5 @@ bool page_range_contiguous(const struct page *page, unsigned long nr_pages) return false; return true; } +EXPORT_SYMBOL(page_range_contiguous); #endif From 70aa902651e8aa596f23d6f0d4c8b21a08e93cce Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:46 +0200 Subject: [PATCH 210/372] ata: libata-sff: drop nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Link: https://lkml.kernel.org/r/20250901150359.867252-26-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Damien Le Moal Reviewed-by: Lorenzo Stoakes Cc: Niklas Cassel Signed-off-by: Andrew Morton --- drivers/ata/libata-sff.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 7fc407255eb4..1e2a2c33cdc8 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -614,7 +614,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) offset = qc->cursg->offset + qc->cursg_ofs; /* get the current page and offset */ - page = nth_page(page, (offset >> PAGE_SHIFT)); + page += offset >> PAGE_SHIFT; offset %= PAGE_SIZE; /* don't overrun current sg */ @@ -631,7 +631,7 @@ static void ata_pio_sector(struct ata_queued_cmd *qc) unsigned int split_len = PAGE_SIZE - offset; ata_pio_xfer(qc, page, offset, split_len); - ata_pio_xfer(qc, nth_page(page, 1), 0, count - split_len); + ata_pio_xfer(qc, page + 1, 0, count - split_len); } else { ata_pio_xfer(qc, page, offset, count); } @@ -751,7 +751,7 @@ next_sg: offset = sg->offset + qc->cursg_ofs; /* get the current page and offset */ - page = nth_page(page, (offset >> PAGE_SHIFT)); + page += offset >> PAGE_SHIFT; offset %= PAGE_SIZE; /* don't overrun current sg */ From f3dd22376e02b0541bac695b77aaf97130a7c899 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:47 +0200 Subject: [PATCH 211/372] drm/i915/gem: drop nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. 
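The pattern shared by these SG-entry conversions can be summarized in one hedged sketch (illustrative helper name, not an existing API): given a byte offset into a single SG entry, the backing page follows from plain pointer arithmetic, because sg_set_page() now guarantees the entry's pages are contiguous in the memmap.

/* sketch: the page backing byte @off within a single SG entry */
static inline struct page *sg_entry_page(struct scatterlist *sg,
					 unsigned int off)
{
	unsigned int total = sg->offset + off;	/* offset from sg_page() */

	return sg_page(sg) + (total >> PAGE_SHIFT);	/* was nth_page() */
}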
Link: https://lkml.kernel.org/r/20250901150359.867252-27-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Tvrtko Ursulin Cc: David Airlie Cc: Simona Vetter Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/i915_gem_pages.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c index c16a57160b26..031d7acc1614 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c @@ -779,7 +779,7 @@ __i915_gem_object_get_page(struct drm_i915_gem_object *obj, pgoff_t n) GEM_BUG_ON(!i915_gem_object_has_struct_page(obj)); sg = i915_gem_object_get_sg(obj, n, &offset); - return nth_page(sg_page(sg), offset); + return sg_page(sg) + offset; } /* Like i915_gem_object_get_page(), but mark the returned page dirty */ From 58f2c185839756b690370f5e536e629c7c7b0aac Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:48 +0200 Subject: [PATCH 212/372] mspro_block: drop nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Link: https://lkml.kernel.org/r/20250901150359.867252-28-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Ulf Hansson Reviewed-by: Lorenzo Stoakes Cc: Maxim Levitsky Cc: Alex Dubov Signed-off-by: Andrew Morton --- drivers/memstick/core/mspro_block.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c9853d887d28..d3f160dc0da4 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -560,8 +560,7 @@ has_int_reg: t_offset += msb->current_page * msb->page_size; sg_set_page(&t_sg, - nth_page(sg_page(&(msb->req_sg[msb->current_seg])), - t_offset >> PAGE_SHIFT), + sg_page(&(msb->req_sg[msb->current_seg])) + (t_offset >> PAGE_SHIFT), msb->page_size, offset_in_page(t_offset)); memstick_init_req_sg(*mrq, msb->data_dir == READ From a1f4c374cc09789c8ed89b77b45d31e3f15a5685 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:49 +0200 Subject: [PATCH 213/372] memstick: drop nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. 
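An aside, not from the patch: these host drivers all clamp each transfer chunk to the current page before advancing with a plain `pg + 1`. The clamping math, with made-up numbers, in plain C:

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define offset_in_page(o)	((o) & (PAGE_SIZE - 1))
#define MIN(a, b)		((a) < (b) ? (a) : (b))

int main(void)
{
	unsigned long off = 2 * PAGE_SIZE + 4000;	/* position in the SG entry */
	unsigned long length = 500;			/* bytes left in the request */

	unsigned long p_off = offset_in_page(off);		/* 4000 */
	unsigned long p_cnt = MIN(PAGE_SIZE - p_off, length);	/* 96 */

	/* after these 96 bytes, the driver simply advances to page 3 */
	printf("p_off=%lu p_cnt=%lu next_page=%lu\n",
	       p_off, p_cnt, off / PAGE_SIZE + 1);
	return 0;
}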
Link: https://lkml.kernel.org/r/20250901150359.867252-29-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Ulf Hansson Reviewed-by: Lorenzo Stoakes Cc: Maxim Levitsky Cc: Alex Dubov Signed-off-by: Andrew Morton --- drivers/memstick/host/jmb38x_ms.c | 3 +-- drivers/memstick/host/tifm_ms.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/memstick/host/jmb38x_ms.c b/drivers/memstick/host/jmb38x_ms.c index cddddb3a5a27..79e66e30417c 100644 --- a/drivers/memstick/host/jmb38x_ms.c +++ b/drivers/memstick/host/jmb38x_ms.c @@ -317,8 +317,7 @@ static int jmb38x_ms_transfer_data(struct jmb38x_ms_host *host) unsigned int p_off; if (host->req->long_data) { - pg = nth_page(sg_page(&host->req->sg), - off >> PAGE_SHIFT); + pg = sg_page(&host->req->sg) + (off >> PAGE_SHIFT); p_off = offset_in_page(off); p_cnt = PAGE_SIZE - p_off; p_cnt = min(p_cnt, length); diff --git a/drivers/memstick/host/tifm_ms.c b/drivers/memstick/host/tifm_ms.c index db7f3a088fb0..0b6a90661eee 100644 --- a/drivers/memstick/host/tifm_ms.c +++ b/drivers/memstick/host/tifm_ms.c @@ -201,8 +201,7 @@ static unsigned int tifm_ms_transfer_data(struct tifm_ms *host) unsigned int p_off; if (host->req->long_data) { - pg = nth_page(sg_page(&host->req->sg), - off >> PAGE_SHIFT); + pg = sg_page(&host->req->sg) + (off >> PAGE_SHIFT); p_off = offset_in_page(off); p_cnt = PAGE_SIZE - p_off; p_cnt = min(p_cnt, length); From 727fd054a01aaecb63151b3ffdacb0a0437453bf Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:50 +0200 Subject: [PATCH 214/372] mmc: drop nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Link: https://lkml.kernel.org/r/20250901150359.867252-30-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Ulf Hansson Reviewed-by: Lorenzo Stoakes Cc: Alex Dubov Cc: Jesper Nilsson Cc: Lars Persson Signed-off-by: Andrew Morton --- drivers/mmc/host/tifm_sd.c | 4 ++-- drivers/mmc/host/usdhi6rol0.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/tifm_sd.c b/drivers/mmc/host/tifm_sd.c index ac636efd911d..2cd69c9e9571 100644 --- a/drivers/mmc/host/tifm_sd.c +++ b/drivers/mmc/host/tifm_sd.c @@ -191,7 +191,7 @@ static void tifm_sd_transfer_data(struct tifm_sd *host) } off = sg[host->sg_pos].offset + host->block_pos; - pg = nth_page(sg_page(&sg[host->sg_pos]), off >> PAGE_SHIFT); + pg = sg_page(&sg[host->sg_pos]) + (off >> PAGE_SHIFT); p_off = offset_in_page(off); p_cnt = PAGE_SIZE - p_off; p_cnt = min(p_cnt, cnt); @@ -240,7 +240,7 @@ static void tifm_sd_bounce_block(struct tifm_sd *host, struct mmc_data *r_data) } off = sg[host->sg_pos].offset + host->block_pos; - pg = nth_page(sg_page(&sg[host->sg_pos]), off >> PAGE_SHIFT); + pg = sg_page(&sg[host->sg_pos]) + (off >> PAGE_SHIFT); p_off = offset_in_page(off); p_cnt = PAGE_SIZE - p_off; p_cnt = min(p_cnt, cnt); diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c index 85b49c07918b..3bccf800339b 100644 --- a/drivers/mmc/host/usdhi6rol0.c +++ b/drivers/mmc/host/usdhi6rol0.c @@ -323,7 +323,7 @@ static void usdhi6_blk_bounce(struct usdhi6_host *host, host->head_pg.page = host->pg.page; host->head_pg.mapped = host->pg.mapped; - host->pg.page = nth_page(host->pg.page, 1); + host->pg.page = host->pg.page + 1; host->pg.mapped = kmap(host->pg.page); host->blk_page = host->bounce_buf; @@ -503,7 +503,7 @@ static void usdhi6_sg_advance(struct usdhi6_host *host) /* We cannot 
get here after crossing a page border */ /* Next page in the same SG */ - host->pg.page = nth_page(sg_page(host->sg), host->page_idx); + host->pg.page = sg_page(host->sg) + host->page_idx; host->pg.mapped = kmap(host->pg.page); host->blk_page = host->pg.mapped; From 9b6024fa7616bf3b9497c871611f39e2b2893de2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:51 +0200 Subject: [PATCH 215/372] scsi: scsi_lib: drop nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Link: https://lkml.kernel.org/r/20250901150359.867252-31-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Bart Van Assche Reviewed-by: Lorenzo Stoakes Reviewed-by: Martin K. Petersen Cc: "James E.J. Bottomley" Signed-off-by: Andrew Morton --- drivers/scsi/scsi_lib.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0c65ecfedfbd..d7e42293b864 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -3148,8 +3148,7 @@ void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count, /* Offset starting from the beginning of first page in this sg-entry */ *offset = *offset - len_complete + sg->offset; - /* Assumption: contiguous pages can be accessed as "page + i" */ - page = nth_page(sg_page(sg), (*offset >> PAGE_SHIFT)); + page = sg_page(sg) + (*offset >> PAGE_SHIFT); *offset &= ~PAGE_MASK; /* Bytes in this sg-entry from *offset to the end of the page */ From d66ff3db89997a916b26c2752d4ddd00772613d6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:52 +0200 Subject: [PATCH 216/372] scsi: sg: drop nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Link: https://lkml.kernel.org/r/20250901150359.867252-32-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Bart Van Assche Reviewed-by: Lorenzo Stoakes Reviewed-by: Martin K. Petersen Cc: Doug Gilbert Cc: "James E.J. Bottomley" Signed-off-by: Andrew Morton --- drivers/scsi/sg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 3c02a5f7b5f3..4c62c597c7be 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1235,8 +1235,7 @@ sg_vma_fault(struct vm_fault *vmf) len = vma->vm_end - sa; len = (len < length) ? len : length; if (offset < len) { - struct page *page = nth_page(rsv_schp->pages[k], - offset >> PAGE_SHIFT); + struct page *page = rsv_schp->pages[k] + (offset >> PAGE_SHIFT); get_page(page); /* increment page count */ vmf->page = page; return 0; /* success */ From fae6406bca03688bde691ef7f2605165d6b4f9b8 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:53 +0200 Subject: [PATCH 217/372] vfio/pci: drop nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. 
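Both drivers implement the same lookup; as a hedged sketch (illustrative helper name, not an existing API), assuming page-aligned SG entries (sg->offset == 0), as these migration buffers use:

/* sketch: find the page backing byte @offset of an SG table */
static struct page *sgt_page_at(struct sg_table *sgt, unsigned long offset)
{
	struct scatterlist *sg;
	unsigned long cur = 0;
	unsigned int i;

	for_each_sgtable_sg(sgt, sg, i) {
		if (offset < cur + sg->length)
			/* contiguous within the entry: plain indexing */
			return sg_page(sg) + (offset - cur) / PAGE_SIZE;
		cur += sg->length;
	}
	return NULL;
}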
Link: https://lkml.kernel.org/r/20250901150359.867252-33-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Reviewed-by: Alex Williamson Reviewed-by: Brett Creeley Cc: Jason Gunthorpe Cc: Yishai Hadas Cc: Shameer Kolothum Cc: Kevin Tian Signed-off-by: Andrew Morton --- drivers/vfio/pci/pds/lm.c | 3 +-- drivers/vfio/pci/virtio/migrate.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c index f2673d395236..4d70c833fa32 100644 --- a/drivers/vfio/pci/pds/lm.c +++ b/drivers/vfio/pci/pds/lm.c @@ -151,8 +151,7 @@ static struct page *pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file, lm_file->last_offset_sg = sg; lm_file->sg_last_entry += i; lm_file->last_offset = cur_offset; - return nth_page(sg_page(sg), - (offset - cur_offset) / PAGE_SIZE); + return sg_page(sg) + (offset - cur_offset) / PAGE_SIZE; } cur_offset += sg->length; } diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index ba92bb4e9af9..7dd0ac866461 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -53,8 +53,7 @@ virtiovf_get_migration_page(struct virtiovf_data_buffer *buf, buf->last_offset_sg = sg; buf->sg_last_entry += i; buf->last_offset = cur_offset; - return nth_page(sg_page(sg), - (offset - cur_offset) / PAGE_SIZE); + return sg_page(sg) + (offset - cur_offset) / PAGE_SIZE; } cur_offset += sg->length; } From ce00897b94bc5c62fab962625efcf1ab824d3688 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:54 +0200 Subject: [PATCH 218/372] crypto: remove nth_page() usage within SG entry It's no longer required to use nth_page() when iterating pages within a single SG entry, so let's drop the nth_page() usage. Link: https://lkml.kernel.org/r/20250901150359.867252-34-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Acked-by: Herbert Xu Cc: "David S. 
Miller" Signed-off-by: Andrew Morton --- crypto/ahash.c | 4 ++-- crypto/scompress.c | 8 ++++---- include/crypto/scatterwalk.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/crypto/ahash.c b/crypto/ahash.c index a227793d2c5b..dfb4f5476428 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -88,7 +88,7 @@ static int hash_walk_new_entry(struct crypto_hash_walk *walk) sg = walk->sg; walk->offset = sg->offset; - walk->pg = nth_page(sg_page(walk->sg), (walk->offset >> PAGE_SHIFT)); + walk->pg = sg_page(walk->sg) + (walk->offset >> PAGE_SHIFT); walk->offset = offset_in_page(walk->offset); walk->entrylen = sg->length; @@ -226,7 +226,7 @@ int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) if (!IS_ENABLED(CONFIG_HIGHMEM)) return crypto_shash_digest(desc, data, nbytes, req->result); - page = nth_page(page, offset >> PAGE_SHIFT); + page += offset >> PAGE_SHIFT; offset = offset_in_page(offset); if (nbytes > (unsigned int)PAGE_SIZE - offset) diff --git a/crypto/scompress.c b/crypto/scompress.c index c651e7f2197a..1a7ed8ae65b0 100644 --- a/crypto/scompress.c +++ b/crypto/scompress.c @@ -198,7 +198,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) } else return -ENOSYS; - dpage = nth_page(dpage, doff / PAGE_SIZE); + dpage += doff / PAGE_SIZE; doff = offset_in_page(doff); n = (dlen - 1) / PAGE_SIZE; @@ -220,12 +220,12 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) } else break; - spage = nth_page(spage, soff / PAGE_SIZE); + spage = spage + soff / PAGE_SIZE; soff = offset_in_page(soff); n = (slen - 1) / PAGE_SIZE; n += (offset_in_page(slen - 1) + soff) / PAGE_SIZE; - if (PageHighMem(nth_page(spage, n)) && + if (PageHighMem(spage + n) && size_add(soff, slen) > PAGE_SIZE) break; src = kmap_local_page(spage) + soff; @@ -270,7 +270,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) if (dlen <= PAGE_SIZE) break; dlen -= PAGE_SIZE; - dpage = nth_page(dpage, 1); + dpage++; } } diff --git a/include/crypto/scatterwalk.h b/include/crypto/scatterwalk.h index 15ab743f68c8..83d14376ff2b 100644 --- a/include/crypto/scatterwalk.h +++ b/include/crypto/scatterwalk.h @@ -159,7 +159,7 @@ static inline void scatterwalk_map(struct scatter_walk *walk) if (IS_ENABLED(CONFIG_HIGHMEM)) { struct page *page; - page = nth_page(base_page, offset >> PAGE_SHIFT); + page = base_page + (offset >> PAGE_SHIFT); offset = offset_in_page(offset); addr = kmap_local_page(page) + offset; } else { @@ -259,7 +259,7 @@ static inline void scatterwalk_done_dst(struct scatter_walk *walk, end += (offset_in_page(offset) + offset_in_page(nbytes) + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = start; i < end; i++) - flush_dcache_page(nth_page(base_page, i)); + flush_dcache_page(base_page + i); } scatterwalk_advance(walk, nbytes); } From b5ba761a7f5612759770117657577925fcb2e668 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:55 +0200 Subject: [PATCH 219/372] mm/gup: drop nth_page() usage in unpin_user_page_range_dirty_lock() There is the concern that unpin_user_page_range_dirty_lock() might do some weird merging of PFN ranges -- either now or in the future -- such that PFN range is contiguous but the page range might not be. Let's sanity-check for that and drop the nth_page() usage. 
Link: https://lkml.kernel.org/r/20250901150359.867252-35-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/gup.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index b8aee964421c..83438bbbf2f6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -237,7 +237,7 @@ void folio_add_pin(struct folio *folio) static inline struct folio *gup_folio_range_next(struct page *start, unsigned long npages, unsigned long i, unsigned int *ntails) { - struct page *next = nth_page(start, i); + struct page *next = start + i; struct folio *folio = page_folio(next); unsigned int nr = 1; @@ -342,6 +342,10 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock); * "gup-pinned page range" refers to a range of pages that has had one of the * pin_user_pages() variants called on that page. * + * The page range must be truly physically contiguous: the page range + * corresponds to a contiguous PFN range and all pages can be iterated + * naturally. + * * For the page ranges defined by [page .. page+npages], make that range (or * its head pages, if a compound page) dirty, if @make_dirty is true, and if the * page range was previously listed as clean. @@ -359,6 +363,8 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, struct folio *folio; unsigned int nr; + VM_WARN_ON_ONCE(!page_range_contiguous(page, npages)); + for (i = 0; i < npages; i += nr) { folio = gup_folio_range_next(page, npages, i, &nr); if (make_dirty && !folio_test_dirty(folio)) { From 56531761d4b04ea46de04f7ddab0cdc9cd1a35e1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:56 +0200 Subject: [PATCH 220/372] kfence: drop nth_page() usage We want to get rid of nth_page(), and kfence init code is the last user. Unfortunately, we might actually walk a PFN range where the pages are not contiguous, because we might be allocating an area from memblock that could span memory sections in problematic kernel configs (SPARSEMEM without SPARSEMEM_VMEMMAP). We could check whether the page range is contiguous using page_range_contiguous() and fail kfence init, or make kfence incompatible with these problematic kernel configs. Let's keep it simple and just use pfn_to_page() by iterating PFNs. Link: https://lkml.kernel.org/r/20250901150359.867252-36-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Marco Elver Reviewed-by: Lorenzo Stoakes Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kfence/core.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 0ed3be100963..727c20c94ac5 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -594,15 +594,14 @@ static void rcu_guarded_free(struct rcu_head *h) */ static unsigned long kfence_init_pool(void) { - unsigned long addr; - struct page *pages; + unsigned long addr, start_pfn; int i; if (!arch_kfence_init_pool()) return (unsigned long)__kfence_pool; addr = (unsigned long)__kfence_pool; - pages = virt_to_page(__kfence_pool); + start_pfn = PHYS_PFN(virt_to_phys(__kfence_pool)); /* * Set up object pages: they must have PGTY_slab set to avoid freeing * enters __slab_free() slow-path.
*/ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab = page_slab(nth_page(pages, i)); + struct slab *slab; if (!i || (i % 2)) continue; + slab = page_slab(pfn_to_page(start_pfn + i)); __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG slab->obj_exts = (unsigned long)&kfence_metadata_init[i / 2 - 1].obj_exts | @@ -665,10 +665,12 @@ static unsigned long kfence_init_pool(void) reset_slab: for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) { - struct slab *slab = page_slab(nth_page(pages, i)); + struct slab *slab; if (!i || (i % 2)) continue; + + slab = page_slab(pfn_to_page(start_pfn + i)); #ifdef CONFIG_MEMCG slab->obj_exts = 0; #endif From d5170ce4d71b3843613ee1840bca50ad71c3671e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:57 +0200 Subject: [PATCH 221/372] block: update comment of "struct bio_vec" regarding nth_page() Ever since commit 858c708d9efb ("block: move the bi_size update out of __bio_try_merge_page"), page_is_mergeable() no longer exists, and the logic in bvec_try_merge_page() is now a simple page pointer comparison. Link: https://lkml.kernel.org/r/20250901150359.867252-37-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/bvec.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 0a80e1f9aa20..3fc0efa0825b 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -22,11 +22,8 @@ struct page; * @bv_len: Number of bytes in the address range. * @bv_offset: Start of the address range relative to the start of @bv_page. * - * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len: - * - * nth_page(@bv_page, n) == @bv_page + n - * - * This holds because page_is_mergeable() checks the above property. + * All pages within a bio_vec starting from @bv_page are contiguous and + * can simply be iterated (see bvec_advance()). */ struct bio_vec { struct page *bv_page; From 84efbefa26df36a845a9210ee962aa6866f99bb7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 1 Sep 2025 17:03:58 +0200 Subject: [PATCH 222/372] mm: remove nth_page() Now that all users are gone, let's remove it. 
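As a recap of what the removal below means in practice (illustrative comment, not part of the patch): on SPARSEMEM without SPARSEMEM_VMEMMAP, nth_page() had to translate through the PFN because the memmap may be virtually discontiguous across section boundaries; everywhere else it was plain pointer arithmetic:

	/* What nth_page(page, n) used to expand to: */
	pfn_to_page(page_to_pfn(page) + n)	/* SPARSEMEM && !SPARSEMEM_VMEMMAP */
	page + n				/* all other memory models */

All remaining callers now operate on ranges known to be contiguous, so "page + n" is always correct for them.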
Link: https://lkml.kernel.org/r/20250901150359.867252-38-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- tools/testing/scatterlist/linux/mm.h | 1 - 2 files changed, 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca1eb2db63e..b26ca8b2162d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -210,9 +210,7 @@ extern unsigned long sysctl_admin_reserve_kbytes; #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) bool page_range_contiguous(const struct page *page, unsigned long nr_pages); -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #else -#define nth_page(page,n) ((page) + (n)) static inline bool page_range_contiguous(const struct page *page, unsigned long nr_pages) { diff --git a/tools/testing/scatterlist/linux/mm.h b/tools/testing/scatterlist/linux/mm.h index 5bd9e6e80625..121ae78d6e88 100644 --- a/tools/testing/scatterlist/linux/mm.h +++ b/tools/testing/scatterlist/linux/mm.h @@ -51,7 +51,6 @@ static inline unsigned long page_to_phys(struct page *page) #define page_to_pfn(page) ((unsigned long)(page) / PAGE_SIZE) #define pfn_to_page(pfn) (void *)((pfn) * PAGE_SIZE) -#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define __min(t1, t2, min1, min2, x, y) ({ \ t1 min1 = (x); \ From 31d8edb535bd6b387c55650c245b09fcfeaef768 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Tue, 16 Sep 2025 23:27:54 +0100 Subject: [PATCH 223/372] kasan/hw-tags: introduce kasan.write_only option Patch series "introduce kasan.write_only option in hw-tags", v8. Hardware tag based KASAN is implemented using the Memory Tagging Extension (MTE) feature. MTE is built on top of the ARMv8.0 virtual address tagging TBI (Top Byte Ignore) feature and allows software to access a 4-bit allocation tag for each 16-byte granule in the physical address space. A logical tag is derived from bits 59-56 of the virtual address used for the memory access. A CPU with MTE enabled will compare the logical tag against the allocation tag and potentially raise a tag check fault on mismatch, subject to system register configuration. Since ARMv8.9, FEAT_MTE_STORE_ONLY can be used to restrict tag check faults to store operations only. Using this feature (FEAT_MTE_STORE_ONLY), introduce a KASAN write-only mode which restricts KASAN checks to write (store) operations only. This mode omits KASAN checks for read (fetch/load) operations. Therefore, it might be used not only for debugging purposes but also in normal environments. This patch (of 2): Since Armv8.9, the FEAT_MTE_STORE_ONLY feature is available to restrict tag check faults to store operations only. Introduce a KASAN write-only mode based on this feature. The KASAN write-only mode restricts KASAN checks to write operations and omits the checks for fetch/read operations when accessing memory. So it might be used not only in debugging environments but also in normal environments to check memory safety. This feature can be controlled with the "kasan.write_only" argument. When "kasan.write_only=on", KASAN checks write operations only; otherwise KASAN checks all operations. This changes the MTE_STORE_ONLY feature to a BOOT_CPU_FEATURE, like ARM64_MTE_ASYMM, so that it is initialised in kasan_init_hw_tags() together with the other checks.
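As a usage sketch (not part of the patch): a hw-tags kernel on FEAT_MTE_STORE_ONLY hardware could be booted with

	kasan=on kasan.mode=sync kasan.write_only=on

and would then be expected to behave roughly like this:

	char *p = kmalloc(64, GFP_KERNEL);
	char c;

	kfree(p);
	c = READ_ONCE(p[0]);	/* bad read:  not checked, no tag check fault */
	WRITE_ONCE(p[0], 'x');	/* bad write: still raises a tag check fault */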
Link: https://lkml.kernel.org/r/20250916222755.466009-1-yeoreum.yun@arm.com Link: https://lkml.kernel.org/r/20250916222755.466009-2-yeoreum.yun@arm.com Signed-off-by: Yeoreum Yun Reviewed-by: Catalin Marinas Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Ard Biesheuvel Cc: Breno Leitao Cc: David Hildenbrand Cc: Dmitriy Vyukov Cc: D Scott Phillips Cc: Hardevsinh Palaniya Cc: James Morse Cc: John Hubbard Cc: Jonathan Corbet Cc: Kalesh Singh Cc: levi.yun Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Pankaj Gupta Cc: Vincenzo Frascino Cc: Will Deacon Cc: Yang Shi Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 3 ++ arch/arm64/include/asm/memory.h | 1 + arch/arm64/include/asm/mte-kasan.h | 6 ++++ arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/kernel/mte.c | 18 ++++++++++++ mm/kasan/hw_tags.c | 45 ++++++++++++++++++++++++++++-- mm/kasan/kasan.h | 7 +++++ 7 files changed, 79 insertions(+), 3 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 0a1418ab72fd..a034700da7c4 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -143,6 +143,9 @@ disabling KASAN altogether or controlling its features: Asymmetric mode: a bad access is detected synchronously on reads and asynchronously on writes. +- ``kasan.write_only=off`` or ``kasan.write_only=on`` controls whether KASAN + checks the write (store) accesses only or all accesses (default: ``off``). + - ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc allocations (default: ``on``). diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 5213248e081b..f1505c4acb38 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -308,6 +308,7 @@ static inline const void *__tag_set(const void *addr, u8 tag) #define arch_enable_tag_checks_sync() mte_enable_kernel_sync() #define arch_enable_tag_checks_async() mte_enable_kernel_async() #define arch_enable_tag_checks_asymm() mte_enable_kernel_asymm() +#define arch_enable_tag_checks_write_only() mte_enable_kernel_store_only() #define arch_suppress_tag_checks_start() mte_enable_tco() #define arch_suppress_tag_checks_stop() mte_disable_tco() #define arch_force_async_tag_fault() mte_check_tfsr_exit() diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h index 2e98028c1965..0f9b08e8fb8d 100644 --- a/arch/arm64/include/asm/mte-kasan.h +++ b/arch/arm64/include/asm/mte-kasan.h @@ -200,6 +200,7 @@ static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag, void mte_enable_kernel_sync(void); void mte_enable_kernel_async(void); void mte_enable_kernel_asymm(void); +int mte_enable_kernel_store_only(void); #else /* CONFIG_ARM64_MTE */ @@ -251,6 +252,11 @@ static inline void mte_enable_kernel_asymm(void) { } +static inline int mte_enable_kernel_store_only(void) +{ + return -EINVAL; +} + #endif /* CONFIG_ARM64_MTE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index ef269a5a37e1..1f6e8c87aae7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2945,7 +2945,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "Store Only MTE Tag Check", .capability = ARM64_MTE_STORE_ONLY, - .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .type = ARM64_CPUCAP_BOOT_CPU_FEATURE, .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, MTESTOREONLY, IMP) }, diff --git 
a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index e5e773844889..54a52dc5c1ae 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -157,6 +157,24 @@ void mte_enable_kernel_asymm(void) mte_enable_kernel_sync(); } } + +int mte_enable_kernel_store_only(void) +{ + /* + * If the CPU does not support MTE store only, + * the kernel checks all operations. + */ + if (!cpus_have_cap(ARM64_MTE_STORE_ONLY)) + return -EINVAL; + + sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCSO_MASK, + SYS_FIELD_PREP(SCTLR_EL1, TCSO, 1)); + isb(); + + pr_info_once("MTE: enabled store only mode at EL1\n"); + + return 0; +} #endif #ifdef CONFIG_KASAN_HW_TAGS diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index c8289a3feabf..1c373cc4b3fa 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -60,6 +60,9 @@ DEFINE_STATIC_KEY_FALSE(kasan_flag_vmalloc); #endif EXPORT_SYMBOL_GPL(kasan_flag_vmalloc); +/* Whether to check write accesses only. */ +static bool kasan_flag_write_only = false; + #define PAGE_ALLOC_SAMPLE_DEFAULT 1 #define PAGE_ALLOC_SAMPLE_ORDER_DEFAULT 3 @@ -134,6 +137,23 @@ static int __init early_kasan_flag_vmalloc(char *arg) } early_param("kasan.vmalloc", early_kasan_flag_vmalloc); +/* kasan.write_only=off/on */ +static int __init early_kasan_flag_write_only(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_flag_write_only = false; + else if (!strcmp(arg, "on")) + kasan_flag_write_only = true; + else + return -EINVAL; + + return 0; +} +early_param("kasan.write_only", early_kasan_flag_write_only); + static inline const char *kasan_mode_info(void) { if (kasan_mode == KASAN_MODE_ASYNC) @@ -255,10 +275,11 @@ void __init kasan_init_hw_tags(void) /* KASAN is now initialized, enable it. */ kasan_enable(); - pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s)\n", + pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, vmalloc=%s, stacktrace=%s, write_only=%s)\n", kasan_mode_info(), str_on_off(kasan_vmalloc_enabled()), - str_on_off(kasan_stack_collection_enabled())); + str_on_off(kasan_stack_collection_enabled()), + str_on_off(kasan_flag_write_only)); } #ifdef CONFIG_KASAN_VMALLOC @@ -385,6 +406,20 @@ void kasan_enable_hw_tags(void) hw_enable_tag_checks_asymm(); else hw_enable_tag_checks_sync(); + + /* + * CPUs can only be in one of two states: + * - All CPUs support the write_only feature + * - No CPUs support the write_only feature + * + * If the first CPU attempts hw_enable_tag_checks_write_only() and + * finds the feature unsupported, kasan_flag_write_only is set to OFF + * to avoid further unnecessary calls on other CPUs. 
+ */ if (kasan_flag_write_only && hw_enable_tag_checks_write_only()) { + kasan_flag_write_only = false; + pr_err_once("write-only mode is not supported and thus not enabled\n"); + } } #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) @@ -397,4 +432,10 @@ VISIBLE_IF_KUNIT void kasan_force_async_fault(void) } EXPORT_SYMBOL_IF_KUNIT(kasan_force_async_fault); +VISIBLE_IF_KUNIT bool kasan_write_only_enabled(void) +{ + return kasan_flag_write_only; +} +EXPORT_SYMBOL_IF_KUNIT(kasan_write_only_enabled); + #endif diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 8a9d8a6ea717..07fa7375a848 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -437,6 +437,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_suppress_tag_checks_start() arch_suppress_tag_checks_start() #define hw_suppress_tag_checks_stop() arch_suppress_tag_checks_stop() #define hw_force_async_tag_fault() arch_force_async_tag_fault() +#define hw_enable_tag_checks_write_only() arch_enable_tag_checks_write_only() #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) #define hw_set_mem_tag_range(addr, size, tag, init) \ @@ -457,11 +458,17 @@ void __init kasan_init_tags(void); #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_force_async_fault(void); +bool kasan_write_only_enabled(void); #else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ static inline void kasan_force_async_fault(void) { } +static inline bool kasan_write_only_enabled(void) +{ + return false; +} + #endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */ #ifdef CONFIG_KASAN_SW_TAGS From 2b79cb3eac3a2b841a2d66550c44e6e073dcd422 Mon Sep 17 00:00:00 2001 From: Yeoreum Yun Date: Tue, 16 Sep 2025 23:27:55 +0100 Subject: [PATCH 224/372] kasan: apply write-only mode in kasan kunit testcases When KASAN is configured in write-only mode, fetch/load operations do not trigger tag check faults. As a result, the outcome of some test cases may differ compared to when KASAN is configured without write-only mode. Therefore, modify the pre-existing testcases to check that write-only mode raises a tag check fault (TCF) where a write is performed on "allocated memory" but the tag is invalid (i.e. a redzone write in the atomic_set() testcases), and otherwise to check that an invalid fetch/read does not generate a TCF. Also, skip some testcases affected by the initial value: i.e. the atomic_cmpxchg() testcase may succeed if it is passed a valid atomic_t address and an invalid oldval address. In this case, if the invalid atomic_t does not hold the same oldval, the write operation won't be triggered, so the test will pass.
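In short, the new KUNIT_EXPECT_KASAN_FAIL_READ() macro introduced below encodes "this access must fault unless reads are unchecked", while bad writes keep using KUNIT_EXPECT_KASAN_FAIL() (illustrative, mirroring the conversions in the diff):

	/* A bad write must fault in every mode: */
	KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'x');

	/* A bad read faults only when write-only mode is off: */
	KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size]);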
Link: https://lkml.kernel.org/r/20250916222755.466009-3-yeoreum.yun@arm.com Signed-off-by: Yeoreum Yun Reviewed-by: Andrey Konovalov Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Ard Biesheuvel Cc: Breno Leitao Cc: Catalin Marinas Cc: David Hildenbrand Cc: Dmitriy Vyukov Cc: D Scott Phillips Cc: Hardevsinh Palaniya Cc: James Morse Cc: John Hubbard Cc: Jonathan Corbet Cc: Kalesh Singh Cc: Marc Zyngier Cc: Mark Brown Cc: Oliver Upton Cc: Pankaj Gupta Cc: Vincenzo Frascino Cc: Will Deacon Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_c.c | 205 ++++++++++++++++++++++++++-------------- 1 file changed, 136 insertions(+), 69 deletions(-) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 4cf2b5f8d6c1..2cafca31b092 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -94,11 +94,14 @@ static void kasan_test_exit(struct kunit *test) } /** - * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a - * KASAN report; causes a KUnit test failure otherwise. + * KUNIT_EXPECT_KASAN_RESULT - checks whether the executed expression + * produces a KASAN report; causes a KUnit test failure when the result + * is different from @fail. * * @test: Currently executing KUnit test. - * @expression: Expression that must produce a KASAN report. + * @expr: Expression to be tested. + * @expr_str: Expression to be tested encoded as a string. + * @fail: Whether expression should produce a KASAN report. * * For hardware tag-based KASAN, when a synchronous tag fault happens, tag * checking is auto-disabled. When this happens, this test handler reenables @@ -110,25 +113,29 @@ static void kasan_test_exit(struct kunit *test) * Use READ/WRITE_ONCE() for the accesses and compiler barriers around the * expression to prevent that. * - * In between KUNIT_EXPECT_KASAN_FAIL checks, test_status.report_found is kept + * In between KUNIT_EXPECT_KASAN_RESULT checks, test_status.report_found is kept * as false. This allows detecting KASAN reports that happen outside of the * checks by asserting !test_status.report_found at the start of - * KUNIT_EXPECT_KASAN_FAIL and in kasan_test_exit. + * KUNIT_EXPECT_KASAN_RESULT and in kasan_test_exit. */ -#define KUNIT_EXPECT_KASAN_FAIL(test, expression) do { \ +#define KUNIT_EXPECT_KASAN_RESULT(test, expr, expr_str, fail) \ +do { \ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ kasan_sync_fault_possible()) \ migrate_disable(); \ KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); \ barrier(); \ - expression; \ + expr; \ barrier(); \ if (kasan_async_fault_possible()) \ kasan_force_async_fault(); \ - if (!READ_ONCE(test_status.report_found)) { \ - KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure " \ - "expected in \"" #expression \ - "\", but none occurred"); \ + if (READ_ONCE(test_status.report_found) != fail) { \ + KUNIT_FAIL(test, KUNIT_SUBTEST_INDENT "KASAN failure" \ + "%sexpected in \"" expr_str \ + "\", but %soccurred", \ + (fail ? " " : " not "), \ + (test_status.report_found ? \ + "" : "none ")); \ } \ if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ kasan_sync_fault_possible()) { \ @@ -141,6 +148,34 @@ static void kasan_test_exit(struct kunit *test) WRITE_ONCE(test_status.async_fault, false); \ } while (0) +/* + * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a + * KASAN report; causes a KUnit test failure otherwise. + * + * @test: Currently executing KUnit test. + * @expr: Expression that must produce a KASAN report. 
+ */ +#define KUNIT_EXPECT_KASAN_FAIL(test, expr) \ + KUNIT_EXPECT_KASAN_RESULT(test, expr, #expr, true) + +/* + * KUNIT_EXPECT_KASAN_FAIL_READ - check that the executed expression + * produces a KASAN report when the write-only mode is not enabled; + * causes a KUnit test failure otherwise. + * + * Note: At the moment, this macro does not check whether the produced + * KASAN report is a report about a bad read access. It is only intended + * for checking the write-only KASAN mode functionality without failing + * KASAN tests. + * + * @test: Currently executing KUnit test. + * @expr: Expression that must only produce a KASAN report + * when the write-only mode is not enabled. + */ +#define KUNIT_EXPECT_KASAN_FAIL_READ(test, expr) \ + KUNIT_EXPECT_KASAN_RESULT(test, expr, #expr, \ + !kasan_write_only_enabled()) \ + #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ if (!IS_ENABLED(config)) \ kunit_skip((test), "Test requires " #config "=y"); \ @@ -183,8 +218,8 @@ static void kmalloc_oob_right(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, ptr[size + 5] = 'y'); /* Out-of-bounds access past the aligned kmalloc object. */ - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = - ptr[size + KASAN_GRANULE_SIZE + 5]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = + ptr[size + KASAN_GRANULE_SIZE + 5]); kfree(ptr); } @@ -198,7 +233,7 @@ static void kmalloc_oob_left(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); OPTIMIZER_HIDE_VAR(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, *ptr = *(ptr - 1)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, *ptr = *(ptr - 1)); kfree(ptr); } @@ -211,7 +246,7 @@ static void kmalloc_node_oob_right(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); OPTIMIZER_HIDE_VAR(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = ptr[size]); kfree(ptr); } @@ -291,7 +326,7 @@ static void kmalloc_large_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]); } static void kmalloc_large_invalid_free(struct kunit *test) @@ -323,7 +358,7 @@ static void page_alloc_oob_right(struct kunit *test) ptr = page_address(pages); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ptr[0] = ptr[size]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ptr[0] = ptr[size]); free_pages((unsigned long)ptr, order); } @@ -338,7 +373,7 @@ static void page_alloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); free_pages((unsigned long)ptr, order); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]); } static void krealloc_more_oob_helper(struct kunit *test, @@ -458,7 +493,7 @@ static void krealloc_uaf(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL)); KUNIT_ASSERT_NULL(test, ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)ptr1); + KUNIT_EXPECT_KASAN_FAIL_READ(test, *(volatile char *)ptr1); } static void kmalloc_oob_16(struct kunit *test) @@ -501,7 +536,7 @@ static void kmalloc_uaf_16(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); kfree(ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, *ptr1 = *ptr2); + KUNIT_EXPECT_KASAN_FAIL_READ(test, *ptr1 = *ptr2); kfree(ptr1); } @@ -640,8 +675,8 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) memset((char *)ptr, 0, 64); OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(invalid_size); - KUNIT_EXPECT_KASAN_FAIL(test, 
- memmove((char *)ptr, (char *)ptr + 4, invalid_size)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, + memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); } @@ -654,7 +689,7 @@ static void kmalloc_uaf(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); kfree(ptr); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[8]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[8]); } static void kmalloc_uaf_memset(struct kunit *test) @@ -701,7 +736,7 @@ again: goto again; } - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[40]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr1)[40]); KUNIT_EXPECT_PTR_NE(test, ptr1, ptr2); kfree(ptr2); @@ -727,19 +762,19 @@ static void kmalloc_uaf3(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); kfree(ptr2); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr1)[8]); } static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) { int *i_unsafe = unsafe; - KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, READ_ONCE(*i_unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, WRITE_ONCE(*i_unsafe, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, smp_load_acquire(i_unsafe)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, smp_load_acquire(i_unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, smp_store_release(i_unsafe, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, atomic_read(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_set(unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_add(42, unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub(42, unsafe)); @@ -752,18 +787,31 @@ static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) KUNIT_EXPECT_KASAN_FAIL(test, atomic_xchg(unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_cmpxchg(unsafe, 21, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(unsafe, safe, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42)); + /* + * The result of the test below may vary due to garbage values of + * unsafe in write-only mode. + * Therefore, skip this test when KASAN is configured in write-only mode. + */ + if (!kasan_write_only_enabled()) + KUNIT_EXPECT_KASAN_FAIL(test, atomic_try_cmpxchg(safe, unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_sub_and_test(42, unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_and_test(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_and_test(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_negative(42, unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe)); + /* + * The result of the test below may vary due to garbage values of + * unsafe in write-only mode. + * Therefore, skip this test when KASAN is configured in write-only mode. 
+ */ + if (!kasan_write_only_enabled()) { + KUNIT_EXPECT_KASAN_FAIL(test, atomic_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_not_zero(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_dec_if_positive(unsafe)); + } - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_read(unsafe)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, atomic_long_read(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_set(unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add(42, unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub(42, unsafe)); @@ -776,16 +824,29 @@ static void kasan_atomics_helper(struct kunit *test, void *unsafe, void *safe) KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_xchg(unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_cmpxchg(unsafe, 21, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(unsafe, safe, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42)); + /* + * The result of the test below may vary due to garbage values of + * unsafe in write-only mode. + * Therefore, skip this test when KASAN is configured in write-only mode. + */ + if (!kasan_write_only_enabled()) + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_try_cmpxchg(safe, unsafe, 42)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_sub_and_test(42, unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_and_test(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_and_test(unsafe)); KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_negative(42, unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe)); - KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe)); + /* + * The result of the test below may vary due to garbage values of + * unsafe in write-only mode. + * Therefore, skip this test when KASAN is configured in write-only mode. + */ + if (!kasan_write_only_enabled()) { + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_add_unless(unsafe, 21, 42)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_not_zero(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_inc_unless_negative(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_unless_positive(unsafe)); + KUNIT_EXPECT_KASAN_FAIL(test, atomic_long_dec_if_positive(unsafe)); + } } static void kasan_atomics(struct kunit *test) @@ -842,8 +903,8 @@ static void ksize_unpoisons_memory(struct kunit *test) /* These must trigger a KASAN report. 
*/ if (IS_ENABLED(CONFIG_KASAN_GENERIC)) KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size + 5]); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size - 1]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size + 5]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[real_size - 1]); kfree(ptr); } @@ -863,8 +924,8 @@ static void ksize_uaf(struct kunit *test) OPTIMIZER_HIDE_VAR(ptr); KUNIT_EXPECT_KASAN_FAIL(test, ksize(ptr)); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[size]); } /* @@ -899,9 +960,9 @@ static void rcu_uaf(struct kunit *test) global_rcu_ptr = rcu_dereference_protected( (struct kasan_rcu_info __rcu *)ptr, NULL); - KUNIT_EXPECT_KASAN_FAIL(test, - call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim); - rcu_barrier()); + KUNIT_EXPECT_KASAN_FAIL_READ(test, + call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim); + rcu_barrier()); } static void workqueue_uaf_work(struct work_struct *work) @@ -924,8 +985,8 @@ static void workqueue_uaf(struct kunit *test) queue_work(workqueue, work); destroy_workqueue(workqueue); - KUNIT_EXPECT_KASAN_FAIL(test, - ((volatile struct work_struct *)work)->data); + KUNIT_EXPECT_KASAN_FAIL_READ(test, + ((volatile struct work_struct *)work)->data); } static void kfree_via_page(struct kunit *test) @@ -972,7 +1033,7 @@ static void kmem_cache_oob(struct kunit *test) return; } - KUNIT_EXPECT_KASAN_FAIL(test, *p = p[size + OOB_TAG_OFF]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, *p = p[size + OOB_TAG_OFF]); kmem_cache_free(cache, p); kmem_cache_destroy(cache); @@ -1068,7 +1129,7 @@ static void kmem_cache_rcu_uaf(struct kunit *test) */ rcu_barrier(); - KUNIT_EXPECT_KASAN_FAIL(test, READ_ONCE(*p)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, READ_ONCE(*p)); kmem_cache_destroy(cache); } @@ -1246,7 +1307,7 @@ static void mempool_oob_right_helper(struct kunit *test, mempool_t *pool, size_t KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)&elem[size])[0]); else - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)&elem[round_up(size, KASAN_GRANULE_SIZE)])[0]); mempool_free(elem, pool); @@ -1312,7 +1373,7 @@ static void mempool_uaf_helper(struct kunit *test, mempool_t *pool, bool page) mempool_free(elem, pool); ptr = page ? page_address((struct page *)elem) : elem; - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[0]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)ptr)[0]); } static void mempool_kmalloc_uaf(struct kunit *test) @@ -1571,7 +1632,7 @@ static void kasan_memchr(struct kunit *test) OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = memchr(ptr, '1', size + 1)); kfree(ptr); @@ -1598,7 +1659,7 @@ static void kasan_memcmp(struct kunit *test) OPTIMIZER_HIDE_VAR(ptr); OPTIMIZER_HIDE_VAR(size); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = memcmp(ptr, arr, size+1)); kfree(ptr); } @@ -1635,7 +1696,7 @@ static void kasan_strings(struct kunit *test) strscpy(ptr, src + 1, KASAN_GRANULE_SIZE)); /* strscpy should fail if the first byte is unreadable. 
*/ - KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE, + KUNIT_EXPECT_KASAN_FAIL_READ(test, strscpy(ptr, src + KASAN_GRANULE_SIZE, KASAN_GRANULE_SIZE)); kfree(src); @@ -1648,17 +1709,17 @@ static void kasan_strings(struct kunit *test) * will likely point to zeroed byte. */ ptr += 16; - KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strchr(ptr, '1')); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = strchr(ptr, '1')); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_ptr_result = strrchr(ptr, '1')); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_ptr_result = strrchr(ptr, '1')); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strcmp(ptr, "2")); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strcmp(ptr, "2")); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strncmp(ptr, "2", 1)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strncmp(ptr, "2", 1)); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strlen(ptr)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strlen(ptr)); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = strnlen(ptr, 1)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = strnlen(ptr, 1)); } static void kasan_bitops_modify(struct kunit *test, int nr, void *addr) @@ -1677,12 +1738,18 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) { KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, __test_and_set_bit(nr, addr)); - KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr)); + /* + * When KASAN is running in write-only mode, + * a fault won't occur when the bit is set. + * Therefore, skip the test_and_set_bit_lock test in write-only mode. + */ + if (!kasan_write_only_enabled()) + KUNIT_EXPECT_KASAN_FAIL(test, test_and_set_bit_lock(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, test_and_clear_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, __test_and_clear_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); + KUNIT_EXPECT_KASAN_FAIL_READ(test, kasan_int_result = test_bit(nr, addr)); if (nr < 7) KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = xor_unlock_is_negative_byte(1 << nr, addr)); @@ -1806,7 +1873,7 @@ static void vmalloc_oob(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size]); /* An aligned access into the first out-of-bounds granule. */ - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)v_ptr)[size + 5]); + KUNIT_EXPECT_KASAN_FAIL_READ(test, ((volatile char *)v_ptr)[size + 5]); /* Check that in-bounds accesses to the physical page are valid. 
*/ page = vmalloc_to_page(v_ptr); @@ -2083,15 +2150,15 @@ static void copy_user_test_oob(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, unused = copy_from_user(kmem, usermem, size + 1)); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, unused = copy_to_user(usermem, kmem, size + 1)); KUNIT_EXPECT_KASAN_FAIL(test, unused = __copy_from_user(kmem, usermem, size + 1)); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, unused = __copy_to_user(usermem, kmem, size + 1)); KUNIT_EXPECT_KASAN_FAIL(test, unused = __copy_from_user_inatomic(kmem, usermem, size + 1)); - KUNIT_EXPECT_KASAN_FAIL(test, + KUNIT_EXPECT_KASAN_FAIL_READ(test, unused = __copy_to_user_inatomic(usermem, kmem, size + 1)); /* From 2a8f3f44f5ac9a2d27f43a11a96d935ac620be6a Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 1 Sep 2025 16:20:52 +0800 Subject: [PATCH 225/372] mm/hugetlb: retry to allocate for early boot hugepage allocation In cloud environments with massive hugepage reservations (95%+ of system RAM), single-attempt allocation during early boot often fails due to memory pressure. Commit 91f386bf0772 ("hugetlb: batch freeing of vmemmap pages") intensified this by deferring page frees, increasing peak memory usage during allocation. Introduce a retry mechanism that leverages vmemmap optimization reclaim (~1.6% memory) when available. Upon initial allocation failure, the system retries until successful or no further progress is made, ensuring reliable hugepage allocation while preserving batched vmemmap freeing benefits. Testing on a 256G machine allocating 252G of hugepages: Before: 128056/129024 hugepages allocated After: Successfully allocated all 129024 hugepages Link: https://lkml.kernel.org/r/20250901082052.3247-1-lirongqing@baidu.com Signed-off-by: Li RongQing Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Li RongQing Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 56e6d2af0843..34f91dba2ca1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3593,10 +3593,9 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) unsigned long jiffies_start; unsigned long jiffies_end; + unsigned long remaining; job.thread_fn = hugetlb_pages_alloc_boot_node; - job.start = 0; - job.size = h->max_huge_pages; /* * job.max_threads is 25% of the available cpu threads by default. @@ -3620,10 +3619,29 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) } job.max_threads = hugepage_allocation_threads; - job.min_chunk = h->max_huge_pages / hugepage_allocation_threads; jiffies_start = jiffies; - padata_do_multithreaded(&job); + do { + remaining = h->max_huge_pages - h->nr_huge_pages; + + job.start = h->nr_huge_pages; + job.size = remaining; + job.min_chunk = remaining / hugepage_allocation_threads; + padata_do_multithreaded(&job); + + if (h->nr_huge_pages == h->max_huge_pages) + break; + + /* + * Retry only if the vmemmap optimization might have been able to free + * some memory back to the system.
*/ if (!hugetlb_vmemmap_optimizable(h)) break; /* Continue if progress was made in last iteration */ } while (remaining != (h->max_huge_pages - h->nr_huge_pages)); jiffies_end = jiffies; pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n", From 0c83e7faa8481d184766409ed5aa10493f0040d9 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 2 Sep 2025 09:49:21 -0300 Subject: [PATCH 226/372] mm: show_mem: show number of zspages in show_free_areas When OOM is triggered, it will show where the pages might be for each zone. When using zram or zswap, it might look like lots of pages are missing. After this patch, zspages are shown as below. [ 48.792859] Node 0 DMA free:2812kB boost:0kB min:60kB low:72kB high:84kB reserved_highatomic:0KB free_highatomic:0KB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB zspages:11160kB present:15992kB managed:15360kB mlocked:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB [ 48.792962] lowmem_reserve[]: 0 956 956 956 956 [ 48.792988] Node 0 DMA32 free:3512kB boost:0kB min:3912kB low:4888kB high:5864kB reserved_highatomic:0KB free_highatomic:0KB active_anon:0kB inactive_anon:28kB active_file:8kB inactive_file:16kB unevictable:0kB writepending:0kB zspages:916780kB present:1032064kB managed:978944kB mlocked:0kB bounce:0kB free_pcp:500kB local_pcp:248kB free_cma:0kB [ 48.793118] lowmem_reserve[]: 0 0 0 0 0 Link: https://lkml.kernel.org/r/20250902-show_mem_zspages-v2-1-545daaa8b410@igalia.com Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Zi Yan Acked-by: SeongJae Park Reviewed-by: Chengming Zhou Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Brendan Jackman Cc: Michal Hocko Cc: Minchan Kim Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/show_mem.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/show_mem.c b/mm/show_mem.c index 41999e94a56d..c563d9adfa87 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -310,6 +310,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " inactive_file:%lukB" " unevictable:%lukB" " writepending:%lukB" + " zspages:%lukB" " present:%lukB" " managed:%lukB" " mlocked:%lukB" @@ -332,6 +333,11 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)), K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)), K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)), +#if IS_ENABLED(CONFIG_ZSMALLOC) + K(zone_page_state(zone, NR_ZSPAGES)), +#else + 0UL, +#endif K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), From 902020f027457d999fd2a4ebdbb7ba72e5c8c27e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 10 Sep 2025 21:39:54 +0800 Subject: [PATCH 227/372] mm: hugetlb: convert to use more alloc_fresh_hugetlb_folio() Patch series "mm: hugetlb: cleanup hugetlb folio allocation", v3. Some cleanups for hugetlb folio allocation. This patch (of 3): Simplify alloc_fresh_hugetlb_folio() and convert more functions to use it, which helps us to remove prep_new_hugetlb_folio() and __prep_new_hugetlb_folio().
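Condensed from the diff below, the resulting calling convention is (sketch): the helper returns a frozen, vmemmap-optimized folio, and any accounting is left to the caller under hugetlb_lock, as alloc_migrate_hugetlb_folio() now does:

	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
	if (folio) {
		spin_lock_irq(&hugetlb_lock);
		__prep_account_new_huge_page(h, folio_nid(folio));
		spin_unlock_irq(&hugetlb_lock);
	}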
Link: https://lkml.kernel.org/r/20250910133958.301467-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20250910133958.301467-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Oscar Salvador Reviewed-by: Sidhartha Kumar Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/hugetlb.c | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 34f91dba2ca1..1378b37864c8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1906,20 +1906,6 @@ static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) set_hugetlb_cgroup_rsvd(folio, NULL); } -static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) -{ - init_new_hugetlb_folio(h, folio); - hugetlb_vmemmap_optimize_folio(h, folio); -} - -static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) -{ - __prep_new_hugetlb_folio(h, folio); - spin_lock_irq(&hugetlb_lock); - __prep_account_new_huge_page(h, nid); - spin_unlock_irq(&hugetlb_lock); -} - /* * Find and lock address space (mapping) in write mode. * @@ -2005,25 +1991,20 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, } /* - * Common helper to allocate a fresh hugetlb page. All specific allocators - * should use this function to get new hugetlb pages + * Common helper to allocate a fresh hugetlb folio. All specific allocators + * should use this function to get new hugetlb folio * - * Note that returned page is 'frozen': ref count of head page and all tail - * pages is zero. + * Note that returned folio is 'frozen': ref count of head page and all tail + * pages is zero, and the accounting must be done in the caller. 
*/ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct folio *folio; - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); - else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); - if (!folio) - return NULL; - - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (folio) + hugetlb_vmemmap_optimize_folio(h, folio); return folio; } @@ -2241,12 +2222,10 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, goto out_unlock; spin_unlock_irq(&hugetlb_lock); - folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask); if (!folio) return NULL; - hugetlb_vmemmap_optimize_folio(h, folio); - spin_lock_irq(&hugetlb_lock); /* * nr_huge_pages needs to be adjusted within the same lock cycle @@ -2290,6 +2269,10 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas if (!folio) return NULL; + spin_lock_irq(&hugetlb_lock); + __prep_account_new_huge_page(h, folio_nid(folio)); + spin_unlock_irq(&hugetlb_lock); + /* fresh huge pages are frozen */ folio_ref_unfreeze(folio, 1); /* @@ -2836,11 +2819,10 @@ retry: if (!new_folio) { spin_unlock_irq(&hugetlb_lock); gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, - NULL, NULL); + new_folio = alloc_fresh_hugetlb_folio(h, gfp_mask, + nid, NULL); if (!new_folio) return -ENOMEM; - __prep_new_hugetlb_folio(h, new_folio); goto retry; } From 4094d3434b25a1f0524c28bc3a253fd09d05c361 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 10 Sep 2025 21:39:55 +0800 Subject: [PATCH 228/372] mm: hugetlb: convert to account_new_hugetlb_folio() To avoid the wrong nid being passed into the accounting (we have made such a mistake before), it is better to move folio_nid() into account_new_hugetlb_folio().
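The hazard being closed, in two lines (illustrative):

	/* Before: the caller supplied the nid and could pass the wrong one. */
	__prep_account_new_huge_page(h, nid);	/* nid may not match the folio */

	/* After: the nid is derived from the folio itself. */
	account_new_hugetlb_folio(h, folio);	/* uses folio_nid(folio) */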
Link: https://lkml.kernel.org/r/20250910133958.301467-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Oscar Salvador Reviewed-by: Sidhartha Kumar Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Jane Chu Cc: Johannes Weiner Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/hugetlb.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1378b37864c8..ef6284ec85b6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1890,11 +1890,11 @@ void free_huge_folio(struct folio *folio) /* * Must be called with the hugetlb lock held */ -static void __prep_account_new_huge_page(struct hstate *h, int nid) +static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio) { lockdep_assert_held(&hugetlb_lock); h->nr_huge_pages++; - h->nr_huge_pages_node[nid]++; + h->nr_huge_pages_node[folio_nid(folio)]++; } static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) @@ -2020,7 +2020,7 @@ static void prep_and_add_allocated_folios(struct hstate *h, /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { - __prep_account_new_huge_page(h, folio_nid(folio)); + account_new_hugetlb_folio(h, folio); enqueue_hugetlb_folio(h, folio); } spin_unlock_irqrestore(&hugetlb_lock, flags); @@ -2232,7 +2232,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, * as surplus_pages, otherwise it might confuse * persistent_huge_pages() momentarily. */ - __prep_account_new_huge_page(h, folio_nid(folio)); + account_new_hugetlb_folio(h, folio); /* * We could have raced with the pool size change. @@ -2270,7 +2270,7 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas return NULL; spin_lock_irq(&hugetlb_lock); - __prep_account_new_huge_page(h, folio_nid(folio)); + account_new_hugetlb_folio(h, folio); spin_unlock_irq(&hugetlb_lock); /* fresh huge pages are frozen */ @@ -2829,7 +2829,7 @@ retry: /* * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be - * incremented again when calling __prep_account_new_huge_page() + * incremented again when calling account_new_hugetlb_folio() * and enqueue_hugetlb_folio() for new_folio. The counters will * remain stable since this happens under the lock. */ @@ -2839,7 +2839,7 @@ retry: * Ref count on new_folio is already zero as it was dropped * earlier. It can be directly added to the pool free list. */ - __prep_account_new_huge_page(h, nid); + account_new_hugetlb_folio(h, new_folio); enqueue_hugetlb_folio(h, new_folio); /* @@ -3313,7 +3313,7 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h, hugetlb_bootmem_init_migratetype(folio, h); /* Subdivide locks to achieve better parallel performance */ spin_lock_irqsave(&hugetlb_lock, flags); - __prep_account_new_huge_page(h, folio_nid(folio)); + account_new_hugetlb_folio(h, folio); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } From 4a25f995bd59843a898b531bb3e472d710ef9439 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 10 Sep 2025 21:39:56 +0800 Subject: [PATCH 229/372] mm: hugetlb: directly pass order when allocating a hugetlb folio Use the order instead of struct hstate to remove the huge_page_order() call from all hugetlb folio allocations; also add order_is_gigantic() to check whether an order is gigantic.
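Condensed from the diff below, the allocation dispatch then needs only the order (sketch):

	int order = huge_page_order(h);

	if (order_is_gigantic(order))	/* i.e. order > MAX_PAGE_ORDER */
		folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask);
	else
		folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask,
						  node_alloc_noretry);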
Link: https://lkml.kernel.org/r/20250910133958.301467-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Oscar Salvador Reviewed-by: Sidhartha Kumar Reviewed-by: Jane Chu Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 ++++++- mm/hugetlb.c | 29 ++++++++++++++--------------- mm/hugetlb_cma.c | 3 +-- mm/hugetlb_cma.h | 6 +++--- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 526d27e88b3b..8e63e46b8e1f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -788,9 +788,14 @@ static inline unsigned huge_page_shift(struct hstate *h) return h->order + PAGE_SHIFT; } +static inline bool order_is_gigantic(unsigned int order) +{ + return order > MAX_PAGE_ORDER; +} + static inline bool hstate_is_gigantic(struct hstate *h) { - return huge_page_order(h) > MAX_PAGE_ORDER; + return order_is_gigantic(huge_page_order(h)); } static inline unsigned int pages_per_huge_page(const struct hstate *h) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ef6284ec85b6..7f33e4a158c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1473,17 +1473,16 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #ifdef CONFIG_CONTIG_ALLOC -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { struct folio *folio; - int order = huge_page_order(h); bool retried = false; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); retry: - folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask); + folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask); if (!folio) { if (hugetlb_cma_exclusive_alloc()) return NULL; @@ -1506,16 +1505,16 @@ retry: } #else /* !CONFIG_CONTIG_ALLOC */ -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, + nodemask_t *nodemask) { return NULL; } #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid, + nodemask_t *nodemask) { return NULL; } @@ -1926,11 +1925,9 @@ struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) return NULL; } -static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) +static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { - int order = huge_page_order(h); struct folio *folio; bool alloc_try_hard = true; @@ -1980,11 +1977,13 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, nodemask_t *node_alloc_noretry) { struct folio *folio; + int order = huge_page_order(h); - if (hstate_is_gigantic(h)) - folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); + if (order_is_gigantic(order)) + folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask); else - folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); + folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask, + node_alloc_noretry); if (folio) 
init_new_hugetlb_folio(h, folio); return folio; @@ -2872,7 +2871,7 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) * alloc_contig_range and them. Return -ENOMEM as this has the effect * of bailing out right away without further retrying. */ - if (folio_order(folio) > MAX_PAGE_ORDER) + if (order_is_gigantic(folio_order(folio))) return -ENOMEM; if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) diff --git a/mm/hugetlb_cma.c b/mm/hugetlb_cma.c index f58ef4969e7a..e8e4dc7182d5 100644 --- a/mm/hugetlb_cma.c +++ b/mm/hugetlb_cma.c @@ -26,11 +26,10 @@ void hugetlb_cma_free_folio(struct folio *folio) } -struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, +struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { int node; - int order = huge_page_order(h); struct folio *folio = NULL; if (hugetlb_cma[nid]) diff --git a/mm/hugetlb_cma.h b/mm/hugetlb_cma.h index f7d7fb9880a2..2c2ec8a7e134 100644 --- a/mm/hugetlb_cma.h +++ b/mm/hugetlb_cma.h @@ -4,7 +4,7 @@ #ifdef CONFIG_CMA void hugetlb_cma_free_folio(struct folio *folio); -struct folio *hugetlb_cma_alloc_folio(struct hstate *h, gfp_t gfp_mask, +struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, int nid, nodemask_t *nodemask); struct huge_bootmem_page *hugetlb_cma_alloc_bootmem(struct hstate *h, int *nid, bool node_exact); @@ -18,8 +18,8 @@ static inline void hugetlb_cma_free_folio(struct folio *folio) { } -static inline struct folio *hugetlb_cma_alloc_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nodemask) +static inline struct folio *hugetlb_cma_alloc_folio(int order, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { return NULL; } From dd4d324bc02c7b14ae5dd864d185d1403648c74d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 10 Sep 2025 21:39:57 +0800 Subject: [PATCH 230/372] mm: hugetlb: remove struct hstate from init_new_hugetlb_folio() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The struct hstate is never used since commit d67e32f26713 ("hugetlb: restructure pool allocations”), remove it. 
Link: https://lkml.kernel.org/r/20250910133958.301467-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Oscar Salvador Reviewed-by: Sidhartha Kumar Reviewed-by: Jane Chu Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/hugetlb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7f33e4a158c6..1783b9e7c338 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1896,7 +1896,7 @@ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio) h->nr_huge_pages_node[folio_nid(folio)]++; } -static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) +static void init_new_hugetlb_folio(struct folio *folio) { __folio_set_hugetlb(folio); INIT_LIST_HEAD(&folio->lru); @@ -1985,7 +1985,7 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, folio = alloc_buddy_hugetlb_folio(order, gfp_mask, nid, nmask, node_alloc_noretry); if (folio) - init_new_hugetlb_folio(h, folio); + init_new_hugetlb_folio(folio); return folio; } @@ -3408,7 +3408,7 @@ static void __init gather_bootmem_prealloc_node(unsigned long nid) hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - init_new_hugetlb_folio(h, folio); + init_new_hugetlb_folio(folio); if (hugetlb_bootmem_page_prehvo(m)) /* @@ -4041,7 +4041,7 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, prep_compound_page(page, dst->order); new_folio->mapping = NULL; - init_new_hugetlb_folio(dst, new_folio); + init_new_hugetlb_folio(new_folio); /* Copy the CMA flag so that it is freed correctly */ if (cma) folio_set_hugetlb_cma(new_folio); From 4fe2a8107f332a46ed284fb961a4ddb39a105509 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 10 Sep 2025 21:39:58 +0800 Subject: [PATCH 231/372] mm: hugetlb: check NUMA_NO_NODE in only_alloc_fresh_hugetlb_folio() Move the NUMA_NO_NODE check out of the buddy and gigantic folio allocation paths to clean up the code a bit; this also avoids NUMA_NO_NODE being passed as 'nid' to node_isset() in alloc_buddy_hugetlb_folio().
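Why hoisting the check matters (sketch): the common helper now resolves the nid once, so alloc_buddy_hugetlb_folio(), which may use the nid as a bit index, never sees NUMA_NO_NODE (-1):

	/* In only_alloc_fresh_hugetlb_folio(), before either allocator runs: */
	if (nid == NUMA_NO_NODE)
		nid = numa_mem_id();

	/* ...so this test in alloc_buddy_hugetlb_folio() always gets a
	 * valid node id: */
	if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
		alloc_try_hard = false;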
Link: https://lkml.kernel.org/r/20250910133958.301467-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Oscar Salvador Reviewed-by: Sidhartha Kumar Reviewed-by: Jane Chu Reviewed-by: Zi Yan Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Muchun Song Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1783b9e7c338..d2471a0b6002 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1479,8 +1479,6 @@ static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, struct folio *folio; bool retried = false; - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); retry: folio = hugetlb_cma_alloc_folio(order, gfp_mask, nid, nodemask); if (!folio) { @@ -1942,8 +1940,6 @@ static struct folio *alloc_buddy_hugetlb_folio(int order, gfp_t gfp_mask, alloc_try_hard = false; if (alloc_try_hard) gfp_mask |= __GFP_RETRY_MAYFAIL; - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); folio = (struct folio *)__alloc_frozen_pages(gfp_mask, order, nid, nmask); @@ -1979,6 +1975,9 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, struct folio *folio; int order = huge_page_order(h); + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + if (order_is_gigantic(order)) folio = alloc_gigantic_folio(order, gfp_mask, nid, nmask); else From 8eccb066f28747e966bda716cb90dbca13b78032 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:10 +0200 Subject: [PATCH 232/372] mm: constify shmem related test functions for improved const-correctness Patch series "mm: establish const-correctness for pointer parameters", v6. This series improves const-correctness in the low-level memory-management subsystem, which provides a basis for further constification further up the call stack (e.g. filesystems). I started this work when I tried to constify the Ceph filesystem code, but found that to be impossible because many "mm" functions accept non-const pointers, even though they modify nothing. This patch (of 12): We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-1-max.kellermann@ionos.com Link: https://lkml.kernel.org/r/20250901205021.3573313-2-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H.
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm.h | 8 ++++---- include/linux/shmem_fs.h | 4 ++-- mm/shmem.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b26ca8b2162d..45a47b555499 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -995,11 +995,11 @@ static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ -bool vma_is_shmem(struct vm_area_struct *vma); -bool vma_is_anon_shmem(struct vm_area_struct *vma); +bool vma_is_shmem(const struct vm_area_struct *vma); +bool vma_is_anon_shmem(const struct vm_area_struct *vma); #else -static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } -static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } +static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false; } +static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 6d0f9c599ff7..0e47465ef0fd 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -99,9 +99,9 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts); #ifdef CONFIG_SHMEM -bool shmem_mapping(struct address_space *mapping); +bool shmem_mapping(const struct address_space *mapping); #else -static inline bool shmem_mapping(struct address_space *mapping) +static inline bool shmem_mapping(const struct address_space *mapping) { return false; } diff --git a/mm/shmem.c b/mm/shmem.c index 640fecc42f60..2df26f4d6e60 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -275,18 +275,18 @@ static const struct vm_operations_struct shmem_vm_ops; static const struct vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; -bool shmem_mapping(struct address_space *mapping) +bool shmem_mapping(const struct address_space *mapping) { return mapping->a_ops == &shmem_aops; } EXPORT_SYMBOL_GPL(shmem_mapping); -bool vma_is_anon_shmem(struct vm_area_struct *vma) +bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return vma->vm_ops == &shmem_anon_vm_ops; } -bool vma_is_shmem(struct vm_area_struct *vma) +bool vma_is_shmem(const struct vm_area_struct *vma) { return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } From 7c3e97ac0d75306d9d03de575c9878f8fd9efe3b Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:11 +0200 Subject: [PATCH 233/372] mm: constify pagemap related test/getter functions For improved const-correctness. We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. 
Link: https://lkml.kernel.org/r/20250901205021.3573313-3-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 55 +++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f0dfdfb13cd9..0d66a252b06f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -140,7 +140,7 @@ static inline int inode_drain_writes(struct inode *inode) return filemap_write_and_wait(inode->i_mapping); } -static inline bool mapping_empty(struct address_space *mapping) +static inline bool mapping_empty(const struct address_space *mapping) { return xa_empty(&mapping->i_pages); } @@ -166,7 +166,7 @@ static inline bool mapping_empty(struct address_space *mapping) * refcount and the referenced bit, which will be elevated or set in * the process of adding new cache pages to an inode. 
*/ -static inline bool mapping_shrinkable(struct address_space *mapping) +static inline bool mapping_shrinkable(const struct address_space *mapping) { void *head; @@ -267,7 +267,7 @@ static inline void mapping_clear_unevictable(struct address_space *mapping) clear_bit(AS_UNEVICTABLE, &mapping->flags); } -static inline bool mapping_unevictable(struct address_space *mapping) +static inline bool mapping_unevictable(const struct address_space *mapping) { return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags); } @@ -277,7 +277,7 @@ static inline void mapping_set_exiting(struct address_space *mapping) set_bit(AS_EXITING, &mapping->flags); } -static inline int mapping_exiting(struct address_space *mapping) +static inline int mapping_exiting(const struct address_space *mapping) { return test_bit(AS_EXITING, &mapping->flags); } @@ -287,7 +287,7 @@ static inline void mapping_set_no_writeback_tags(struct address_space *mapping) set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } -static inline int mapping_use_writeback_tags(struct address_space *mapping) +static inline int mapping_use_writeback_tags(const struct address_space *mapping) { return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } @@ -333,7 +333,7 @@ static inline void mapping_set_inaccessible(struct address_space *mapping) set_bit(AS_INACCESSIBLE, &mapping->flags); } -static inline bool mapping_inaccessible(struct address_space *mapping) +static inline bool mapping_inaccessible(const struct address_space *mapping) { return test_bit(AS_INACCESSIBLE, &mapping->flags); } @@ -343,18 +343,18 @@ static inline void mapping_set_writeback_may_deadlock_on_reclaim(struct address_ set_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline bool mapping_writeback_may_deadlock_on_reclaim(struct address_space *mapping) +static inline bool mapping_writeback_may_deadlock_on_reclaim(const struct address_space *mapping) { return test_bit(AS_WRITEBACK_MAY_DEADLOCK_ON_RECLAIM, &mapping->flags); } -static inline gfp_t mapping_gfp_mask(struct address_space * mapping) +static inline gfp_t mapping_gfp_mask(const struct address_space *mapping) { return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows. */ -static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, +static inline gfp_t mapping_gfp_constraint(const struct address_space *mapping, gfp_t gfp_mask) { return mapping_gfp_mask(mapping) & gfp_mask; @@ -477,7 +477,7 @@ mapping_min_folio_order(const struct address_space *mapping) } static inline unsigned long -mapping_min_folio_nrpages(struct address_space *mapping) +mapping_min_folio_nrpages(const struct address_space *mapping) { return 1UL << mapping_min_folio_order(mapping); } @@ -491,7 +491,7 @@ mapping_min_folio_nrpages(struct address_space *mapping) * new folio to the page cache and need to know what index to give it, * call this function. */ -static inline pgoff_t mapping_align_index(struct address_space *mapping, +static inline pgoff_t mapping_align_index(const struct address_space *mapping, pgoff_t index) { return round_down(index, mapping_min_folio_nrpages(mapping)); @@ -501,7 +501,7 @@ static inline pgoff_t mapping_align_index(struct address_space *mapping, * Large folio support currently depends on THP. These dependencies are * being worked on but are not yet fixed. 
*/ -static inline bool mapping_large_folio_support(struct address_space *mapping) +static inline bool mapping_large_folio_support(const struct address_space *mapping) { /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ VM_WARN_ONCE((unsigned long)mapping & FOLIO_MAPPING_ANON, @@ -516,7 +516,7 @@ static inline size_t mapping_max_folio_size(const struct address_space *mapping) return PAGE_SIZE << mapping_max_folio_order(mapping); } -static inline int filemap_nr_thps(struct address_space *mapping) +static inline int filemap_nr_thps(const struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS return atomic_read(&mapping->nr_thps); @@ -930,7 +930,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, * * Return: The index of the folio which follows this folio in the file. */ -static inline pgoff_t folio_next_index(struct folio *folio) +static inline pgoff_t folio_next_index(const struct folio *folio) { return folio->index + folio_nr_pages(folio); } @@ -959,7 +959,7 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) * e.g., shmem did not move this folio to the swap cache. * Return: true or false. */ -static inline bool folio_contains(struct folio *folio, pgoff_t index) +static inline bool folio_contains(const struct folio *folio, pgoff_t index) { VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); return index - folio->index < folio_nr_pages(folio); @@ -1036,13 +1036,13 @@ static inline loff_t page_offset(struct page *page) /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ -static inline pgoff_t folio_pgoff(struct folio *folio) +static inline pgoff_t folio_pgoff(const struct folio *folio) { return folio->index; } -static inline pgoff_t linear_page_index(struct vm_area_struct *vma, - unsigned long address) +static inline pgoff_t linear_page_index(const struct vm_area_struct *vma, + const unsigned long address) { pgoff_t pgoff; pgoff = (address - vma->vm_start) >> PAGE_SHIFT; @@ -1462,7 +1462,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. */ -static inline loff_t readahead_pos(struct readahead_control *rac) +static inline loff_t readahead_pos(const struct readahead_control *rac) { return (loff_t)rac->_index * PAGE_SIZE; } @@ -1471,7 +1471,7 @@ static inline loff_t readahead_pos(struct readahead_control *rac) * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ -static inline size_t readahead_length(struct readahead_control *rac) +static inline size_t readahead_length(const struct readahead_control *rac) { return rac->_nr_pages * PAGE_SIZE; } @@ -1480,7 +1480,7 @@ static inline size_t readahead_length(struct readahead_control *rac) * readahead_index - The index of the first page in this readahead request. * @rac: The readahead request. */ -static inline pgoff_t readahead_index(struct readahead_control *rac) +static inline pgoff_t readahead_index(const struct readahead_control *rac) { return rac->_index; } @@ -1489,7 +1489,7 @@ static inline pgoff_t readahead_index(struct readahead_control *rac) * readahead_count - The number of pages in this readahead request. * @rac: The readahead request. 
*/ -static inline unsigned int readahead_count(struct readahead_control *rac) +static inline unsigned int readahead_count(const struct readahead_control *rac) { return rac->_nr_pages; } @@ -1498,12 +1498,12 @@ static inline unsigned int readahead_count(struct readahead_control *rac) * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ -static inline size_t readahead_batch_length(struct readahead_control *rac) +static inline size_t readahead_batch_length(const struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } -static inline unsigned long dir_pages(struct inode *inode) +static inline unsigned long dir_pages(const struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -1517,8 +1517,8 @@ static inline unsigned long dir_pages(struct inode *inode) * Return: the number of bytes in the folio up to EOF, * or -EFAULT if the folio was truncated. */ -static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, - struct inode *inode) +static inline ssize_t folio_mkwrite_check_truncate(const struct folio *folio, + const struct inode *inode) { loff_t size = i_size_read(inode); pgoff_t index = size >> PAGE_SHIFT; @@ -1549,7 +1549,8 @@ static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio, * Return: The number of filesystem blocks covered by this folio. */ static inline -unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio) +unsigned int i_blocks_per_folio(const struct inode *inode, + const struct folio *folio) { return folio_size(folio) >> inode->i_blkbits; } From 959b0886256b6896b44633e0e07c5464169087c1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:12 +0200 Subject: [PATCH 234/372] mm: constify zone related test/getter functions For improved const-correctness. We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-4-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f3272ef5131b..6c4eae96160d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1104,7 +1104,7 @@ static inline unsigned long promo_wmark_pages(const struct zone *z) return wmark_pages(z, WMARK_PROMO); } -static inline unsigned long zone_managed_pages(struct zone *zone) +static inline unsigned long zone_managed_pages(const struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } @@ -1128,12 +1128,12 @@ static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } -static inline bool zone_is_initialized(struct zone *zone) +static inline bool zone_is_initialized(const struct zone *zone) { return zone->initialized; } -static inline bool zone_is_empty(struct zone *zone) +static inline bool zone_is_empty(const struct zone *zone) { return zone->spanned_pages == 0; } @@ -1273,7 +1273,7 @@ static inline bool folio_is_zone_movable(const struct folio *folio) * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ -static inline bool zone_intersects(struct zone *zone, +static inline bool zone_intersects(const struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) @@ -1581,12 +1581,12 @@ static inline int local_memory_node(int node_id) { return node_id; }; #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) #ifdef CONFIG_ZONE_DEVICE -static inline bool zone_is_zone_device(struct zone *zone) +static inline bool zone_is_zone_device(const struct zone *zone) { return zone_idx(zone) == ZONE_DEVICE; } #else -static inline bool zone_is_zone_device(struct zone *zone) +static inline bool zone_is_zone_device(const struct zone *zone) { return false; } @@ -1598,19 +1598,19 @@ static inline bool zone_is_zone_device(struct zone *zone) * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). 
*/ -static inline bool managed_zone(struct zone *zone) +static inline bool managed_zone(const struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ -static inline bool populated_zone(struct zone *zone) +static inline bool populated_zone(const struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NUMA -static inline int zone_to_nid(struct zone *zone) +static inline int zone_to_nid(const struct zone *zone) { return zone->node; } @@ -1620,7 +1620,7 @@ static inline void zone_set_nid(struct zone *zone, int nid) zone->node = nid; } #else -static inline int zone_to_nid(struct zone *zone) +static inline int zone_to_nid(const struct zone *zone) { return 0; } @@ -1647,7 +1647,7 @@ static inline int is_highmem_idx(enum zone_type idx) * @zone: pointer to struct zone variable * Return: 1 for a highmem zone, 0 otherwise */ -static inline int is_highmem(struct zone *zone) +static inline int is_highmem(const struct zone *zone) { return is_highmem_idx(zone_idx(zone)); } @@ -1713,12 +1713,12 @@ static inline struct zone *zonelist_zone(struct zoneref *zoneref) return zoneref->zone; } -static inline int zonelist_zone_idx(struct zoneref *zoneref) +static inline int zonelist_zone_idx(const struct zoneref *zoneref) { return zoneref->zone_idx; } -static inline int zonelist_node_idx(struct zoneref *zoneref) +static inline int zonelist_node_idx(const struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } @@ -2021,7 +2021,7 @@ static inline struct page *__section_mem_map_addr(struct mem_section *section) return (struct page *)map; } -static inline int present_section(struct mem_section *section) +static inline int present_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } @@ -2031,12 +2031,12 @@ static inline int present_section_nr(unsigned long nr) return present_section(__nr_to_section(nr)); } -static inline int valid_section(struct mem_section *section) +static inline int valid_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } -static inline int early_section(struct mem_section *section) +static inline int early_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } @@ -2046,27 +2046,27 @@ static inline int valid_section_nr(unsigned long nr) return valid_section(__nr_to_section(nr)); } -static inline int online_section(struct mem_section *section) +static inline int online_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } #ifdef CONFIG_ZONE_DEVICE -static inline int online_device_section(struct mem_section *section) +static inline int online_device_section(const struct mem_section *section) { unsigned long flags = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE; return section && ((section->section_mem_map & flags) == flags); } #else -static inline int online_device_section(struct mem_section *section) +static inline int online_device_section(const struct mem_section *section) { return 0; } #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT -static inline int preinited_vmemmap_section(struct mem_section *section) +static inline int preinited_vmemmap_section(const struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT)); @@ -2076,7 +2076,7 @@ void sparse_vmemmap_init_nid_early(int nid); void sparse_vmemmap_init_nid_late(int nid); #else -static inline int 
preinited_vmemmap_section(struct mem_section *section) +static inline int preinited_vmemmap_section(const struct mem_section *section) { return 0; } From b119fb0927738f150cbd179d23d08057dccd75c1 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:13 +0200 Subject: [PATCH 235/372] fs: constify mapping related test functions for improved const-correctness We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-5-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Jan Kara Reviewed-by: Christian Brauner Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index d7ab4f96d705..0783c5d05d3f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -537,7 +537,7 @@ struct address_space { /* * Returns true if any of the pages in the mapping are marked with the tag. */ -static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) +static inline bool mapping_tagged(const struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } @@ -585,7 +585,7 @@ static inline void i_mmap_assert_write_locked(struct address_space *mapping) /* * Might pages of this file be mapped into userspace? */ -static inline int mapping_mapped(struct address_space *mapping) +static inline int mapping_mapped(const struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } @@ -599,7 +599,7 @@ static inline int mapping_mapped(struct address_space *mapping) * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ -static inline int mapping_writably_mapped(struct address_space *mapping) +static inline int mapping_writably_mapped(const struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } From 4680092f8ccb4406e771a6b1a2c0243ebd40bab7 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:14 +0200 Subject: [PATCH 236/372] mm: constify process_shares_mm() for improved const-correctness This function only reads from the pointer arguments. Local (loop) variables are also annotated with `const` to clarify that these will not be written to. 
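The "local (loop) variables" remark, shown as a condensed rendering of the function from the diff below with explanatory comments added (the unchanged tail is reconstructed); const here constrains writes through the pointers, not reads of the fields they reference:

	bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm)
	{
		const struct task_struct *t;

		for_each_thread(p, t) {
			/* Reading t->mm is fine through a pointee-const
			 * pointer; only writes through it are rejected. */
			const struct mm_struct *t_mm = READ_ONCE(t->mm);

			if (t_mm)
				return t_mm == mm;
		}
		return false;
	}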
Link: https://lkml.kernel.org/r/20250901205021.3573313-6-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/oom_kill.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 45a47b555499..b3b63058e1a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3872,7 +3872,7 @@ static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) } #endif /* __HAVE_ARCH_GATE_AREA */ -extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); +bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm); void drop_slab(void); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 17650f0b516e..58bd4cf71d52 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -490,12 +490,12 @@ static bool oom_killer_disabled __read_mostly; * task's threads: if one of those is using this mm then this task was also * using it. */ -bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +bool process_shares_mm(const struct task_struct *p, const struct mm_struct *mm) { - struct task_struct *t; + const struct task_struct *t; for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); + const struct mm_struct *t_mm = READ_ONCE(t->mm); if (t_mm) return t_mm == mm; } From 0bf25cfc9e795ab302ee23550fdeebd2aeedf800 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:15 +0200 Subject: [PATCH 237/372] mm, s390: constify mapping related test/getter functions For improved const-correctness. We select certain test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. (Even though seemingly unrelated, this also constifies the pointer parameter of mmap_is_legacy() in arch/s390/mm/mmap.c because a copy of the function exists in mm/util.c.) Link: https://lkml.kernel.org/r/20250901205021.3573313-7-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/s390/mm/mmap.c | 2 +- include/linux/mm.h | 6 +++--- include/linux/pagemap.h | 2 +- mm/util.c | 10 +++++----- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 547104ccc22a..e188cb6d4946 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -27,7 +27,7 @@ static unsigned long stack_maxrandom_size(void) return STACK_RND_MASK << PAGE_SHIFT; } -static inline int mmap_is_legacy(struct rlimit *rlim_stack) +static inline int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; diff --git a/include/linux/mm.h b/include/linux/mm.h index b3b63058e1a3..221e98bb7689 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1002,7 +1002,7 @@ static inline bool vma_is_shmem(const struct vm_area_struct *vma) { return false static inline bool vma_is_anon_shmem(const struct vm_area_struct *vma) { return false; } #endif -int vma_is_stack_for_current(struct vm_area_struct *vma); +int vma_is_stack_for_current(const struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } @@ -2617,7 +2617,7 @@ void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, - struct task_struct *task, bool bypass_rlim); + const struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr, int *locked); @@ -3380,7 +3380,7 @@ void anon_vma_interval_tree_verify(struct anon_vma_chain *node); avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ -extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); +extern int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0d66a252b06f..aec4a11565bc 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -545,7 +545,7 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -struct address_space *folio_mapping(struct folio *); +struct address_space *folio_mapping(const struct folio *folio); /** * folio_flush_mapping - Find the file mapping this folio belongs to. 
diff --git a/mm/util.c b/mm/util.c index e29d3310e26b..391f6e7daf83 100644 --- a/mm/util.c +++ b/mm/util.c @@ -315,7 +315,7 @@ void *memdup_user_nul(const void __user *src, size_t len) EXPORT_SYMBOL(memdup_user_nul); /* Check if the vma is being used as a stack by this task */ -int vma_is_stack_for_current(struct vm_area_struct *vma) +int vma_is_stack_for_current(const struct vm_area_struct *vma) { struct task_struct * __maybe_unused t = current; @@ -410,7 +410,7 @@ unsigned long arch_mmap_rnd(void) return rnd << PAGE_SHIFT; } -static int mmap_is_legacy(struct rlimit *rlim_stack) +static int mmap_is_legacy(const struct rlimit *rlim_stack) { if (current->personality & ADDR_COMPAT_LAYOUT) return 1; @@ -504,7 +504,7 @@ EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout); * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. */ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, - struct task_struct *task, bool bypass_rlim) + const struct task_struct *task, bool bypass_rlim) { unsigned long locked_vm, limit; int ret = 0; @@ -688,7 +688,7 @@ struct anon_vma *folio_anon_vma(const struct folio *folio) * You can call this for folios which aren't in the swap cache or page * cache and it will return NULL. */ -struct address_space *folio_mapping(struct folio *folio) +struct address_space *folio_mapping(const struct folio *folio) { struct address_space *mapping; @@ -926,7 +926,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); * Note this is a helper function intended to be used by LSMs which * wish to use this logic. */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +int __vm_enough_memory(const struct mm_struct *mm, long pages, int cap_sys_admin) { long allowed; unsigned long bytes_failed; From e7f778767d2e54d9019dd142708547b4e457dab9 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:16 +0200 Subject: [PATCH 238/372] parisc: constify mmap_upper_limit() parameter For improved const-correctness. This piece is necessary to make the `rlim_stack` parameter to mmap_base() const. Link: https://lkml.kernel.org/r/20250901205021.3573313-8-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
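The ordering constraint behind this patch, sketched with a simplified body (the gap clamping is elided, and the caller body is not the verbatim kernel code): a helper that receives the rlim_stack pointer has to accept const before an mmap_base() implementation forwarding that pointer can itself be constified in the next patch:

	/* Callee first: after this patch it promises not to
	 * modify *rlim_stack. */
	unsigned long mmap_upper_limit(const struct rlimit *rlim_stack);

	/* Caller second: it only reads through the pointer, but it
	 * can only take const once every helper it hands the pointer
	 * to does. Simplified sketch of the generic layout code. */
	static unsigned long mmap_base(unsigned long rnd,
				       const struct rlimit *rlim_stack)
	{
		unsigned long gap = rlim_stack->rlim_cur;

		/* ... clamp gap between MIN_GAP and MAX_GAP (elided) ... */
		return PAGE_ALIGN(STACK_TOP - gap - rnd);
	}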
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/parisc/include/asm/processor.h | 2 +- arch/parisc/kernel/sys_parisc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 4c14bde39aac..dd0b5e199559 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -48,7 +48,7 @@ #ifndef __ASSEMBLER__ struct rlimit; -unsigned long mmap_upper_limit(struct rlimit *rlim_stack); +unsigned long mmap_upper_limit(const struct rlimit *rlim_stack); unsigned long calc_max_stack_size(unsigned long stack_max); /* diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index f852fe274abe..b2cdbb8a12b1 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -77,7 +77,7 @@ unsigned long calc_max_stack_size(unsigned long stack_max) * indicating that "current" should be used instead of a passed-in * value from the exec bprm as done with arch_pick_mmap_layout(). */ -unsigned long mmap_upper_limit(struct rlimit *rlim_stack) +unsigned long mmap_upper_limit(const struct rlimit *rlim_stack) { unsigned long stack_base; From a955cca37288fe37cc1cde8d291e02717c8a7409 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:17 +0200 Subject: [PATCH 239/372] mm: constify arch_pick_mmap_layout() for improved const-correctness This function only reads from the rlimit pointer (but writes to the mm_struct pointer which is kept without `const`). All callees are already const-ified or (internal functions) are being constified by this patch. Link: https://lkml.kernel.org/r/20250901205021.3573313-9-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/s390/mm/mmap.c | 4 ++-- arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/x86/mm/mmap.c | 6 +++--- include/linux/sched/mm.h | 4 ++-- mm/util.c | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index e188cb6d4946..197c1d9497a7 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -47,7 +47,7 @@ static unsigned long mmap_base_legacy(unsigned long rnd) } static inline unsigned long mmap_base(unsigned long rnd, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size() + stack_guard_gap; @@ -169,7 +169,7 @@ check_asce_limit: * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 785e9909340f..55faf2effa46 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -294,7 +294,7 @@ static unsigned long mmap_rnd(void) return rnd << PAGE_SHIFT; } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = mmap_rnd(); unsigned long gap; diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 708f85dc9380..82f3a987f7cf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -80,7 +80,7 @@ unsigned long arch_mmap_rnd(void) } static unsigned long mmap_base(unsigned long rnd, unsigned long task_size, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { unsigned long gap = rlim_stack->rlim_cur; unsigned long pad = stack_maxrandom_size(task_size) + stack_guard_gap; @@ -110,7 +110,7 @@ static unsigned long mmap_legacy_base(unsigned long rnd, */ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, unsigned long random_factor, unsigned long task_size, - struct rlimit *rlim_stack) + const struct rlimit *rlim_stack) { *legacy_base = mmap_legacy_base(random_factor, task_size); if (mmap_is_legacy()) @@ -119,7 +119,7 @@ static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, *base = mmap_base(random_factor, task_size, rlim_stack); } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { if (mmap_is_legacy()) mm_flags_clear(MMF_TOPDOWN, mm); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 2201da0afecc..0232d983b715 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -178,7 +178,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) #endif extern void arch_pick_mmap_layout(struct mm_struct *mm, - struct rlimit *rlim_stack); + const struct rlimit *rlim_stack); unsigned long 
arch_get_unmapped_area(struct file *filp, unsigned long addr, @@ -211,7 +211,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long flags, vm_flags_t vm_flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, - struct rlimit *rlim_stack) {} + const struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) diff --git a/mm/util.c b/mm/util.c index 391f6e7daf83..732a2dfcaec7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -431,7 +431,7 @@ static int mmap_is_legacy(const struct rlimit *rlim_stack) #define MIN_GAP (SZ_128M) #define MAX_GAP (STACK_TOP / 6 * 5) -static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) +static unsigned long mmap_base(const unsigned long rnd, const struct rlimit *rlim_stack) { #ifdef CONFIG_STACK_GROWSUP /* @@ -462,7 +462,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) #endif } -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { unsigned long random_factor = 0UL; @@ -478,7 +478,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) } } #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) +void arch_pick_mmap_layout(struct mm_struct *mm, const struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm_flags_clear(MMF_TOPDOWN, mm); From 89bf840b84bb53393436426cd4acd80604bd26fd Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:18 +0200 Subject: [PATCH 240/372] mm: constify ptdesc_pmd_pts_count() and folio_get_private() These functions from mm_types.h are trivial getters that should never write to the given pointers. Link: https://lkml.kernel.org/r/20250901205021.3573313-10-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d934a3a5b443..275e8060d918 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -632,7 +632,7 @@ static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc) atomic_dec(&ptdesc->pt_share_count); } -static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc) +static inline int ptdesc_pmd_pts_count(const struct ptdesc *ptdesc) { return atomic_read(&ptdesc->pt_share_count); } @@ -660,7 +660,7 @@ static inline void set_page_private(struct page *page, unsigned long private) page->private = private; } -static inline void *folio_get_private(struct folio *folio) +static inline void *folio_get_private(const struct folio *folio) { return folio->private; } From f346a9473a2fbbab785d1733d475160f1fc54e5a Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:19 +0200 Subject: [PATCH 241/372] mm: constify various inline functions for improved const-correctness We select certain test functions plus folio_migrate_refs() from mm_inline.h which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. One exception is the function folio_migrate_refs() which does write to the "new" folio pointer; there, only the "old" folio pointer is being constified; only its "flags" field is read, but nothing written. Link: https://lkml.kernel.org/r/20250901205021.3573313-11-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 150302b4a905..d6c1011b38f2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -25,7 +25,7 @@ * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. 
*/ -static inline int folio_is_file_lru(struct folio *folio) +static inline int folio_is_file_lru(const struct folio *folio) { return !folio_test_swapbacked(folio); } @@ -84,7 +84,7 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio) * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ -static __always_inline enum lru_list folio_lru_list(struct folio *folio) +static __always_inline enum lru_list folio_lru_list(const struct folio *folio) { enum lru_list lru; @@ -141,7 +141,7 @@ static inline int lru_tier_from_refs(int refs, bool workingset) return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } -static inline int folio_lru_refs(struct folio *folio) +static inline int folio_lru_refs(const struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags.f); @@ -154,14 +154,14 @@ static inline int folio_lru_refs(struct folio *folio) return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } -static inline int folio_lru_gen(struct folio *folio) +static inline int folio_lru_gen(const struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags.f); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } -static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) +static inline bool lru_gen_is_active(const struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; @@ -217,12 +217,13 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } -static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, +static inline unsigned long lru_gen_folio_seq(const struct lruvec *lruvec, + const struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); - struct lru_gen_folio *lrugen = &lruvec->lrugen; + const struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ @@ -302,7 +303,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return true; } -static inline void folio_migrate_refs(struct folio *new, struct folio *old) +static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { unsigned long refs = READ_ONCE(old->flags.f) & LRU_REFS_MASK; @@ -330,7 +331,7 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return false; } -static inline void folio_migrate_refs(struct folio *new, struct folio *old) +static inline void folio_migrate_refs(struct folio *new, const struct folio *old) { } @@ -508,7 +509,7 @@ static inline void dec_tlb_flush_pending(struct mm_struct *mm) atomic_dec(&mm->tlb_flush_pending); } -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +static inline bool mm_tlb_flush_pending(const struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that @@ -521,7 +522,7 @@ static inline bool mm_tlb_flush_pending(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending); } -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +static inline bool mm_tlb_flush_nested(const struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL @@ -605,7 +606,7 @@ pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, return false; } -static inline bool vma_has_recency(struct vm_area_struct *vma) +static inline bool vma_has_recency(const struct vm_area_struct *vma) 
{ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; From da0045587d59d4ffd7710fa45cea51e5a48453a4 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:20 +0200 Subject: [PATCH 242/372] mm: constify assert/test functions in mm.h For improved const-correctness. We select certain assert and test functions which either invoke each other, functions that are already const-ified, or no further functions. It is therefore relatively trivial to const-ify them, which provides a basis for further const-ification further up the call stack. Link: https://lkml.kernel.org/r/20250901205021.3573313-12-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- include/linux/mm.h | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 221e98bb7689..a6bfa46937a8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -719,7 +719,7 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } -static inline void assert_fault_locked(struct vm_fault *vmf) +static inline void assert_fault_locked(const struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); @@ -732,7 +732,7 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } -static inline void assert_fault_locked(struct vm_fault *vmf) +static inline void assert_fault_locked(const struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } @@ -875,7 +875,7 @@ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) vma->vm_end >= vma->vm_mm->start_stack; } -static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) +static inline bool vma_is_temporary_stack(const struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); @@ -889,7 +889,7 @@ static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) return false; } -static inline bool vma_is_foreign(struct vm_area_struct *vma) +static inline bool vma_is_foreign(const struct vm_area_struct *vma) { if (!current->mm) return true; @@ -900,7 +900,7 @@ static inline bool vma_is_foreign(struct vm_area_struct *vma) return false; } -static inline bool vma_is_accessible(struct vm_area_struct *vma) +static inline bool vma_is_accessible(const struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } @@ -911,7 +911,7 @@ static inline bool is_shared_maywrite(vm_flags_t vm_flags) (VM_SHARED | VM_MAYWRITE); } -static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +static inline bool 
vma_is_shared_maywrite(const struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } @@ -1855,7 +1855,7 @@ static inline struct folio *pfn_folio(unsigned long pfn) } #ifdef CONFIG_MMU -static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) +static inline pte_t mk_pte(const struct page *page, pgprot_t pgprot) { return pfn_pte(page_to_pfn(page), pgprot); } @@ -1870,7 +1870,7 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) +static inline pte_t folio_mk_pte(const struct folio *folio, pgprot_t pgprot) { return pfn_pte(folio_pfn(folio), pgprot); } @@ -1886,7 +1886,7 @@ static inline pte_t folio_mk_pte(struct folio *folio, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) +static inline pmd_t folio_mk_pmd(const struct folio *folio, pgprot_t pgprot) { return pmd_mkhuge(pfn_pmd(folio_pfn(folio), pgprot)); } @@ -1902,7 +1902,7 @@ static inline pmd_t folio_mk_pmd(struct folio *folio, pgprot_t pgprot) * * Return: A page table entry suitable for mapping this folio. */ -static inline pud_t folio_mk_pud(struct folio *folio, pgprot_t pgprot) +static inline pud_t folio_mk_pud(const struct folio *folio, pgprot_t pgprot) { return pud_mkhuge(pfn_pud(folio_pfn(folio), pgprot)); } @@ -3520,7 +3520,7 @@ struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) return mtree_load(&mm->mm_mt, addr); } -static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) +static inline unsigned long stack_guard_start_gap(const struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; @@ -3532,7 +3532,7 @@ static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) return 0; } -static inline unsigned long vm_start_gap(struct vm_area_struct *vma) +static inline unsigned long vm_start_gap(const struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; @@ -3543,7 +3543,7 @@ static inline unsigned long vm_start_gap(struct vm_area_struct *vma) return vm_start; } -static inline unsigned long vm_end_gap(struct vm_area_struct *vma) +static inline unsigned long vm_end_gap(const struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; @@ -3555,7 +3555,7 @@ static inline unsigned long vm_end_gap(struct vm_area_struct *vma) return vm_end; } -static inline unsigned long vma_pages(struct vm_area_struct *vma) +static inline unsigned long vma_pages(const struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } @@ -3572,7 +3572,7 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, return vma; } -static inline bool range_in_vma(struct vm_area_struct *vma, +static inline bool range_in_vma(const struct vm_area_struct *vma, unsigned long start, unsigned long end) { return (vma && vma->vm_start <= start && end <= vma->vm_end); @@ -3688,7 +3688,7 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. 
*/ -static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, +static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma, unsigned int flags) { /* @@ -3818,7 +3818,7 @@ static inline bool debug_guardpage_enabled(void) return static_branch_unlikely(&_debug_guardpage_enabled); } -static inline bool page_is_guard(struct page *page) +static inline bool page_is_guard(const struct page *page) { if (!debug_guardpage_enabled()) return false; @@ -3849,7 +3849,7 @@ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } -static inline bool page_is_guard(struct page *page) { return false; } +static inline bool page_is_guard(const struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, @@ -3931,7 +3931,7 @@ void vmemmap_free(unsigned long start, unsigned long end, #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ if (altmap) @@ -3945,7 +3945,7 @@ static inline void vmem_altmap_free(struct vmem_altmap *altmap, altmap->alloc -= nr_pfns; } #else -static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +static inline unsigned long vmem_altmap_offset(const struct vmem_altmap *altmap) { return 0; } From a847b17009ec271514b269c90320a3893cd9b667 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Mon, 1 Sep 2025 22:50:21 +0200 Subject: [PATCH 243/372] mm: constify highmem related functions for improved const-correctness Lots of functions in mm/highmem.c do not write to the given pointers and do not call functions that take non-const pointers and can therefore be constified. This includes functions like kunmap() which might be implemented in a way that writes to the pointer (e.g. to update reference counters or mapping fields), but currently are not. kmap() on the other hand cannot be made const because it calls set_page_address() which is non-const in some architectures/configurations. [akpm@linux-foundation.org: "fix" folio_page() build failure] Link: https://lkml.kernel.org/r/20250901205021.3573313-13-max.kellermann@ionos.com Signed-off-by: Max Kellermann Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (Microsoft) Acked-by: Shakeel Butt Cc: Alexander Gordeev Cc: Al Viro Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Axel Rasmussen Cc: Baolin Wang Cc: Borislav Betkov Cc: Christian Borntraeger Cc: Christian Brauner Cc: Christian Zankel Cc: David Rientjes Cc: David S. Miller Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "H. 
Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: James Bottomley Cc: Jan Kara Cc: Jocelyn Falempe Cc: Liam Howlett Cc: Mark Brown Cc: Matthew Wilcox (Oracle) Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Hocko Cc: "Nysal Jan K.A" Cc: Oscar Salvador Cc: Peter Zijlstra Cc: Russel King Cc: Suren Baghdasaryan Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Thomas Huth Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Wei Xu Cc: Yuanchu Xie Signed-off-by: Andrew Morton --- arch/arm/include/asm/highmem.h | 6 +++--- arch/xtensa/include/asm/highmem.h | 2 +- include/linux/highmem-internal.h | 36 +++++++++++++++---------------- include/linux/highmem.h | 8 +++---- include/linux/page-flags.h | 4 ++-- mm/highmem.c | 10 ++++----- 6 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index b4b66220952d..bdb209e002a4 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -46,9 +46,9 @@ extern pte_t *pkmap_page_table; #endif #ifdef ARCH_NEEDS_KMAP_HIGH_GET -extern void *kmap_high_get(struct page *page); +extern void *kmap_high_get(const struct page *page); -static inline void *arch_kmap_local_high_get(struct page *page) +static inline void *arch_kmap_local_high_get(const struct page *page) { if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !cache_is_vivt()) return NULL; @@ -57,7 +57,7 @@ static inline void *arch_kmap_local_high_get(struct page *page) #define arch_kmap_local_high_get arch_kmap_local_high_get #else /* ARCH_NEEDS_KMAP_HIGH_GET */ -static inline void *kmap_high_get(struct page *page) +static inline void *kmap_high_get(const struct page *page) { return NULL; } diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 34b8b620e7f1..b55235f4adac 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -29,7 +29,7 @@ #if DCACHE_WAY_SIZE > PAGE_SIZE #define get_pkmap_color get_pkmap_color -static inline int get_pkmap_color(struct page *page) +static inline int get_pkmap_color(const struct page *page) { return DCACHE_ALIAS(page_to_phys(page)); } diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 36053c3d6d64..0574c21ca45d 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -7,7 +7,7 @@ */ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +void *__kmap_local_page_prot(const struct page *page, pgprot_t prot); void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); @@ -33,7 +33,7 @@ static inline void kmap_flush_tlb(unsigned long addr) { } #endif void *kmap_high(struct page *page); -void kunmap_high(struct page *page); +void kunmap_high(const struct page *page); void __kmap_flush_unused(void); struct page *__kmap_to_page(void *addr); @@ -50,7 +50,7 @@ static inline void *kmap(struct page *page) return addr; } -static inline void kunmap(struct page *page) +static inline void kunmap(const struct page *page) { might_sleep(); if (!PageHighMem(page)) @@ -68,12 +68,12 @@ static inline void kmap_flush_unused(void) __kmap_flush_unused(); } -static inline void *kmap_local_page(struct page *page) +static inline void *kmap_local_page(const struct page *page) { return __kmap_local_page_prot(page, kmap_prot); } -static inline void *kmap_local_page_try_from_panic(struct page *page) +static inline void 
*kmap_local_page_try_from_panic(const struct page *page) { if (!PageHighMem(page)) return page_address(page); @@ -81,13 +81,13 @@ static inline void *kmap_local_page_try_from_panic(struct page *page) return NULL; } -static inline void *kmap_local_folio(struct folio *folio, size_t offset) +static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { - struct page *page = folio_page(folio, offset / PAGE_SIZE); + const struct page *page = folio_page(folio, offset / PAGE_SIZE); return __kmap_local_page_prot(page, kmap_prot) + offset % PAGE_SIZE; } -static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return __kmap_local_page_prot(page, prot); } @@ -102,7 +102,7 @@ static inline void __kunmap_local(const void *vaddr) kunmap_local_indexed(vaddr); } -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); @@ -113,7 +113,7 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return __kmap_local_page_prot(page, prot); } -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic(const struct page *page) { return kmap_atomic_prot(page, kmap_prot); } @@ -173,32 +173,32 @@ static inline void *kmap(struct page *page) return page_address(page); } -static inline void kunmap_high(struct page *page) { } +static inline void kunmap_high(const struct page *page) { } static inline void kmap_flush_unused(void) { } -static inline void kunmap(struct page *page) +static inline void kunmap(const struct page *page) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(page_address(page)); #endif } -static inline void *kmap_local_page(struct page *page) +static inline void *kmap_local_page(const struct page *page) { return page_address(page); } -static inline void *kmap_local_page_try_from_panic(struct page *page) +static inline void *kmap_local_page_try_from_panic(const struct page *page) { return page_address(page); } -static inline void *kmap_local_folio(struct folio *folio, size_t offset) +static inline void *kmap_local_folio(const struct folio *folio, size_t offset) { return folio_address(folio) + offset; } -static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +static inline void *kmap_local_page_prot(const struct page *page, pgprot_t prot) { return kmap_local_page(page); } @@ -215,7 +215,7 @@ static inline void __kunmap_local(const void *addr) #endif } -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic(const struct page *page) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) migrate_disable(); @@ -225,7 +225,7 @@ static inline void *kmap_atomic(struct page *page) return page_address(page); } -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +static inline void *kmap_atomic_prot(const struct page *page, pgprot_t prot) { return kmap_atomic(page); } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6234f316468c..105cc4c00cc3 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -43,7 +43,7 @@ static inline void *kmap(struct page *page); * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. 
*/ -static inline void kunmap(struct page *page); +static inline void kunmap(const struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address @@ -93,7 +93,7 @@ static inline void kmap_flush_unused(void); * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ -static inline void *kmap_local_page(struct page *page); +static inline void *kmap_local_page(const struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage @@ -129,7 +129,7 @@ static inline void *kmap_local_page(struct page *page); * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ -static inline void *kmap_local_folio(struct folio *folio, size_t offset); +static inline void *kmap_local_folio(const struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! @@ -176,7 +176,7 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset); * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ -static inline void *kmap_atomic(struct page *page); +static inline void *kmap_atomic(const struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a88b61eec3f8..568011930e35 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,9 +316,9 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -static inline struct page *folio_page(struct folio *folio, unsigned long n) +static inline struct page *folio_page(const struct folio *folio, unsigned long n) { - return &folio->page + n; + return (struct page *)(&folio->page + n); } static __always_inline int PageTail(const struct page *page) diff --git a/mm/highmem.c b/mm/highmem.c index ef3189b36cad..b5c8e4c2d5d4 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -61,7 +61,7 @@ static inline int kmap_local_calc_idx(int idx) /* * Determine color of virtual address where the page should be mapped. */ -static inline unsigned int get_pkmap_color(struct page *page) +static inline unsigned int get_pkmap_color(const struct page *page) { return 0; } @@ -334,7 +334,7 @@ EXPORT_SYMBOL(kmap_high); * * This can be called from any context. */ -void *kmap_high_get(struct page *page) +void *kmap_high_get(const struct page *page) { unsigned long vaddr, flags; @@ -356,7 +356,7 @@ void *kmap_high_get(struct page *page) * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called * only from user context. 
*/ -void kunmap_high(struct page *page) +void kunmap_high(const struct page *page) { unsigned long vaddr; unsigned long nr; @@ -508,7 +508,7 @@ static inline void kmap_local_idx_pop(void) #endif #ifndef arch_kmap_local_high_get -static inline void *arch_kmap_local_high_get(struct page *page) +static inline void *arch_kmap_local_high_get(const struct page *page) { return NULL; } @@ -572,7 +572,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) } EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); -void *__kmap_local_page_prot(struct page *page, pgprot_t prot) +void *__kmap_local_page_prot(const struct page *page, pgprot_t prot) { void *kmap; From 9fd53c8122271d9fe8b687f50a9bdf5588d41d0b Mon Sep 17 00:00:00 2001 From: Youling Tang Date: Fri, 11 Jul 2025 13:55:09 +0800 Subject: [PATCH 244/372] mm/filemap: align last_index to folio size On XFS systems with pagesize=4K, blocksize=16K, and CONFIG_TRANSPARENT_HUGEPAGE enabled, We observed the following readahead behaviors: # echo 3 > /proc/sys/vm/drop_caches # dd if=test of=/dev/null bs=64k count=1 # ./tools/mm/page-types -r -L -f /mnt/xfs/test foffset offset flags 0 136d4c __RU_l_________H______t_________________F_1 1 136d4d __RU_l__________T_____t_________________F_1 2 136d4e __RU_l__________T_____t_________________F_1 3 136d4f __RU_l__________T_____t_________________F_1 ... c 136bb8 __RU_l_________H______t_________________F_1 d 136bb9 __RU_l__________T_____t_________________F_1 e 136bba __RU_l__________T_____t_________________F_1 f 136bbb __RU_l__________T_____t_________________F_1 <-- first read 10 13c2cc ___U_l_________H______t______________I__F_1 <-- readahead flag 11 13c2cd ___U_l__________T_____t______________I__F_1 12 13c2ce ___U_l__________T_____t______________I__F_1 13 13c2cf ___U_l__________T_____t______________I__F_1 ... 1c 1405d4 ___U_l_________H______t_________________F_1 1d 1405d5 ___U_l__________T_____t_________________F_1 1e 1405d6 ___U_l__________T_____t_________________F_1 1f 1405d7 ___U_l__________T_____t_________________F_1 [ra_size = 32, req_count = 16, async_size = 16] # echo 3 > /proc/sys/vm/drop_caches # dd if=test of=/dev/null bs=60k count=1 # ./page-types -r -L -f /mnt/xfs/test foffset offset flags 0 136048 __RU_l_________H______t_________________F_1 ... c 110a40 __RU_l_________H______t_________________F_1 d 110a41 __RU_l__________T_____t_________________F_1 e 110a42 __RU_l__________T_____t_________________F_1 <-- first read f 110a43 __RU_l__________T_____t_________________F_1 <-- first readahead flag 10 13e7a8 ___U_l_________H______t_________________F_1 ... 20 137a00 ___U_l_________H______t_______P______I__F_1 <-- second readahead flag (20 - 2f) 21 137a01 ___U_l__________T_____t_______P______I__F_1 ... 3f 10d4af ___U_l__________T_____t_______P_________F_1 [first readahead: ra_size = 32, req_count = 15, async_size = 17] When reading 64k data (same for 61-63k range, where last_index is page-aligned in filemap_get_pages()), 128k readahead is triggered via page_cache_sync_ra() and the PG_readahead flag is set on the next folio (the one containing 0x10 page). When reading 60k data, 128k readahead is also triggered via page_cache_sync_ra(). However, in this case the readahead flag is set on the 0xf page. Although the requested read size (req_count) is 60k, the actual read will be aligned to folio size (64k), which triggers the readahead flag and initiates asynchronous readahead via page_cache_async_ra(). This results in two readahead operations totaling 256k. 
The root cause is that when the requested size is smaller than the actual read size (due to folio alignment), it triggers asynchronous readahead. By changing last_index alignment from page size to folio size, we ensure the requested size matches the actual read size, preventing the case where a single read operation triggers two readahead operations. After applying the patch: # echo 3 > /proc/sys/vm/drop_caches # dd if=test of=/dev/null bs=60k count=1 # ./page-types -r -L -f /mnt/xfs/test foffset offset flags 0 136d4c __RU_l_________H______t_________________F_1 1 136d4d __RU_l__________T_____t_________________F_1 2 136d4e __RU_l__________T_____t_________________F_1 3 136d4f __RU_l__________T_____t_________________F_1 ... c 136bb8 __RU_l_________H______t_________________F_1 d 136bb9 __RU_l__________T_____t_________________F_1 e 136bba __RU_l__________T_____t_________________F_1 <-- first read f 136bbb __RU_l__________T_____t_________________F_1 10 13c2cc ___U_l_________H______t______________I__F_1 <-- readahead flag 11 13c2cd ___U_l__________T_____t______________I__F_1 12 13c2ce ___U_l__________T_____t______________I__F_1 13 13c2cf ___U_l__________T_____t______________I__F_1 ... 1c 1405d4 ___U_l_________H______t_________________F_1 1d 1405d5 ___U_l__________T_____t_________________F_1 1e 1405d6 ___U_l__________T_____t_________________F_1 1f 1405d7 ___U_l__________T_____t_________________F_1 [ra_size = 32, req_count = 16, async_size = 16] The same phenomenon will occur when reading from 49k to 64k. Set the readahead flag to the next folio. Because the minimum order of folio in address_space equals the block size (at least in xfs and bcachefs that already support bs > ps), having request_count aligned to block size will not cause overread. [klarasmodin@gmail.com: fix overflow on 32-bit] Link: https://lkml.kernel.org/r/yru7qf5gvyzccq5ohhpylvxug5lr5tf54omspbjh4sm6pcdb2r@fpjgj2pxw7va [akpm@linux-foundation.org: update it for Max's constification efforts] Link: https://lkml.kernel.org/r/20250711055509.91587-1-youling.tang@linux.dev Co-developed-by: Chi Zhiling Signed-off-by: Chi Zhiling Signed-off-by: Youling Tang Signed-off-by: Klara Modin Reviewed-by: Ryan Roberts Reviewed-by: Jan Kara Cc: Matthew Wilcox (Oracle) Cc: Youling Tang Cc: David Hildenbrand Cc: Klara Modin Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 6 ++++++ mm/filemap.c | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index aec4a11565bc..185644e288ea 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -482,6 +482,12 @@ mapping_min_folio_nrpages(const struct address_space *mapping) return 1UL << mapping_min_folio_order(mapping); } +static inline unsigned long +mapping_min_folio_nrbytes(const struct address_space *mapping) +{ + return mapping_min_folio_nrpages(mapping) << PAGE_SHIFT; +} + /** * mapping_align_index() - Align index for this mapping. * @mapping: The address_space. 
diff --git a/mm/filemap.c b/mm/filemap.c index cd9387b0a5b5..344ab106c21c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2600,8 +2600,9 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, unsigned int flags; int err = 0; - /* "last_index" is the index of the page beyond the end of the read */ - last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE); + /* "last_index" is the index of the folio beyond the end of the read */ + last_index = round_up(iocb->ki_pos + count, + mapping_min_folio_nrbytes(mapping)) >> PAGE_SHIFT; retry: if (fatal_signal_pending(current)) return -EINTR;
From f6d8c7102049f76027bf584eef3300d4bddc6462 Mon Sep 17 00:00:00 2001 From: Chi Zhiling Date: Fri, 29 Aug 2025 10:36:58 +0800 Subject: [PATCH 245/372] mpage: terminate read-ahead on read error
For exFAT filesystems with 4MB read_ahead_size, removing the storage device during read operations can delay EIO error reporting by several minutes. This occurs because the read-ahead implementation in mpage doesn't handle errors. Another reason for the delay is that the filesystem requires metadata to issue a file read request. When the storage device is removed, the metadata buffers are invalidated, causing mpage to repeatedly attempt to fetch metadata during each get_block call. The original purpose of this patch was to terminate read-ahead when we fail to get metadata; to make the patch more generic, implement it by checking the folio status instead of checking the return value of get_block(). So, if a folio is synchronously unlocked and non-uptodate, should we quit the read-ahead? I think it depends on whether the error is permanent or temporary, and whether further read-ahead might succeed. A device being unplugged is one reason for returning such a folio, but we could return it for many other reasons (e.g., metadata errors). I think most errors won't be resolved in a short time, so we should quit read-ahead when they occur.
Link: https://lkml.kernel.org/r/20250829023659.688649-1-chizhiling@163.com Signed-off-by: Chi Zhiling Reviewed-by: Jan Kara Cc: Al Viro Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Cc: Namjae Jeon Cc: Sungjong Seo Cc: Yuezhang Mo Signed-off-by: Andrew Morton --- fs/mpage.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/mpage.c b/fs/mpage.c index c5fd821fd30e..e4c11831f234 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -369,6 +369,12 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) args.folio = folio; args.nr_pages = readahead_count(rac); args.bio = do_mpage_readpage(&args); + /* + * If read-ahead failed synchronously, it may be caused by a removed + * device or some filesystem metadata error. + */ + if (!folio_test_locked(folio) && !folio_test_uptodate(folio)) + break; } if (args.bio) mpage_bio_submit_read(args.bio);
From 8583bb0f9a7ed744a376ad4f1647efc240927a5a Mon Sep 17 00:00:00 2001 From: Chi Zhiling Date: Fri, 29 Aug 2025 10:36:59 +0800 Subject: [PATCH 246/372] mpage: convert do_mpage_readpage() to return void type
The return value of do_mpage_readpage() is args->bio, which is already set in the args structure. Returning it again is redundant. This patch changes the return type to void since the caller doesn't care about the return value.
Link: https://lkml.kernel.org/r/20250829023659.688649-2-chizhiling@163.com Signed-off-by: Chi Zhiling Reviewed-by: Jan Kara Cc: Al Viro Cc: Christian Brauner Cc: Matthew Wilcox (Oracle) Cc: Namjae Jeon Cc: Sungjong Seo Cc: Yuezhang Mo Signed-off-by: Andrew Morton --- fs/mpage.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index e4c11831f234..7dae5afc2b9e 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -148,7 +148,7 @@ struct mpage_readpage_args { * represent the validity of its disk mapping and to decide when to do the next * get_block() call. */ -static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) +static void do_mpage_readpage(struct mpage_readpage_args *args) { struct folio *folio = args->folio; struct inode *inode = folio->mapping->host; @@ -305,7 +305,7 @@ alloc_new: else args->last_block_in_bio = first_block + blocks_per_folio - 1; out: - return args->bio; + return; confused: if (args->bio) @@ -368,7 +368,7 @@ void mpage_readahead(struct readahead_control *rac, get_block_t get_block) prefetchw(&folio->flags); args.folio = folio; args.nr_pages = readahead_count(rac); - args.bio = do_mpage_readpage(&args); + do_mpage_readpage(&args); /* * If read-ahead failed synchronously, it may be caused by a removed * device or some filesystem metadata error. */ @@ -392,7 +392,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block) .get_block = get_block, }; - args.bio = do_mpage_readpage(&args); + do_mpage_readpage(&args); if (args.bio) mpage_bio_submit_read(args.bio); return 0;
From 94326d3130b5e78a35265bbf7822148372b39231 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 3 Sep 2025 20:10:39 +0100 Subject: [PATCH 247/372] mm: remove mlock_count from struct page
All users now use folio->mlock_count, so we can remove this element of struct page. Move the useful comments over to struct folio.
Link: https://lkml.kernel.org/r/20250903191041.1630338-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 275e8060d918..ff2b4e13215f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -94,14 +94,6 @@ struct page { union { struct list_head lru; - /* Or, for the Unevictable "LRU list" slot */ - struct { - /* Always even, to negate PageTail */ - void *__filler; - /* Count page's or folio's mlocks */ - unsigned int mlock_count; - }; - /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; @@ -391,7 +383,9 @@ struct folio { union { struct list_head lru; /* private: avoid cluttering the output */ + /* For the Unevictable "LRU list" slot */ struct { + /* Avoid compound_head */ void *__filler; /* public: */ unsigned int mlock_count;
From 162f6c69ea9c42c5553a1f9408ef4291b5b54c5c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 3 Sep 2025 11:59:15 -0700 Subject: [PATCH 248/372] mm/page_alloc: add kernel-docs for free_pages()
Patch series "Cleanup free_pages() misuse", v3. free_pages() is supposed to be called when we only have a virtual address. __free_pages() is supposed to be called when we have a page. There are a number of callers that use page_address() to get a page's virtual address and then call free_pages() on it, when they should just call __free_pages() directly.
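As a minimal sketch of the pattern being cleaned up (a hypothetical caller, not one of the converted sites):

	struct page *page = alloc_pages(GFP_KERNEL, order);

	/*
	 * Either call below frees the allocation; the series replaces the
	 * former with the latter wherever the page pointer is already at hand.
	 */
	free_pages((unsigned long)page_address(page), order);	/* misuse */
	__free_pages(page, order);				/* preferred */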
Add kernel-docs for free_pages() to help callers better understand which function they should be calling, and replace the obvious cases of misuse. This patch (of 7): Add kernel-docs to free_pages(). This will help callers understand when to use it instead of __free_pages(). Link: https://lkml.kernel.org/r/20250903185921.1785167-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20250903185921.1785167-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Acked-by: SeongJae Park Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Jens Axboe Cc: Justin Sanders Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Vishal Moola (Oracle) Cc: Will Deacon Cc: Dave Hansen Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 54dbb6f0d14e..8de5fb5528eb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5275,6 +5275,15 @@ void free_pages_nolock(struct page *page, unsigned int order) ___free_pages(page, order, FPI_TRYLOCK); } +/** + * free_pages - Free pages allocated with __get_free_pages(). + * @addr: The virtual address tied to a page returned from __get_free_pages(). + * @order: The order of the allocation. + * + * This function behaves the same as __free_pages(). Use this function + * to free pages when you only have a valid virtual address. If you have + * the page, call __free_pages() instead. + */ void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { From 367af0508f86d380a817c569c3036c116bf9381f Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 3 Sep 2025 11:59:16 -0700 Subject: [PATCH 249/372] aoe: stop calling page_address() in free_page() free_page() should be used when we only have a virtual address. We should call __free_page() directly on our page instead. Link: https://lkml.kernel.org/r/20250903185921.1785167-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: Jens Axboe Cc: Justin Sanders Cc: Madhavan Srinivasan Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: SeongJae Park Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/block/aoe/aoecmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 6298f8e271e3..a9affb7c264d 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1761,6 +1761,6 @@ aoecmd_exit(void) kfree(kts); kfree(ktiowq); - free_page((unsigned long) page_address(empty_page)); + __free_page(empty_page); empty_page = NULL; } From b45ef93701142a4a065bf3a5051c902678540ae1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 3 Sep 2025 11:59:17 -0700 Subject: [PATCH 250/372] x86: stop calling page_address() in free_pages() free_pages() should be used when we only have a virtual address. We should call __free_pages() directly on our page instead. 
Link: https://lkml.kernel.org/r/20250903185921.1785167-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Dave Hansen Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Jens Axboe Cc: Justin Sanders Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: SeongJae Park Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/mm/init_64.c | 2 +- arch/x86/platform/efi/memmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b9426fce5f3e..0e4270e20fad 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1031,7 +1031,7 @@ static void __meminit free_pagetable(struct page *page, int order) free_reserved_pages(page, nr_pages); #endif } else { - free_pages((unsigned long)page_address(page), order); + __free_pages(page, order); } } diff --git a/arch/x86/platform/efi/memmap.c b/arch/x86/platform/efi/memmap.c index 061b8ecc71a1..023697c88910 100644 --- a/arch/x86/platform/efi/memmap.c +++ b/arch/x86/platform/efi/memmap.c @@ -42,7 +42,7 @@ void __init __efi_memmap_free(u64 phys, unsigned long size, unsigned long flags) struct page *p = pfn_to_page(PHYS_PFN(phys)); unsigned int order = get_order(size); - free_pages((unsigned long) page_address(p), order); + __free_pages(p, order); } } From 5e8fce2016d1a863913265a9b5fcc2ecf1955067 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 3 Sep 2025 11:59:18 -0700 Subject: [PATCH 251/372] riscv: stop calling page_address() in free_pages() free_pages() should be used when we only have a virtual address. We should call __free_pages() directly on our page instead. Link: https://lkml.kernel.org/r/20250903185921.1785167-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: David Hildenbrand Acked-by: Alexandre Ghiti Acked-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: Jens Axboe Cc: Justin Sanders Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: SeongJae Park Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/mm/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 15683ae13fa5..1056c11d3251 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1624,7 +1624,7 @@ static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) if (PageReserved(page)) free_reserved_page(page); else - free_pages((unsigned long)page_address(page), 0); + __free_pages(page, 0); p4d_clear(p4d); } @@ -1646,7 +1646,7 @@ static void __meminit free_vmemmap_storage(struct page *page, size_t size, return; } - free_pages((unsigned long)page_address(page), order); + __free_pages(page, order); } static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end, From 57fd554c0723d2f9eef64ed8b5b659a6e9208a0b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 3 Sep 2025 11:59:19 -0700 Subject: [PATCH 252/372] powerpc: stop calling page_address() in free_pages() free_pages() should be used when we only have a virtual address. We should call __free_pages() directly on our page instead. 
Link: https://lkml.kernel.org/r/20250903185921.1785167-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Ritesh Harjani (IBM) Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Christophe Leroy Cc: Albert Ou Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: Jens Axboe Cc: Justin Sanders Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: SeongJae Park Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index be523e5fe9c5..73977dbabcf2 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -780,7 +780,7 @@ static void __meminit free_vmemmap_pages(struct page *page, while (nr_pages--) free_reserved_page(page++); } else - free_pages((unsigned long)page_address(page), order); + __free_pages(page, order); } static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, From 77d7dadf89a131a49006650940adf7b001228fbe Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 3 Sep 2025 11:59:20 -0700 Subject: [PATCH 253/372] arm64: stop calling page_address() in free_pages() free_pages() should be used when we only have a virtual address. We should call __free_pages() directly on our page instead. Link: https://lkml.kernel.org/r/20250903185921.1785167-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Catalin Marinas Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Andy Lutomirski Cc: Dave Hansen Cc: Jens Axboe Cc: Justin Sanders Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: "Michael S. Tsirkin" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: SeongJae Park Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 183801520740..980d7745a549 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -836,7 +836,7 @@ static void free_hotplug_page_range(struct page *page, size_t size, vmem_altmap_free(altmap, size >> PAGE_SHIFT); } else { WARN_ON(PageReserved(page)); - free_pages((unsigned long)page_address(page), get_order(size)); + __free_pages(page, get_order(size)); } } From d75d36547d11aed7fc0502e14f243873fb9c5844 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 3 Sep 2025 11:59:21 -0700 Subject: [PATCH 254/372] virtio_balloon: stop calling page_address() in free_pages() free_pages() should be used when we only have a virtual address. We should call __free_pages() directly on our page instead. Link: https://lkml.kernel.org/r/20250903185921.1785167-8-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: David Hildenbrand Acked-by: Michael S. Tsirkin Acked-by: Mike Rapoport (Microsoft) Cc: Albert Ou Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: Jens Axboe Cc: Justin Sanders Cc: Madhavan Srinivasan Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: "Michael S. 
Tsirkin" Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ritesh Harjani (IBM) Cc: SeongJae Park Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/virtio/virtio_balloon.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index eae65136cdfb..7f3fd72678eb 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -488,8 +488,7 @@ static unsigned long return_free_pages_to_mm(struct virtio_balloon *vb, page = balloon_page_pop(&vb->free_page_list); if (!page) break; - free_pages((unsigned long)page_address(page), - VIRTIO_BALLOON_HINT_BLOCK_ORDER); + __free_pages(page, VIRTIO_BALLOON_HINT_BLOCK_ORDER); } vb->num_free_page_blocks -= num_returned; spin_unlock_irq(&vb->free_page_list_lock); @@ -719,8 +718,7 @@ static int get_free_page_and_send(struct virtio_balloon *vb) if (vq->num_free > 1) { err = virtqueue_add_inbuf(vq, &sg, 1, p, GFP_KERNEL); if (unlikely(err)) { - free_pages((unsigned long)p, - VIRTIO_BALLOON_HINT_BLOCK_ORDER); + __free_pages(page, VIRTIO_BALLOON_HINT_BLOCK_ORDER); return err; } virtqueue_kick(vq); @@ -733,7 +731,7 @@ static int get_free_page_and_send(struct virtio_balloon *vb) * The vq has no available entry to add this page block, so * just free it. */ - free_pages((unsigned long)p, VIRTIO_BALLOON_HINT_BLOCK_ORDER); + __free_pages(page, VIRTIO_BALLOON_HINT_BLOCK_ORDER); } return 0; From 9abd8bd4c6b16d152b7149ef2f27a606ae4ebf4e Mon Sep 17 00:00:00 2001 From: Yueyang Pan Date: Wed, 3 Sep 2025 04:16:13 -0700 Subject: [PATCH 255/372] mm/show_mem: dump the status of the mem alloc profiling before printing This patchset fixes two issues we saw in production rollout. The first issue is that we saw all zero output of memory allocation profiling information from show_mem() if CONFIG_MEM_ALLOC_PROFILING is set and sysctl.vm.mem_profiling=0. This cause ambiguity as we don't know what 0B actually means in the output. It can mean either memory allocation profiling is temporary disabled or the allocation at that position is actually 0. Such ambiguity will make further parsing harder as we cannot differentiate between two case. The second issue is that multiple entities can call show_mem() which messed up the allocation info in dmesg. We saw outputs like this: 327 MiB 83635 mm/compaction.c:1880 func:compaction_alloc 48.4 GiB 12684937 mm/memory.c:1061 func:folio_prealloc 7.48 GiB 10899 mm/huge_memory.c:1159 func:vma_alloc_anon_folio_pmd 298 MiB 95216 kernel/fork.c:318 func:alloc_thread_stack_node 250 MiB 63901 mm/zsmalloc.c:987 func:alloc_zspage 1.42 GiB 372527 mm/memory.c:1063 func:folio_prealloc 1.17 GiB 95693 mm/slub.c:2424 func:alloc_slab_page 651 MiB 166732 mm/readahead.c:270 func:page_cache_ra_unbounded 419 MiB 107261 net/core/page_pool.c:572 func:__page_pool_alloc_pages_slow 404 MiB 103425 arch/x86/mm/pgtable.c:25 func:pte_alloc_one The above example is because one kthread invokes show_mem() from __alloc_pages_slowpath while kernel itself calls oom_kill_process() This patch (of 2): This patch prints the status of the memory allocation profiling before __show_mem actually prints the detailed allocation info. This way will let us know the `0B` we saw in allocation info is because the profiling is disabled or the allocation is actually 0B. 
Link: https://lkml.kernel.org/r/cover.1756897825.git.pyyjason@gmail.com Link: https://lkml.kernel.org/r/d7998ea0ddc2ea1a78bb6e89adf530526f76679a.1756897825.git.pyyjason@gmail.com Signed-off-by: Yueyang Pan Acked-by: Usama Arif Acked-by: Vlastimil Babka Acked-by: Zi Yan Acked-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Acked-by: Shakeel Butt Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/show_mem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/show_mem.c b/mm/show_mem.c index c563d9adfa87..22bb8b045671 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -431,7 +431,8 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) nr = alloc_tag_top_users(tags, ARRAY_SIZE(tags), false); if (nr) { - pr_notice("Memory allocations:\n"); + pr_notice("Memory allocations (profiling is currently turned %s):\n", + mem_alloc_profiling_enabled() ? "on" : "off"); for (i = 0; i < nr; i++) { struct codetag *ct = tags[i].ct; struct alloc_tag *tag = ct_to_alloc_tag(ct);
From 8147bc15b409c8ca52e98c6692273dd9bc5c6905 Mon Sep 17 00:00:00 2001 From: Yueyang Pan Date: Wed, 3 Sep 2025 04:16:14 -0700 Subject: [PATCH 256/372] mm/show_mem: add trylock while printing alloc info
In production, show_mem() can be called concurrently from two different entities, for example one from oom_kill_process() and another from __alloc_pages_slowpath() in another kthread. This patch adds a spinlock and invokes trylock before printing out the kernel allocation info in show_mem(). This way, two allocation dumps won't interleave with each other, which makes parsing easier.
Link: https://lkml.kernel.org/r/4ed91296e0c595d945a38458f7a8d9611b0c1e52.1756897825.git.pyyjason@gmail.com Signed-off-by: Yueyang Pan Acked-by: Usama Arif Acked-by: Vlastimil Babka Acked-by: Zi Yan Acked-by: Suren Baghdasaryan Acked-by: Shakeel Butt Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/show_mem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/show_mem.c b/mm/show_mem.c index 22bb8b045671..90a9a37116e7 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -425,7 +425,9 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif #ifdef CONFIG_MEM_ALLOC_PROFILING - { + static DEFINE_SPINLOCK(mem_alloc_profiling_spinlock); + + if (spin_trylock(&mem_alloc_profiling_spinlock)) { struct codetag_bytes tags[10]; size_t i, nr; @@ -452,6 +454,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) ct->lineno, ct->function); } } + spin_unlock(&mem_alloc_profiling_spinlock); } #endif }
From da939ef4c494246bc2102ecb628bbcc71d650410 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 2 Sep 2025 08:35:11 +0000 Subject: [PATCH 257/372] rust: maple_tree: add MapleTree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
Patch series "Add Rust abstraction for Maple Trees", v3. This will be used in the Tyr driver [1] to allocate from the GPU's VA space that is not owned by userspace, but by the kernel, for kernel GPU mappings. Danilo tells me that in nouveau, the maple tree is used for keeping track of "VM regions" on top of GPUVM, and that he will most likely end up doing the same in the Rust Nova driver as well. These abstractions intentionally do not expose any way to make use of external locking.
You are required to use the internal spinlock. For now, we do not support loads that rely only on RCU for protection. This contains some parts taken from Andrew Ballance's RFC [2] from April. However, it has also been reworked significantly compared to that RFC, taking the use-cases in Tyr into account. This patch (of 3): The maple tree will be used in the Tyr driver to allocate and keep track of GPU allocations created internally (i.e. not by userspace). It will likely also be used in the Nova driver eventually. This adds the simplest methods for addition and removal that do not require any special care with respect to concurrency. This implementation is based on the RFC by Andrew but with significant changes to simplify the implementation.
[ojeda@kernel.org: fix intra-doc links] Link: https://lkml.kernel.org/r/20250910140212.997771-1-ojeda@kernel.org Link: https://lkml.kernel.org/r/20250902-maple-tree-v3-0-fb5c8958fb1e@google.com Link: https://lkml.kernel.org/r/20250902-maple-tree-v3-1-fb5c8958fb1e@google.com Link: https://lore.kernel.org/r/20250627-tyr-v1-1-cb5f4c6ced46@collabora.com [1] Link: https://lore.kernel.org/r/20250405060154.1550858-1-andrewjballance@gmail.com [2] Co-developed-by: Andrew Ballance Signed-off-by: Andrew Ballance Signed-off-by: Alice Ryhl Reviewed-by: Danilo Krummrich Cc: Andreas Hindborg Cc: Björn Roy Baron Cc: Boqun Feng Cc: Daniel Almeida Cc: Gary Guo Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Miguel Ojeda Cc: Trevor Gross Signed-off-by: Andrew Morton --- MAINTAINERS | 4 + include/linux/maple_tree.h | 3 + rust/helpers/helpers.c | 1 + rust/helpers/maple_tree.c | 8 + rust/kernel/lib.rs | 1 + rust/kernel/maple_tree.rs | 349 +++++++++++++++++++++++++++++++++++++ 6 files changed, 366 insertions(+) create mode 100644 rust/helpers/maple_tree.c create mode 100644 rust/kernel/maple_tree.rs diff --git a/MAINTAINERS b/MAINTAINERS index a7e123ddf05a..68d29f0220fc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14672,6 +14672,8 @@ F: net/mctp/ MAPLE TREE M: Liam R.
Howlett +R: Alice Ryhl +R: Andrew Ballance L: maple-tree@lists.infradead.org L: linux-mm@kvack.org S: Supported @@ -14680,6 +14682,8 @@ F: include/linux/maple_tree.h F: include/trace/events/maple_tree.h F: lib/maple_tree.c F: lib/test_maple_tree.c +F: rust/helpers/maple_tree.c +F: rust/kernel/maple_tree.rs F: tools/testing/radix-tree/maple.c F: tools/testing/shared/linux/maple_tree.h diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 41e633264e51..05730171d201 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -481,6 +481,9 @@ struct ma_wr_state { #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) +/* + * When changing MA_STATE, remember to also change rust/kernel/maple_tree.rs + */ #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 7cf7fe95e41d..c5d42e0f7ce6 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -26,6 +26,7 @@ #include "io.c" #include "jump_label.c" #include "kunit.c" +#include "maple_tree.c" #include "mm.c" #include "mutex.c" #include "of.c" diff --git a/rust/helpers/maple_tree.c b/rust/helpers/maple_tree.c new file mode 100644 index 000000000000..1dd9ac84a13f --- /dev/null +++ b/rust/helpers/maple_tree.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/maple_tree.h> + +void rust_helper_mt_init_flags(struct maple_tree *mt, unsigned int flags) +{ + mt_init_flags(mt, flags); +} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index ed53169e795c..6b0a5689669f 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -96,6 +96,7 @@ pub mod jump_label; #[cfg(CONFIG_KUNIT)] pub mod kunit; pub mod list; +pub mod maple_tree; pub mod miscdevice; pub mod mm; #[cfg(CONFIG_NET)] diff --git a/rust/kernel/maple_tree.rs b/rust/kernel/maple_tree.rs new file mode 100644 index 000000000000..319772878b89 --- /dev/null +++ b/rust/kernel/maple_tree.rs @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Maple trees. +//! +//! C header: [`include/linux/maple_tree.h`](srctree/include/linux/maple_tree.h) +//! +//! Reference: + +use core::{ + marker::PhantomData, + ops::{Bound, RangeBounds}, + ptr, +}; + +use kernel::{ + alloc::Flags, + error::to_result, + prelude::*, + types::{ForeignOwnable, Opaque}, +}; + +/// A maple tree optimized for storing non-overlapping ranges. +/// +/// # Invariants +/// +/// Each range in the maple tree owns an instance of `T`. +#[pin_data(PinnedDrop)] +#[repr(transparent)] +pub struct MapleTree<T: ForeignOwnable> { + #[pin] + tree: Opaque<bindings::maple_tree>, + _p: PhantomData<T>, +} + +#[inline] +fn to_maple_range(range: impl RangeBounds<usize>) -> Option<(usize, usize)> { + let first = match range.start_bound() { + Bound::Included(start) => *start, + Bound::Excluded(start) => start.checked_add(1)?, + Bound::Unbounded => 0, + }; + + let last = match range.end_bound() { + Bound::Included(end) => *end, + Bound::Excluded(end) => end.checked_sub(1)?, + Bound::Unbounded => usize::MAX, + }; + + if last < first { + return None; + } + + Some((first, last)) +} + +impl<T: ForeignOwnable> MapleTree<T> { + /// Create a new maple tree. + /// + /// The tree will use the regular implementation with a higher branching factor, rather than + /// the allocation tree. + #[inline] + pub fn new() -> impl PinInit<Self> { + pin_init!(MapleTree { + // SAFETY: This initializes a maple tree into a pinned slot. The maple tree will be + // destroyed in Drop before the memory location becomes invalid.
+ tree <- Opaque::ffi_init(|slot| unsafe { bindings::mt_init_flags(slot, 0) }), + _p: PhantomData, + }) + } + + /// Insert the value at the given index. + /// + /// # Errors + /// + /// If the maple tree already contains a range using the given index, then this call will + /// return an [`InsertErrorKind::Occupied`]. It may also fail if memory allocation fails. + /// + /// # Examples + /// + /// ``` + /// use kernel::maple_tree::{InsertErrorKind, MapleTree}; + /// + /// let tree = KBox::pin_init(MapleTree::<KBox<i32>>::new(), GFP_KERNEL)?; + /// + /// let ten = KBox::new(10, GFP_KERNEL)?; + /// let twenty = KBox::new(20, GFP_KERNEL)?; + /// let the_answer = KBox::new(42, GFP_KERNEL)?; + /// + /// // These calls will succeed. + /// tree.insert(100, ten, GFP_KERNEL)?; + /// tree.insert(101, twenty, GFP_KERNEL)?; + /// + /// // This will fail because the index is already in use. + /// assert_eq!( + /// tree.insert(100, the_answer, GFP_KERNEL).unwrap_err().cause, + /// InsertErrorKind::Occupied, + /// ); + /// # Ok::<_, Error>(()) + /// ``` + #[inline] + pub fn insert(&self, index: usize, value: T, gfp: Flags) -> Result<(), InsertError<T>> { + self.insert_range(index..=index, value, gfp) + } + + /// Insert a value to the specified range, failing on overlap. + /// + /// This accepts the usual types of Rust ranges using the `..` and `..=` syntax for exclusive + /// and inclusive ranges respectively. The range must not be empty, and must not overlap with + /// any existing range. + /// + /// # Errors + /// + /// If the maple tree already contains an overlapping range, then this call will return an + /// [`InsertErrorKind::Occupied`]. It may also fail if memory allocation fails or if the + /// requested range is invalid (e.g. empty). + /// + /// # Examples + /// + /// ``` + /// use kernel::maple_tree::{InsertErrorKind, MapleTree}; + /// + /// let tree = KBox::pin_init(MapleTree::<KBox<i32>>::new(), GFP_KERNEL)?; + /// + /// let ten = KBox::new(10, GFP_KERNEL)?; + /// let twenty = KBox::new(20, GFP_KERNEL)?; + /// let the_answer = KBox::new(42, GFP_KERNEL)?; + /// let hundred = KBox::new(100, GFP_KERNEL)?; + /// + /// // Insert the value 10 at the indices 100 to 499. + /// tree.insert_range(100..500, ten, GFP_KERNEL)?; + /// + /// // Insert the value 20 at the indices 500 to 1000. + /// tree.insert_range(500..=1000, twenty, GFP_KERNEL)?; + /// + /// // This will fail due to overlap with the previous range on index 1000. + /// assert_eq!( + /// tree.insert_range(1000..1200, the_answer, GFP_KERNEL).unwrap_err().cause, + /// InsertErrorKind::Occupied, + /// ); + /// + /// // When using .. to specify the range, you must be careful to ensure that the range is + /// // non-empty. + /// assert_eq!( + /// tree.insert_range(72..72, hundred, GFP_KERNEL).unwrap_err().cause, + /// InsertErrorKind::InvalidRequest, + /// ); + /// # Ok::<_, Error>(()) + /// ``` + pub fn insert_range<R>(&self, range: R, value: T, gfp: Flags) -> Result<(), InsertError<T>> + where + R: RangeBounds<usize>, + { + let Some((first, last)) = to_maple_range(range) else { + return Err(InsertError { + value, + cause: InsertErrorKind::InvalidRequest, + }); + }; + + let ptr = T::into_foreign(value); + + // SAFETY: The tree is valid, and we are passing a pointer to an owned instance of `T`. + let res = to_result(unsafe { + bindings::mtree_insert_range(self.tree.get(), first, last, ptr, gfp.as_raw()) + }); + + if let Err(err) = res { + // SAFETY: As `mtree_insert_range` failed, it is safe to take back ownership.
+ let value = unsafe { T::from_foreign(ptr) }; + + let cause = if err == ENOMEM { + InsertErrorKind::AllocError(kernel::alloc::AllocError) + } else if err == EEXIST { + InsertErrorKind::Occupied + } else { + InsertErrorKind::InvalidRequest + }; + Err(InsertError { value, cause }) + } else { + Ok(()) + } + } + + /// Erase the range containing the given index. + /// + /// # Examples + /// + /// ``` + /// use kernel::maple_tree::MapleTree; + /// + /// let tree = KBox::pin_init(MapleTree::<KBox<i32>>::new(), GFP_KERNEL)?; + /// + /// let ten = KBox::new(10, GFP_KERNEL)?; + /// let twenty = KBox::new(20, GFP_KERNEL)?; + /// + /// tree.insert_range(100..500, ten, GFP_KERNEL)?; + /// tree.insert(67, twenty, GFP_KERNEL)?; + /// + /// assert_eq!(tree.erase(67).map(|v| *v), Some(20)); + /// assert_eq!(tree.erase(275).map(|v| *v), Some(10)); + /// + /// // The previous call erased the entire range, not just index 275. + /// assert!(tree.erase(127).is_none()); + /// # Ok::<_, Error>(()) + /// ``` + #[inline] + pub fn erase(&self, index: usize) -> Option<T> { + // SAFETY: `self.tree` contains a valid maple tree. + let ret = unsafe { bindings::mtree_erase(self.tree.get(), index) }; + + // SAFETY: If the pointer is not null, then we took ownership of a valid instance of `T` + // from the tree. + unsafe { T::try_from_foreign(ret) } + } + + /// Free all `T` instances in this tree. + /// + /// # Safety + /// + /// This frees Rust data referenced by the maple tree without removing it from the maple tree, + /// leaving it in an invalid state. The caller must ensure that this invalid state cannot be + /// observed by the end-user. + unsafe fn free_all_entries(self: Pin<&mut Self>) { + // SAFETY: The caller provides exclusive access to the entire maple tree, so we have + // exclusive access to the entire maple tree despite not holding the lock. + let mut ma_state = unsafe { MaState::new_raw(self.into_ref().get_ref(), 0, usize::MAX) }; + + loop { + // This uses the raw accessor because we're destroying pointers without removing them + // from the maple tree, which is only valid because this is the destructor. + let ptr = ma_state.mas_find_raw(usize::MAX); + if ptr.is_null() { + break; + } + // SAFETY: By the type invariants, this pointer references a valid value of type `T`. + // By the safety requirements, it is okay to free it without removing it from the maple + // tree. + drop(unsafe { T::from_foreign(ptr) }); + } + } +} + +#[pinned_drop] +impl<T: ForeignOwnable> PinnedDrop for MapleTree<T> { + #[inline] + fn drop(mut self: Pin<&mut Self>) { + // We only iterate the tree if the Rust value has a destructor. + if core::mem::needs_drop::<T>() { + // SAFETY: Other than the below `mtree_destroy` call, the tree will not be accessed + // after this call. + unsafe { self.as_mut().free_all_entries() }; + } + + // SAFETY: The tree is valid, and will not be accessed after this call. + unsafe { bindings::mtree_destroy(self.tree.get()) }; + } +} + +/// A helper type used for navigating a [`MapleTree`]. +/// +/// # Invariants +/// +/// For the duration of `'tree`: +/// +/// * The `ma_state` references a valid `MapleTree`. +/// * The `ma_state` has read/write access to the tree. +pub struct MaState<'tree, T: ForeignOwnable> { + state: bindings::ma_state, + _phantom: PhantomData<&'tree mut MapleTree<T>>, +} + +impl<'tree, T: ForeignOwnable> MaState<'tree, T> { + /// Initialize a new `MaState` with the given tree. + /// + /// # Safety + /// + /// The caller must ensure that this `MaState` has read/write access to the maple tree.
+ #[inline] + unsafe fn new_raw(mt: &'tree MapleTree<T>, first: usize, end: usize) -> Self { + // INVARIANT: + // * Having a reference ensures that the `MapleTree` is valid for `'tree`. + // * The caller ensures that we have read/write access. + Self { + state: bindings::ma_state { + tree: mt.tree.get(), + index: first, + last: end, + node: ptr::null_mut(), + status: bindings::maple_status_ma_start, + min: 0, + max: usize::MAX, + alloc: ptr::null_mut(), + mas_flags: 0, + store_type: bindings::store_type_wr_invalid, + ..Default::default() + }, + _phantom: PhantomData, + } + } + + #[inline] + fn as_raw(&mut self) -> *mut bindings::ma_state { + &raw mut self.state + } + + #[inline] + fn mas_find_raw(&mut self, max: usize) -> *mut c_void { + // SAFETY: By the type invariants, the `ma_state` is active and we have read/write access + // to the tree. + unsafe { bindings::mas_find(self.as_raw(), max) } + } +} + +/// Error type for failure to insert a new value. +pub struct InsertError<T> { + /// The value that could not be inserted. + pub value: T, + /// The reason for the failure to insert. + pub cause: InsertErrorKind, +} + +/// The reason for the failure to insert. +#[derive(PartialEq, Eq, Copy, Clone, Debug)] +pub enum InsertErrorKind { + /// There is already a value in the requested range. + Occupied, + /// Failure to allocate memory. + AllocError(kernel::alloc::AllocError), + /// The insertion request was invalid. + InvalidRequest, +} + +impl From<InsertErrorKind> for Error { + #[inline] + fn from(kind: InsertErrorKind) -> Error { + match kind { + InsertErrorKind::Occupied => EEXIST, + InsertErrorKind::AllocError(kernel::alloc::AllocError) => ENOMEM, + InsertErrorKind::InvalidRequest => EINVAL, + } + } +} + +impl<T> From<InsertError<T>> for Error { + #[inline] + fn from(insert_err: InsertError<T>) -> Error { + Error::from(insert_err.cause) + } +}
From 01422da19cbeb4b044649322968265464991368e Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 2 Sep 2025 08:35:12 +0000 Subject: [PATCH 258/372] rust: maple_tree: add lock guard for maple tree MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
To load a value, one must be careful to hold the lock while accessing it. To enable this, we add a lock() method so that you can perform operations on the value before the spinlock is released. This adds a MapleGuard type without using the existing SpinLock type. This ensures that the MapleGuard type is not unnecessarily large, and that it is easy to swap out the type of lock in case the C maple tree is changed to use a different kind of lock. There are two ways of using the lock guard: You can call load() directly to load a value under the lock, or you can create an MaState to iterate the tree with find(). The find() method does not have the mas_ prefix since it's a method on MaState, and being a method on that struct serves a similar purpose to the mas_ prefix in C.
Link: https://lkml.kernel.org/r/20250902-maple-tree-v3-2-fb5c8958fb1e@google.com
Co-developed-by: Andrew Ballance
Signed-off-by: Andrew Ballance
Reviewed-by: Andrew Ballance
Reviewed-by: Danilo Krummrich
Signed-off-by: Alice Ryhl
Cc: Andreas Hindborg
Cc: Björn Roy Baron
Cc: Boqun Feng
Cc: Daniel Almeida
Cc: Gary Guo
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: Miguel Ojeda
Cc: Trevor Gross
Signed-off-by: Andrew Morton
---
 rust/kernel/maple_tree.rs | 140 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/rust/kernel/maple_tree.rs b/rust/kernel/maple_tree.rs
index 319772878b89..7acb8478d1d9 100644
--- a/rust/kernel/maple_tree.rs
+++ b/rust/kernel/maple_tree.rs
@@ -213,6 +213,23 @@ impl<T: ForeignOwnable> MapleTree<T> {
         unsafe { T::try_from_foreign(ret) }
     }
 
+    /// Lock the internal spinlock.
+    #[inline]
+    pub fn lock(&self) -> MapleGuard<'_, T> {
+        // SAFETY: It's safe to lock the spinlock in a maple tree.
+        unsafe { bindings::spin_lock(self.ma_lock()) };
+
+        // INVARIANT: We just took the spinlock.
+        MapleGuard(self)
+    }
+
+    #[inline]
+    fn ma_lock(&self) -> *mut bindings::spinlock_t {
+        // SAFETY: This pointer offset operation stays in-bounds.
+        let lock_ptr = unsafe { &raw mut (*self.tree.get()).__bindgen_anon_1.ma_lock };
+        lock_ptr.cast()
+    }
+
     /// Free all `T` instances in this tree.
     ///
     /// # Safety
@@ -256,6 +273,91 @@ impl<T: ForeignOwnable> PinnedDrop for MapleTree<T> {
     }
 }
 
+/// A reference to a [`MapleTree`] that owns the inner lock.
+///
+/// # Invariants
+///
+/// This guard owns the inner spinlock.
+#[must_use = "if unused, the lock will be immediately unlocked"]
+pub struct MapleGuard<'tree, T: ForeignOwnable>(&'tree MapleTree<T>);
+
+impl<'tree, T: ForeignOwnable> Drop for MapleGuard<'tree, T> {
+    #[inline]
+    fn drop(&mut self) {
+        // SAFETY: By the type invariants, we hold this spinlock.
+        unsafe { bindings::spin_unlock(self.0.ma_lock()) };
+    }
+}
+
+impl<'tree, T: ForeignOwnable> MapleGuard<'tree, T> {
+    /// Create a [`MaState`] protected by this lock guard.
+    pub fn ma_state(&mut self, first: usize, end: usize) -> MaState<'_, T> {
+        // SAFETY: The `MaState` borrows this `MapleGuard`, so it can also borrow the `MapleGuard`s
+        // read/write permissions to the maple tree.
+        unsafe { MaState::new_raw(self.0, first, end) }
+    }
+
+    /// Load the value at the given index.
+    ///
+    /// # Examples
+    ///
+    /// Read the value while holding the spinlock.
+    ///
+    /// ```
+    /// use kernel::maple_tree::MapleTree;
+    ///
+    /// let tree = KBox::pin_init(MapleTree::<KBox<i32>>::new(), GFP_KERNEL)?;
+    ///
+    /// let ten = KBox::new(10, GFP_KERNEL)?;
+    /// let twenty = KBox::new(20, GFP_KERNEL)?;
+    /// tree.insert(100, ten, GFP_KERNEL)?;
+    /// tree.insert(200, twenty, GFP_KERNEL)?;
+    ///
+    /// let mut lock = tree.lock();
+    /// assert_eq!(lock.load(100).map(|v| *v), Some(10));
+    /// assert_eq!(lock.load(200).map(|v| *v), Some(20));
+    /// assert_eq!(lock.load(300).map(|v| *v), None);
+    /// # Ok::<_, Error>(())
+    /// ```
+    ///
+    /// Increment refcount under the lock, to keep value alive afterwards.
+    ///
+    /// ```
+    /// use kernel::maple_tree::MapleTree;
+    /// use kernel::sync::Arc;
+    ///
+    /// let tree = KBox::pin_init(MapleTree::<Arc<i32>>::new(), GFP_KERNEL)?;
+    ///
+    /// let ten = Arc::new(10, GFP_KERNEL)?;
+    /// let twenty = Arc::new(20, GFP_KERNEL)?;
+    /// tree.insert(100, ten, GFP_KERNEL)?;
+    /// tree.insert(200, twenty, GFP_KERNEL)?;
+    ///
+    /// // Briefly take the lock to increment the refcount.
+    /// let value = tree.lock().load(100).map(Arc::from);
+    ///
+    /// // At this point, another thread might remove the value.
+    /// tree.erase(100);
+    ///
+    /// // But we can still access it because we took a refcount.
+    /// assert_eq!(value.map(|v| *v), Some(10));
+    /// # Ok::<_, Error>(())
+    /// ```
+    #[inline]
+    pub fn load(&mut self, index: usize) -> Option<T::BorrowedMut<'_>> {
+        // SAFETY: `self.tree` contains a valid maple tree.
+        let ret = unsafe { bindings::mtree_load(self.0.tree.get(), index) };
+        if ret.is_null() {
+            return None;
+        }
+
+        // SAFETY: If the pointer is not null, then it references a valid instance of `T`. It is
+        // safe to borrow the instance mutably because the signature of this function enforces that
+        // the mutable borrow is not used after the spinlock is dropped.
+        Some(unsafe { T::borrow_mut(ret) })
+    }
+}
+
 /// A helper type used for navigating a [`MapleTree`].
 ///
 /// # Invariants
@@ -309,6 +411,44 @@ impl<'tree, T: ForeignOwnable> MaState<'tree, T> {
         // to the tree.
         unsafe { bindings::mas_find(self.as_raw(), max) }
     }
+
+    /// Find the next entry in the maple tree.
+    ///
+    /// # Examples
+    ///
+    /// Iterate the maple tree.
+    ///
+    /// ```
+    /// use kernel::maple_tree::MapleTree;
+    /// use kernel::sync::Arc;
+    ///
+    /// let tree = KBox::pin_init(MapleTree::<Arc<i32>>::new(), GFP_KERNEL)?;
+    ///
+    /// let ten = Arc::new(10, GFP_KERNEL)?;
+    /// let twenty = Arc::new(20, GFP_KERNEL)?;
+    /// tree.insert(100, ten, GFP_KERNEL)?;
+    /// tree.insert(200, twenty, GFP_KERNEL)?;
+    ///
+    /// let mut ma_lock = tree.lock();
+    /// let mut iter = ma_lock.ma_state(0, usize::MAX);
+    ///
+    /// assert_eq!(iter.find(usize::MAX).map(|v| *v), Some(10));
+    /// assert_eq!(iter.find(usize::MAX).map(|v| *v), Some(20));
+    /// assert!(iter.find(usize::MAX).is_none());
+    /// # Ok::<_, Error>(())
+    /// ```
+    #[inline]
+    pub fn find(&mut self, max: usize) -> Option<T::BorrowedMut<'_>> {
+        let ret = self.mas_find_raw(max);
+        if ret.is_null() {
+            return None;
+        }
+
+        // SAFETY: If the pointer is not null, then it references a valid instance of `T`. It's
+        // safe to access it mutably as the returned reference borrows this `MaState`, and the
+        // `MaState` has read/write access to the maple tree.
+        Some(unsafe { T::borrow_mut(ret) })
+    }
 }
 
 /// Error type for failure to insert a new value.

From 56b1852e82bd5550c8987bb381a3d930f27b4058 Mon Sep 17 00:00:00 2001
From: Alice Ryhl
Date: Tue, 2 Sep 2025 08:35:13 +0000
Subject: [PATCH 259/372] rust: maple_tree: add MapleTreeAlloc
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

To support allocation trees, we introduce a new type MapleTreeAlloc for
the case where the tree is created using MT_FLAGS_ALLOC_RANGE. To ensure
that you can only call mtree_alloc_range on an allocation tree, we
restrict that method to the new MapleTreeAlloc type. However, all
methods on MapleTree remain accessible to MapleTreeAlloc, as allocation
trees can use the other methods without issues.
Link: https://lkml.kernel.org/r/20250902-maple-tree-v3-3-fb5c8958fb1e@google.com
Signed-off-by: Alice Ryhl
Reviewed-by: Daniel Almeida
Reviewed-by: Danilo Krummrich
Cc: Andreas Hindborg
Cc: Andrew Ballance
Cc: Björn Roy Baron
Cc: Boqun Feng
Cc: Gary Guo
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: Miguel Ojeda
Cc: Trevor Gross
Signed-off-by: Andrew Morton
---
 rust/kernel/maple_tree.rs | 158 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)

diff --git a/rust/kernel/maple_tree.rs b/rust/kernel/maple_tree.rs
index 7acb8478d1d9..e72eec56bf57 100644
--- a/rust/kernel/maple_tree.rs
+++ b/rust/kernel/maple_tree.rs
@@ -32,6 +32,26 @@ pub struct MapleTree<T: ForeignOwnable> {
     _p: PhantomData<T>,
 }
 
+/// A maple tree with `MT_FLAGS_ALLOC_RANGE` set.
+///
+/// All methods on [`MapleTree`] are also accessible on this type.
+#[pin_data]
+#[repr(transparent)]
+pub struct MapleTreeAlloc<T: ForeignOwnable> {
+    #[pin]
+    tree: MapleTree<T>,
+}
+
+// Make MapleTree methods usable on MapleTreeAlloc.
+impl<T: ForeignOwnable> core::ops::Deref for MapleTreeAlloc<T> {
+    type Target = MapleTree<T>;
+
+    #[inline]
+    fn deref(&self) -> &MapleTree<T> {
+        &self.tree
+    }
+}
+
 #[inline]
 fn to_maple_range(range: impl RangeBounds<usize>) -> Option<(usize, usize)> {
     let first = match range.start_bound() {
@@ -358,6 +378,107 @@ impl<'tree, T: ForeignOwnable> MapleGuard<'tree, T> {
     }
 }
 
+impl<T: ForeignOwnable> MapleTreeAlloc<T> {
+    /// Create a new allocation tree.
+    pub fn new() -> impl PinInit<Self> {
+        let tree = pin_init!(MapleTree {
+            // SAFETY: This initializes a maple tree into a pinned slot. The maple tree will be
+            // destroyed in Drop before the memory location becomes invalid.
+            tree <- Opaque::ffi_init(|slot| unsafe {
+                bindings::mt_init_flags(slot, bindings::MT_FLAGS_ALLOC_RANGE)
+            }),
+            _p: PhantomData,
+        });
+
+        pin_init!(MapleTreeAlloc { tree <- tree })
+    }
+
+    /// Insert an entry with the given size somewhere in the given range.
+    ///
+    /// The maple tree will search for a location in the given range where there is space to insert
+    /// the new range. If there is not enough available space, then an error will be returned.
+    ///
+    /// The index of the new range is returned.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use kernel::maple_tree::{MapleTreeAlloc, AllocErrorKind};
+    ///
+    /// let tree = KBox::pin_init(MapleTreeAlloc::<KBox<i32>>::new(), GFP_KERNEL)?;
+    ///
+    /// let ten = KBox::new(10, GFP_KERNEL)?;
+    /// let twenty = KBox::new(20, GFP_KERNEL)?;
+    /// let thirty = KBox::new(30, GFP_KERNEL)?;
+    /// let hundred = KBox::new(100, GFP_KERNEL)?;
+    ///
+    /// // Allocate three ranges.
+    /// let idx1 = tree.alloc_range(100, ten, ..1000, GFP_KERNEL)?;
+    /// let idx2 = tree.alloc_range(100, twenty, ..1000, GFP_KERNEL)?;
+    /// let idx3 = tree.alloc_range(100, thirty, ..1000, GFP_KERNEL)?;
+    ///
+    /// assert_eq!(idx1, 0);
+    /// assert_eq!(idx2, 100);
+    /// assert_eq!(idx3, 200);
+    ///
+    /// // This will fail because the remaining space is too small.
+    /// assert_eq!(
+    ///     tree.alloc_range(800, hundred, ..1000, GFP_KERNEL).unwrap_err().cause,
+    ///     AllocErrorKind::Busy,
+    /// );
+    /// # Ok::<_, Error>(())
+    /// ```
+    pub fn alloc_range<R>(
+        &self,
+        size: usize,
+        value: T,
+        range: R,
+        gfp: Flags,
+    ) -> Result<usize, AllocError<T>>
+    where
+        R: RangeBounds<usize>,
+    {
+        let Some((min, max)) = to_maple_range(range) else {
+            return Err(AllocError {
+                value,
+                cause: AllocErrorKind::InvalidRequest,
+            });
+        };
+
+        let ptr = T::into_foreign(value);
+        let mut index = 0;
+
+        // SAFETY: The tree is valid, and we are passing a pointer to an owned instance of `T`.
+        let res = to_result(unsafe {
+            bindings::mtree_alloc_range(
+                self.tree.tree.get(),
+                &mut index,
+                ptr,
+                size,
+                min,
+                max,
+                gfp.as_raw(),
+            )
+        });
+
+        if let Err(err) = res {
+            // SAFETY: As `mtree_alloc_range` failed, it is safe to take back ownership.
+            let value = unsafe { T::from_foreign(ptr) };
+
+            let cause = if err == ENOMEM {
+                AllocErrorKind::AllocError(kernel::alloc::AllocError)
+            } else if err == EBUSY {
+                AllocErrorKind::Busy
+            } else {
+                AllocErrorKind::InvalidRequest
+            };
+            Err(AllocError { value, cause })
+        } else {
+            Ok(index)
+        }
+    }
+}
+
 /// A helper type used for navigating a [`MapleTree`].
 ///
 /// # Invariants
@@ -487,3 +608,40 @@ impl<T> From<InsertError<T>> for Error {
         Error::from(insert_err.cause)
     }
 }
+
+/// Error type for failure to insert a new value.
+pub struct AllocError<T> {
+    /// The value that could not be inserted.
+    pub value: T,
+    /// The reason for the failure to insert.
+    pub cause: AllocErrorKind,
+}
+
+/// The reason for the failure to insert.
+#[derive(PartialEq, Eq, Copy, Clone, Debug)]
+pub enum AllocErrorKind {
+    /// There is not enough space for the requested allocation.
+    Busy,
+    /// Failure to allocate memory.
+    AllocError(kernel::alloc::AllocError),
+    /// The insertion request was invalid.
+    InvalidRequest,
+}
+
+impl From<AllocErrorKind> for Error {
+    #[inline]
+    fn from(kind: AllocErrorKind) -> Error {
+        match kind {
+            AllocErrorKind::Busy => EBUSY,
+            AllocErrorKind::AllocError(kernel::alloc::AllocError) => ENOMEM,
+            AllocErrorKind::InvalidRequest => EINVAL,
+        }
+    }
+}
+
+impl<T> From<AllocError<T>> for Error {
+    #[inline]
+    fn from(insert_err: AllocError<T>) -> Error {
+        Error::from(insert_err.cause)
+    }
+}

From 69e0a3b490030d8d9e628ea69bdb3ff0010a0bb7 Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Wed, 3 Sep 2025 16:54:24 +0800
Subject: [PATCH 260/372] mm: shmem: fix the strategy for the tmpfs 'huge='
 options

After commit acd7ccb284b8 ("mm: shmem: add large folio support for
tmpfs"), we have extended tmpfs to allow any sized large folios, rather
than just PMD-sized large folios.

The strategy discussed previously was:

: Considering that tmpfs already has the 'huge=' option to control the
: PMD-sized large folios allocation, we can extend the 'huge=' option to
: allow any sized large folios. The semantics of the 'huge=' mount option
: are:
:
: huge=never: no any sized large folios
: huge=always: any sized large folios
: huge=within_size: like 'always' but respect the i_size
: huge=advise: like 'always' if requested with madvise()
:
: Note: for tmpfs mmap() faults, due to the lack of a write size hint, still
: allocate the PMD-sized huge folios if huge=always/within_size/advise is
: set.
:
: Moreover, the 'deny' and 'force' testing options controlled by
: '/sys/kernel/mm/transparent_hugepage/shmem_enabled', still retain the same
: semantics. The 'deny' can disable any sized large folios for tmpfs, while
: the 'force' can enable PMD sized large folios for tmpfs.

This means that when tmpfs is mounted with 'huge=always' or
'huge=within_size', tmpfs will allow getting a highest order hint based
on the size of the write() and fallocate() paths. It will then try each
allowable large order, rather than continually attempting to allocate
PMD-sized large folios as before.

However, this might break some user scenarios for those who want to use
PMD-sized large folios, such as the i915 driver, which did not supply a
write size hint when allocating shmem [1].

Moreover, Hugh also complained that this will cause a regression in
userspace with 'huge=always' or 'huge=within_size'.
So, let's revisit the strategy for tmpfs large page allocation. A simple fix would be to always try PMD-sized large folios first, and if that fails, fall back to smaller large folios. This approach differs from the strategy for large folio allocation used by other file systems, however, tmpfs is somewhat different from other file systems, as quoted from David's opinion: : There were opinions in the past that tmpfs should just behave like any : other fs, and I think that's what we tried to satisfy here: use the write : size as an indication. : : I assume there will be workloads where either approach will be beneficial. : I also assume that workloads that use ordinary fs'es could benefit from : the same strategy (start with PMD), while others will clearly not. Link: https://lkml.kernel.org/r/10e7ac6cebe6535c137c064d5c5a235643eebb4a.1756888965.git.baolin.wang@linux.alibaba.com Link: https://lore.kernel.org/lkml/0d734549d5ed073c80b11601da3abdd5223e1889.1753689802.git.baolin.wang@linux.alibaba.com/ [1] Fixes: acd7ccb284b8 ("mm: shmem: add large folio support for tmpfs") Signed-off-by: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Hugh Dickins Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Zi Yan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 6 ++- mm/shmem.c | 47 +++------------------- 2 files changed, 10 insertions(+), 43 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index a16a04841b96..1654211cc6cf 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -419,6 +419,8 @@ option: ``huge=``. It can have following values: always Attempt to allocate huge pages every time we need a new page; + Always try PMD-sized huge pages first, and fall back to smaller-sized + huge pages if the PMD-sized huge page allocation fails; never Do not allocate huge pages. Note that ``madvise(..., MADV_COLLAPSE)`` @@ -426,7 +428,9 @@ never is specified everywhere; within_size - Only allocate huge page if it will be fully within i_size. + Only allocate huge page if it will be fully within i_size; + Always try PMD-sized huge pages first, and fall back to smaller-sized + huge pages if the PMD-sized huge page allocation fails; Also respect madvise() hints; advise diff --git a/mm/shmem.c b/mm/shmem.c index 2df26f4d6e60..29e1eb690125 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -573,42 +573,6 @@ static int shmem_confirm_swap(struct address_space *mapping, pgoff_t index, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER; -/** - * shmem_mapping_size_orders - Get allowable folio orders for the given file size. - * @mapping: Target address_space. - * @index: The page index. - * @write_end: end of a write, could extend inode size. - * - * This returns huge orders for folios (when supported) based on the file size - * which the mapping currently allows at the given index. The index is relevant - * due to alignment considerations the mapping might have. The returned order - * may be less than the size passed. - * - * Return: The orders. 
- */ -static inline unsigned int -shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end) -{ - unsigned int order; - size_t size; - - if (!mapping_large_folio_support(mapping) || !write_end) - return 0; - - /* Calculate the write size based on the write_end */ - size = write_end - (index << PAGE_SHIFT); - order = filemap_get_order(size); - if (!order) - return 0; - - /* If we're not aligned, allocate a smaller folio */ - if (index & ((1UL << order) - 1)) - order = __ffs(index); - - order = min_t(size_t, order, MAX_PAGECACHE_ORDER); - return order > 0 ? BIT(order + 1) - 1 : 0; -} - static unsigned int shmem_get_orders_within_size(struct inode *inode, unsigned long within_size_orders, pgoff_t index, loff_t write_end) @@ -655,22 +619,21 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index * For tmpfs mmap()'s huge order, we still use PMD-sized order to * allocate huge pages due to lack of a write size hint. * - * Otherwise, tmpfs will allow getting a highest order hint based on - * the size of write and fallocate paths, then will try each allowable - * huge orders. + * For tmpfs with 'huge=always' or 'huge=within_size' mount option, + * we will always try PMD-sized order first. If that failed, it will + * fall back to small large folios. */ switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: if (vma) return maybe_pmd_order; - return shmem_mapping_size_orders(inode->i_mapping, index, write_end); + return THP_ORDERS_ALL_FILE_DEFAULT; case SHMEM_HUGE_WITHIN_SIZE: if (vma) within_size_orders = maybe_pmd_order; else - within_size_orders = shmem_mapping_size_orders(inode->i_mapping, - index, write_end); + within_size_orders = THP_ORDERS_ALL_FILE_DEFAULT; within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, index, write_end); From 0d0e03d5b83ea481801783906d230b2ad591d8c1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 3 Sep 2025 09:02:52 +0200 Subject: [PATCH 261/372] selftests/mm: split_huge_page_test: fix occasional is_backed_by_folio() wrong results Patch series "selftests/mm: split_huge_page_test: split_pte_mapped_thp improvements", v2. One fix for occasional failures I found while testing and a bunch of cleanups that should make that test easier to digest. This patch (of 2): When checking for actual tail or head pages of a folio, we must make sure that the KPF_COMPOUND_HEAD/KPF_COMPOUND_TAIL flag is paired with KPF_THP. For example, if we have another large folio after our large folio in physical memory, our "pfn_flags & (KPF_THP | KPF_COMPOUND_TAIL)" would trigger even though it's actually a head page of the next folio. If is_backed_by_folio() returns a wrong result, split_pte_mapped_thp() can fail with "Some THPs are missing during mremap". Fix it by checking for head/tail pages of folios properly. Add folio_tail_flags/folio_head_flags to improve readability and use these masks also when just testing for any compound page. 
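The predicate change is easy to get wrong, so here is a minimal
userspace sketch (not kernel code; the KPF_* bit positions are taken
from include/uapi/linux/kernel-page-flags.h) contrasting the old and
new checks:

	#include <stdbool.h>
	#include <stdint.h>

	#define KPF_COMPOUND_TAIL	(1ULL << 16)
	#define KPF_THP			(1ULL << 22)

	/*
	 * Old check: true if *either* bit is set, so it also fires for the
	 * THP head page of the next folio in physical memory (KPF_THP set,
	 * KPF_COMPOUND_TAIL clear).
	 */
	static bool is_thp_tail_old(uint64_t pfn_flags)
	{
		return pfn_flags & (KPF_THP | KPF_COMPOUND_TAIL);
	}

	/* New check: true only if *both* bits are set. */
	static bool is_thp_tail_new(uint64_t pfn_flags)
	{
		const uint64_t folio_tail_flags = KPF_THP | KPF_COMPOUND_TAIL;

		return (pfn_flags & folio_tail_flags) == folio_tail_flags;
	}

For the head page of a following large folio, the old check wrongly
returns true while the new one correctly returns false.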
Link: https://lkml.kernel.org/r/20250903070253.34556-1-david@redhat.com Link: https://lkml.kernel.org/r/20250903070253.34556-2-david@redhat.com Fixes: 169b456b0162 ("selftests/mm: reimplement is_backed_by_thp() with more precise check") Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Wei Yang Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/split_huge_page_test.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 10ae65ea032f..72d6d8bb329e 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -44,6 +44,8 @@ int kpageflags_fd; static bool is_backed_by_folio(char *vaddr, int order, int pagemap_fd, int kpageflags_fd) { + const uint64_t folio_head_flags = KPF_THP | KPF_COMPOUND_HEAD; + const uint64_t folio_tail_flags = KPF_THP | KPF_COMPOUND_TAIL; const unsigned long nr_pages = 1UL << order; unsigned long pfn_head; uint64_t pfn_flags; @@ -61,7 +63,7 @@ static bool is_backed_by_folio(char *vaddr, int order, int pagemap_fd, /* check for order-0 pages */ if (!order) { - if (pfn_flags & (KPF_THP | KPF_COMPOUND_HEAD | KPF_COMPOUND_TAIL)) + if (pfn_flags & (folio_head_flags | folio_tail_flags)) return false; return true; } @@ -76,14 +78,14 @@ static bool is_backed_by_folio(char *vaddr, int order, int pagemap_fd, goto fail; /* head PFN has no compound_head flag set */ - if (!(pfn_flags & (KPF_THP | KPF_COMPOUND_HEAD))) + if ((pfn_flags & folio_head_flags) != folio_head_flags) return false; /* check all tail PFN flags */ for (i = 1; i < nr_pages; i++) { if (pageflags_get(pfn_head + i, kpageflags_fd, &pfn_flags)) goto fail; - if (!(pfn_flags & (KPF_THP | KPF_COMPOUND_TAIL))) + if ((pfn_flags & folio_tail_flags) != folio_tail_flags) return false; } @@ -94,11 +96,8 @@ static bool is_backed_by_folio(char *vaddr, int order, int pagemap_fd, if (pageflags_get(pfn_head + nr_pages, kpageflags_fd, &pfn_flags)) return true; - /* this folio is bigger than the given order */ - if (pfn_flags & (KPF_THP | KPF_COMPOUND_TAIL)) - return false; - - return true; + /* If we find another tail page, then the folio is larger. */ + return (pfn_flags & folio_tail_flags) != folio_tail_flags; fail: ksft_exit_fail_msg("Failed to get folio info\n"); return false; From 24a3c7af3bb2acbda5e2ce92b4867fb0a58dbb40 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 3 Sep 2025 09:02:53 +0200 Subject: [PATCH 262/372] selftests/mm: split_huge_page_test: cleanups for split_pte_mapped_thp test There is room for improvement, so let's clean up a bit: (1) Define "4" as a constant. (2) SKIP if we fail to allocate all THPs (e.g., fragmented) and add recovery code for all other failure cases: no need to exit the test. (3) Rename "len" to thp_area_size, and "one_page" to "thp_area". (4) Allocate a new area "page_area" into which we will mremap the pages; add "page_area_size". Now we can easily merge the two mremap instances into a single one. (5) Iterate THPs instead of bytes when checking for missed THPs after mremap. (6) Rename "pte_mapped2" to "tmp", used to verify mremap(MAP_FIXED) result. (7) Split the corruption test from the failed-split test, so we can just iterate bytes vs. thps naturally. (8) Extend comments and clarify why we are using mremap in the first place. 
Link: https://lkml.kernel.org/r/20250903070253.34556-3-david@redhat.com
Signed-off-by: David Hildenbrand
Reviewed-by: Zi Yan
Reviewed-by: Wei Yang
Cc: Baolin Wang
Cc: Barry Song
Cc: Dev Jain
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: Mariano Pache
Cc: Ryan Roberts
Signed-off-by: Andrew Morton
---
 .../selftests/mm/split_huge_page_test.c       | 125 +++++++++++-------
 1 file changed, 75 insertions(+), 50 deletions(-)

diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c
index 72d6d8bb329e..7731191cc8e9 100644
--- a/tools/testing/selftests/mm/split_huge_page_test.c
+++ b/tools/testing/selftests/mm/split_huge_page_test.c
@@ -389,67 +389,92 @@ static void split_pmd_thp_to_order(int order)
 
 static void split_pte_mapped_thp(void)
 {
-	char *one_page, *pte_mapped, *pte_mapped2;
-	size_t len = 4 * pmd_pagesize;
-	uint64_t thp_size;
+	const size_t nr_thps = 4;
+	const size_t thp_area_size = nr_thps * pmd_pagesize;
+	const size_t page_area_size = nr_thps * pagesize;
+	char *thp_area, *tmp, *page_area = MAP_FAILED;
 	size_t i;
 
-	one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE,
+	thp_area = mmap((void *)(1UL << 30), thp_area_size, PROT_READ | PROT_WRITE,
 			MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
-	if (one_page == MAP_FAILED)
-		ksft_exit_fail_msg("Fail to allocate memory: %s\n", strerror(errno));
-
-	madvise(one_page, len, MADV_HUGEPAGE);
-
-	for (i = 0; i < len; i++)
-		one_page[i] = (char)i;
-
-	if (!check_huge_anon(one_page, 4, pmd_pagesize))
-		ksft_exit_fail_msg("No THP is allocated\n");
-
-	/* remap the first pagesize of first THP */
-	pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE);
-
-	/* remap the Nth pagesize of Nth THP */
-	for (i = 1; i < 4; i++) {
-		pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i,
-				     pagesize, pagesize,
-				     MREMAP_MAYMOVE|MREMAP_FIXED,
-				     pte_mapped + pagesize * i);
-		if (pte_mapped2 == MAP_FAILED)
-			ksft_exit_fail_msg("mremap failed: %s\n", strerror(errno));
+	if (thp_area == MAP_FAILED) {
+		ksft_test_result_fail("Fail to allocate memory: %s\n", strerror(errno));
+		return;
 	}
 
-	/* smap does not show THPs after mremap, use kpageflags instead */
-	thp_size = 0;
-	for (i = 0; i < pagesize * 4; i++)
-		if (i % pagesize == 0 &&
-		    is_backed_by_folio(&pte_mapped[i], pmd_order, pagemap_fd, kpageflags_fd))
-			thp_size++;
+	madvise(thp_area, thp_area_size, MADV_HUGEPAGE);
 
-	if (thp_size != 4)
-		ksft_exit_fail_msg("Some THPs are missing during mremap\n");
+	for (i = 0; i < thp_area_size; i++)
+		thp_area[i] = (char)i;
 
-	/* split all remapped THPs */
-	write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped,
-		      (uint64_t)pte_mapped + pagesize * 4, 0);
-
-	/* smap does not show THPs after mremap, use kpageflags instead */
-	thp_size = 0;
-	for (i = 0; i < pagesize * 4; i++) {
-		if (pte_mapped[i] != (char)i)
-			ksft_exit_fail_msg("%ld byte corrupted\n", i);
+	if (!check_huge_anon(thp_area, nr_thps, pmd_pagesize)) {
+		ksft_test_result_skip("Not all THPs allocated\n");
+		goto out;
+	}
 
-		if (i % pagesize == 0 &&
-		    !is_backed_by_folio(&pte_mapped[i], 0, pagemap_fd, kpageflags_fd))
-			thp_size++;
-	}
-
-	if (thp_size)
-		ksft_exit_fail_msg("Still %ld THPs not split\n", thp_size);
+	/*
+	 * To challenge splitting code, we will mremap a single page of each
+	 * THP (page[i] of thp[i]) in the thp_area into page_area. This will
+	 * replace the PMD mappings in the thp_area by PTE mappings first,
+	 * but leaving the THP unsplit, to then create a page-sized hole in
+	 * the thp_area.
+	 * We will then manually trigger splitting of all THPs through the
+	 * single mremap'ed pages of each THP in the page_area.
+	 */
+	page_area = mmap(NULL, page_area_size, PROT_READ | PROT_WRITE,
+			 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (page_area == MAP_FAILED) {
+		ksft_test_result_fail("Fail to allocate memory: %s\n", strerror(errno));
+		goto out;
+	}
+
+	for (i = 0; i < nr_thps; i++) {
+		tmp = mremap(thp_area + pmd_pagesize * i + pagesize * i,
+			     pagesize, pagesize, MREMAP_MAYMOVE|MREMAP_FIXED,
+			     page_area + pagesize * i);
+		if (tmp != MAP_FAILED)
+			continue;
+		ksft_test_result_fail("mremap failed: %s\n", strerror(errno));
+		goto out;
+	}
+
+	/*
+	 * Verify that our THPs were not split yet. Note that
+	 * check_huge_anon() cannot be used as it checks for PMD mappings.
+	 */
+	for (i = 0; i < nr_thps; i++) {
+		if (is_backed_by_folio(page_area + i * pagesize, pmd_order,
+				       pagemap_fd, kpageflags_fd))
+			continue;
+		ksft_test_result_fail("THP %zu missing after mremap\n", i);
+		goto out;
+	}
+
+	/* Split all THPs through the remapped pages. */
+	write_debugfs(PID_FMT, getpid(), (uint64_t)page_area,
+		      (uint64_t)page_area + page_area_size, 0);
+
+	/* Corruption during mremap or split? */
+	for (i = 0; i < page_area_size; i++) {
+		if (page_area[i] == (char)i)
+			continue;
+		ksft_test_result_fail("%zu byte corrupted\n", i);
+		goto out;
+	}
+
+	/* Split failed? */
+	for (i = 0; i < nr_thps; i++) {
+		if (is_backed_by_folio(page_area + i * pagesize, 0,
+				       pagemap_fd, kpageflags_fd))
+			continue;
+		ksft_test_result_fail("THP %zu not split\n", i);
+	}
 
 	ksft_test_result_pass("Split PTE-mapped huge pages successful\n");
-	munmap(one_page, len);
+out:
+	munmap(thp_area, thp_area_size);
+	if (page_area != MAP_FAILED)
+		munmap(page_area, page_area_size);
 }
 
 static void split_file_backed_thp(int order)

From 0faa77afe72b0705cdba8a59a0969e20300f6548 Mon Sep 17 00:00:00 2001
From: Jinjiang Tu
Date: Thu, 4 Sep 2025 21:27:37 +0800
Subject: [PATCH 263/372] filemap: optimize folio refcount update in
 filemap_map_pages

There are two meaningless folio refcount updates for order-0 folios in
filemap_map_pages(). First, filemap_map_order0_folio() adds a folio
refcount after the folio is mapped to the pte. Then, filemap_map_pages()
drops the refcount grabbed by next_uptodate_folio(). We can leave the
refcount unchanged in this case.

As Matthew mentioned in [1], it is safe to call folio_unlock() before
calling folio_put() here, because the folio is in the page cache with a
refcount held, and truncation will wait for the unlock.

Optimize filemap_map_folio_range() with the same method too.

With this patch, we get an 8% performance gain for the lmbench testcase
'lat_pagefault -P 1 file' in the order-0 folio case, with a 512M file.
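The net refcount effect is unchanged; only the number of atomic
operations drops. A toy userspace model of the bookkeeping, with plain
C11 atomics standing in for folio_ref_add()/folio_put() and 'count' as
the number of pages mapped from the folio:

	#include <stdatomic.h>
	#include <stdio.h>

	int main(void)
	{
		atomic_int ref;
		int count = 4;	/* pages mapped in this fault-around batch */

		/* Before: two atomic updates per folio. */
		atomic_store(&ref, 1);			/* ref from next_uptodate_folio() */
		atomic_fetch_add(&ref, count);		/* one ref per mapped page */
		atomic_fetch_sub(&ref, 1);		/* drop the lookup ref */
		printf("before: %d\n", atomic_load(&ref));	/* prints 4 */

		/* After: the lookup ref is folded into a single update. */
		atomic_store(&ref, 1);
		atomic_fetch_add(&ref, count - 1);
		printf("after: %d\n", atomic_load(&ref));	/* prints 4 */

		return 0;
	}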
Link: https://lkml.kernel.org/r/20250904132737.1250368-1-tujinjiang@huawei.com
Link: https://lore.kernel.org/all/aKcU-fzxeW3xT5Wv@casper.infradead.org/ [1]
Signed-off-by: Jinjiang Tu
Reviewed-by: David Hildenbrand
Cc: Kefeng Wang
Cc: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/filemap.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 344ab106c21c..8d078aa2738a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3665,6 +3665,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 			unsigned long addr, unsigned int nr_pages,
 			unsigned long *rss, unsigned short *mmap_miss)
 {
+	unsigned int ref_from_caller = 1;
 	vm_fault_t ret = 0;
 	struct page *page = folio_page(folio, start);
 	unsigned int count = 0;
@@ -3698,7 +3699,8 @@ skip:
 		if (count) {
 			set_pte_range(vmf, folio, page, count, addr);
 			*rss += count;
-			folio_ref_add(folio, count);
+			folio_ref_add(folio, count - ref_from_caller);
+			ref_from_caller = 0;
 			if (in_range(vmf->address, addr, count * PAGE_SIZE))
 				ret = VM_FAULT_NOPAGE;
 		}
@@ -3713,12 +3715,16 @@ skip:
 	if (count) {
 		set_pte_range(vmf, folio, page, count, addr);
 		*rss += count;
-		folio_ref_add(folio, count);
+		folio_ref_add(folio, count - ref_from_caller);
+		ref_from_caller = 0;
 		if (in_range(vmf->address, addr, count * PAGE_SIZE))
 			ret = VM_FAULT_NOPAGE;
 	}
 
 	vmf->pte = old_ptep;
 
+	if (ref_from_caller)
+		/* Locked folios cannot get truncated. */
+		folio_ref_dec(folio);
 	return ret;
 }
 
@@ -3731,7 +3737,7 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
 	struct page *page = &folio->page;
 
 	if (PageHWPoison(page))
-		return ret;
+		goto out;
 
 	/* See comment of filemap_map_folio_range() */
 	if (!folio_test_workingset(folio))
@@ -3743,15 +3749,18 @@ static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
 	 * the fault-around logic.
 	 */
 	if (!pte_none(ptep_get(vmf->pte)))
-		return ret;
+		goto out;
 
 	if (vmf->address == addr)
 		ret = VM_FAULT_NOPAGE;
 
 	set_pte_range(vmf, folio, page, 1, addr);
 	(*rss)++;
-	folio_ref_inc(folio);
 
+	return ret;
+out:
+	/* Locked folios cannot get truncated. */
+	folio_ref_dec(folio);
 	return ret;
 }
 
@@ -3811,7 +3820,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 					nr_pages, &rss, &mmap_miss);
 
 		folio_unlock(folio);
-		folio_put(folio);
 	} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
 	add_mm_counter(vma->vm_mm, folio_type, rss);
 	pte_unmap_unlock(vmf->pte, vmf->ptl);

From a488ba3124c82d704963fcd760fe653df1987b13 Mon Sep 17 00:00:00 2001
From: Pankaj Raghav
Date: Fri, 5 Sep 2025 17:00:12 +0200
Subject: [PATCH 264/372] huge_memory: return -EINVAL in folio split functions
 when THP is disabled

split_huge_page_to_list[_to_order](), split_huge_page() and
try_folio_split() return 0 on success and error codes on failure. When
THP is disabled, these functions return 0 indicating success even
though an error code should be returned, as it is not possible to split
a folio when THP is disabled.

Make all these functions return -EINVAL to indicate failure instead of
0. As large folios depend on CONFIG_TRANSPARENT_HUGEPAGE, also issue a
warning, as these functions should not be called without a large folio.
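A hypothetical caller sketch (error-handling pattern only, not a real
call site) of why the stub return value matters: with the old stubs, a
CONFIG_TRANSPARENT_HUGEPAGE=n build would take the success path below
even though the folio was never split:

	int ret = split_huge_page(page);

	if (ret)
		return ret;	/* now reached when THP is disabled */

	/* Previously this ran against an unsplit folio under THP=n. */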
Link: https://lkml.kernel.org/r/20250905150012.93714-1-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509051753.riCeG7LC-lkp@intel.com/ Acked-by: David Hildenbrand Acked-by: Zi Yan Acked-by: Kiryl Shutsemau Reviewed-by: Lorenzo Stoakes Reviewed-by: Barry Song Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 29ef70022da1..f327d62fc985 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -588,22 +588,26 @@ static inline int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, unsigned int new_order) { - return 0; + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; } static inline int split_huge_page(struct page *page) { - return 0; + VM_WARN_ON_ONCE_PAGE(1, page); + return -EINVAL; } static inline int split_folio_to_list(struct folio *folio, struct list_head *list) { - return 0; + VM_WARN_ON_ONCE_FOLIO(1, folio); + return -EINVAL; } static inline int try_folio_split(struct folio *folio, struct page *page, struct list_head *list) { - return 0; + VM_WARN_ON_ONCE_FOLIO(1, folio); + return -EINVAL; } static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {} From 5ce1dbfdd8e3d4dca2f842dd833ca7e264ace85b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 4 Sep 2025 14:22:58 +0800 Subject: [PATCH 265/372] mm/hwpoison: decouple hwpoison_filter from mm/memory-failure.c mm/memory-failure.c defines and uses hwpoison_filter_* parameters but the values of those parameters can only be modified via mm/hwpoison-inject.c from userspace. They have a potentially different life time. Decouple those parameters from mm/memory-failure.c to fix this broken layering. 
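A minimal sketch of the resulting interface, mirroring what
mm/hwpoison-inject.c does in the diff below (the example_* names are
made up; hwpoison_filter_register()/hwpoison_filter_unregister() and
the hwpoison_filter_func_t signature come from this patch):

	/* Return 0 to let memory_failure() handle the page, -EINVAL to skip it. */
	static int example_filter(struct page *p)
	{
		return 0;
	}

	static int __init example_init(void)
	{
		hwpoison_filter_register(example_filter);
		return 0;
	}

	static void __exit example_exit(void)
	{
		/* Also waits for in-flight RCU readers of the filter pointer. */
		hwpoison_filter_unregister();
	}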
Link: https://lkml.kernel.org/r/20250904062258.3336092-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Suggested-by: Michal Hocko Cc: David Hildenbrand Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- fs/proc/page.c | 1 + mm/hwpoison-inject.c | 91 +++++++++++++++++++++++++++++++++++++ mm/internal.h | 10 ++-- mm/memcontrol.c | 1 + mm/memory-failure.c | 106 +++++++------------------------------------ 5 files changed, 113 insertions(+), 96 deletions(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index 771e0b6bc630..fc64f23e05e5 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -256,6 +256,7 @@ u64 stable_page_flags(const struct page *page) return u; } +EXPORT_SYMBOL_GPL(stable_page_flags); /* /proc/kpageflags - an array exposing page flags * diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 7ecaa1900137..a11222572f97 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -7,8 +7,96 @@ #include #include #include +#include +#include #include "internal.h" +static u32 hwpoison_filter_enable; +static u32 hwpoison_filter_dev_major = ~0U; +static u32 hwpoison_filter_dev_minor = ~0U; +static u64 hwpoison_filter_flags_mask; +static u64 hwpoison_filter_flags_value; + +static int hwpoison_filter_dev(struct page *p) +{ + struct folio *folio = page_folio(p); + struct address_space *mapping; + dev_t dev; + + if (hwpoison_filter_dev_major == ~0U && + hwpoison_filter_dev_minor == ~0U) + return 0; + + mapping = folio_mapping(folio); + if (mapping == NULL || mapping->host == NULL) + return -EINVAL; + + dev = mapping->host->i_sb->s_dev; + if (hwpoison_filter_dev_major != ~0U && + hwpoison_filter_dev_major != MAJOR(dev)) + return -EINVAL; + if (hwpoison_filter_dev_minor != ~0U && + hwpoison_filter_dev_minor != MINOR(dev)) + return -EINVAL; + + return 0; +} + +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == + hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} + +/* + * This allows stress tests to limit test scope to a collection of tasks + * by putting them under some memcg. This prevents killing unrelated/important + * processes such as /sbin/init. Note that the target task may share clean + * pages with init (eg. libc text), which is harmless. If the target task + * share _dirty_ pages with another task B, the test scheme must make sure B + * is also included in the memcg. At last, due to race conditions this filter + * can only guarantee that the page either belongs to the memcg tasks, or is + * a freed page. 
+ */ +#ifdef CONFIG_MEMCG +static u64 hwpoison_filter_memcg; +static int hwpoison_filter_task(struct page *p) +{ + if (!hwpoison_filter_memcg) + return 0; + + if (page_cgroup_ino(p) != hwpoison_filter_memcg) + return -EINVAL; + + return 0; +} +#else +static int hwpoison_filter_task(struct page *p) { return 0; } +#endif + +static int hwpoison_filter(struct page *p) +{ + if (!hwpoison_filter_enable) + return 0; + + if (hwpoison_filter_dev(p)) + return -EINVAL; + + if (hwpoison_filter_flags(p)) + return -EINVAL; + + if (hwpoison_filter_task(p)) + return -EINVAL; + + return 0; +} + static struct dentry *hwpoison_dir; static int hwpoison_inject(void *data, u64 val) @@ -67,6 +155,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void __exit pfn_inject_exit(void) { hwpoison_filter_enable = 0; + hwpoison_filter_unregister(); debugfs_remove_recursive(hwpoison_dir); } @@ -105,6 +194,8 @@ static int __init pfn_inject_init(void) &hwpoison_filter_memcg); #endif + hwpoison_filter_register(hwpoison_filter); + return 0; } diff --git a/mm/internal.h b/mm/internal.h index 9b0129531d00..c4657ffd342e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1228,14 +1228,10 @@ static inline bool node_reclaim_enabled(void) #ifdef CONFIG_MEMORY_FAILURE int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill); void shake_folio(struct folio *folio); -extern int hwpoison_filter(struct page *p); +typedef int hwpoison_filter_func_t(struct page *p); +void hwpoison_filter_register(hwpoison_filter_func_t *filter); +void hwpoison_filter_unregister(void); -extern u32 hwpoison_filter_dev_major; -extern u32 hwpoison_filter_dev_minor; -extern u64 hwpoison_filter_flags_mask; -extern u64 hwpoison_filter_flags_value; -extern u64 hwpoison_filter_memcg; -extern u32 hwpoison_filter_enable; #define MAGIC_HWPOISON 0x48575053U /* HWPS */ void SetPageHWPoisonTakenOff(struct page *page); void ClearPageHWPoisonTakenOff(struct page *page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9712a751690f..8dc470aa6c3c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -287,6 +287,7 @@ ino_t page_cgroup_ino(struct page *page) rcu_read_unlock(); return ino; } +EXPORT_SYMBOL_GPL(page_cgroup_ino); /* Subset of node_stat_item for memcg stats */ static const unsigned int memcg_node_stat_items[] = { diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b93ab99ad3ef..2a95b41e0535 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -212,106 +212,34 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo return true; } -#if IS_ENABLED(CONFIG_HWPOISON_INJECT) +static hwpoison_filter_func_t __rcu *hwpoison_filter_func __read_mostly; -u32 hwpoison_filter_enable = 0; -u32 hwpoison_filter_dev_major = ~0U; -u32 hwpoison_filter_dev_minor = ~0U; -u64 hwpoison_filter_flags_mask; -u64 hwpoison_filter_flags_value; -EXPORT_SYMBOL_GPL(hwpoison_filter_enable); -EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); -EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); -EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); -EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); - -static int hwpoison_filter_dev(struct page *p) +void hwpoison_filter_register(hwpoison_filter_func_t *filter) { - struct folio *folio = page_folio(p); - struct address_space *mapping; - dev_t dev; - - if (hwpoison_filter_dev_major == ~0U && - hwpoison_filter_dev_minor == ~0U) - return 0; - - mapping = folio_mapping(folio); - if (mapping == NULL || mapping->host == NULL) - return -EINVAL; - - dev = 
mapping->host->i_sb->s_dev;
-	if (hwpoison_filter_dev_major != ~0U &&
-	    hwpoison_filter_dev_major != MAJOR(dev))
-		return -EINVAL;
-	if (hwpoison_filter_dev_minor != ~0U &&
-	    hwpoison_filter_dev_minor != MINOR(dev))
-		return -EINVAL;
-
-	return 0;
+	rcu_assign_pointer(hwpoison_filter_func, filter);
 }
+EXPORT_SYMBOL_GPL(hwpoison_filter_register);
 
-static int hwpoison_filter_flags(struct page *p)
+void hwpoison_filter_unregister(void)
 {
-	if (!hwpoison_filter_flags_mask)
-		return 0;
-
-	if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
-				    hwpoison_filter_flags_value)
-		return 0;
-	else
-		return -EINVAL;
+	RCU_INIT_POINTER(hwpoison_filter_func, NULL);
+	synchronize_rcu();
 }
+EXPORT_SYMBOL_GPL(hwpoison_filter_unregister);
 
-/*
- * This allows stress tests to limit test scope to a collection of tasks
- * by putting them under some memcg. This prevents killing unrelated/important
- * processes such as /sbin/init. Note that the target task may share clean
- * pages with init (eg. libc text), which is harmless. If the target task
- * share _dirty_ pages with another task B, the test scheme must make sure B
- * is also included in the memcg. At last, due to race conditions this filter
- * can only guarantee that the page either belongs to the memcg tasks, or is
- * a freed page.
- */
-#ifdef CONFIG_MEMCG
-u64 hwpoison_filter_memcg;
-EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
-static int hwpoison_filter_task(struct page *p)
+static int hwpoison_filter(struct page *p)
 {
-	if (!hwpoison_filter_memcg)
-		return 0;
+	int ret = 0;
+	hwpoison_filter_func_t *filter;
 
-	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
-		return -EINVAL;
+	rcu_read_lock();
+	filter = rcu_dereference(hwpoison_filter_func);
+	if (filter)
+		ret = filter(p);
+	rcu_read_unlock();
 
-	return 0;
+	return ret;
 }
-#else
-static int hwpoison_filter_task(struct page *p) { return 0; }
-#endif
-
-int hwpoison_filter(struct page *p)
-{
-	if (!hwpoison_filter_enable)
-		return 0;
-
-	if (hwpoison_filter_dev(p))
-		return -EINVAL;
-
-	if (hwpoison_filter_flags(p))
-		return -EINVAL;
-
-	if (hwpoison_filter_task(p))
-		return -EINVAL;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(hwpoison_filter);
-#else
-int hwpoison_filter(struct page *p)
-{
-	return 0;
-}
-#endif
 
 /*
  * Kill all processes that have a poisoned page mapped and then isolate

From 4805ef3707608e04477caeba6a8a0de04d1d77b5 Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Fri, 5 Sep 2025 14:03:58 +0000
Subject: [PATCH 266/372] mm/page_alloc: check the correct buddy if it is a
 starting block

find_large_buddy() searches for a buddy based on start_pfn, which may be
different from the page's pfn, e.g. when the page is not pageblock
aligned, because prep_move_freepages_block() always aligns start_pfn to
the pageblock. This means that when we find a starting block at
start_pfn, it may theoretically check the wrong page, and not split the
free page as it is supposed to, causing a freelist migratetype mismatch.

The good news is the page passed to __move_freepages_block_isolate() has
only two possible cases:

* page is pageblock aligned
* page is __first_valid_page() of this block

So it is safe for the first case, and it won't get a buddy larger than
pageblock for the second case.

To fix the issue, check the returned pfn of find_large_buddy() to decide
whether to split the free page:

1. if it is not a PageBuddy pfn, no split;

2. if it is a PageBuddy pfn but order <= pageblock_order, no split;

3.
if it is a PageBuddy pfn with order > pageblock_order, start_pfn is either in the starting block or tail block, split the PageBuddy at pageblock_order level. Link: https://lkml.kernel.org/r/20250905140358.28849-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Zi Yan Cc: Johannes Weiner Cc: David Hildenbrand Cc: Baolin Wang Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8de5fb5528eb..df6df302d0c5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2090,9 +2090,10 @@ static inline void toggle_pageblock_isolate(struct page *page, bool isolate) static bool __move_freepages_block_isolate(struct zone *zone, struct page *page, bool isolate) { - unsigned long start_pfn, pfn; + unsigned long start_pfn, buddy_pfn; int from_mt; int to_mt; + struct page *buddy; if (isolate == get_pageblock_isolate(page)) { VM_WARN_ONCE(1, "%s a pageblock that is already in that state", @@ -2107,29 +2108,19 @@ static bool __move_freepages_block_isolate(struct zone *zone, if (pageblock_order == MAX_PAGE_ORDER) goto move; - /* We're a tail block in a larger buddy */ - pfn = find_large_buddy(start_pfn); - if (pfn != start_pfn) { - struct page *buddy = pfn_to_page(pfn); + buddy_pfn = find_large_buddy(start_pfn); + buddy = pfn_to_page(buddy_pfn); + /* We're a part of a larger buddy */ + if (PageBuddy(buddy) && buddy_order(buddy) > pageblock_order) { int order = buddy_order(buddy); del_page_from_free_list(buddy, zone, order, - get_pfnblock_migratetype(buddy, pfn)); + get_pfnblock_migratetype(buddy, buddy_pfn)); toggle_pageblock_isolate(page, isolate); - split_large_buddy(zone, buddy, pfn, order, FPI_NONE); + split_large_buddy(zone, buddy, buddy_pfn, order, FPI_NONE); return true; } - /* We're the starting block of a larger buddy */ - if (PageBuddy(page) && buddy_order(page) > pageblock_order) { - int order = buddy_order(page); - - del_page_from_free_list(page, zone, order, - get_pfnblock_migratetype(page, pfn)); - toggle_pageblock_isolate(page, isolate); - split_large_buddy(zone, page, pfn, order, FPI_NONE); - return true; - } move: /* Use MIGRATETYPE_MASK to get non-isolate migratetype */ if (isolate) { From d3f7922b929a92873e14f7c61a026a6b576e8773 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 4 Sep 2025 08:59:25 +1000 Subject: [PATCH 267/372] mm/gup: remove dead pgmap refcounting code Prior to commit aed877c2b425 ("device/dax: properly refcount device dax pages when mapping") ZONE_DEVICE pages were not fully reference counted when mapped into user page tables. Instead GUP would take a reference on the associated pgmap to ensure the results of pfn_to_page() remained valid. This is no longer required and most of the code was removed by commit fd2825b0760a ("mm/gup: remove pXX_devmap usage from get_user_pages()"). Finish cleaning this up by removing the dead calls to put_dev_pagemap() and the temporary context struct. 
Link: https://lkml.kernel.org/r/20250903225926.34702-1-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Reviewed-by: John Hubbard Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/gup.c | 67 ++++++++++++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 83438bbbf2f6..00f30e03f736 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -28,11 +28,6 @@ #include "internal.h" #include "swap.h" -struct follow_page_context { - struct dev_pagemap *pgmap; - unsigned int page_mask; -}; - static inline void sanity_check_pinned_pages(struct page **pages, unsigned long npages) { @@ -653,7 +648,7 @@ static inline bool can_follow_write_pud(pud_t pud, struct page *page, static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, - int flags, struct follow_page_context *ctx) + int flags, unsigned long *page_mask) { struct mm_struct *mm = vma->vm_mm; struct page *page; @@ -680,7 +675,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, if (ret) page = ERR_PTR(ret); else - ctx->page_mask = HPAGE_PUD_NR - 1; + *page_mask = HPAGE_PUD_NR - 1; return page; } @@ -706,7 +701,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { struct mm_struct *mm = vma->vm_mm; pmd_t pmdval = *pmd; @@ -743,7 +738,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; - ctx->page_mask = HPAGE_PMD_NR - 1; + *page_mask = HPAGE_PMD_NR - 1; return page; } @@ -751,7 +746,7 @@ static struct page *follow_huge_pmd(struct vm_area_struct *vma, #else /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */ static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, - int flags, struct follow_page_context *ctx) + int flags, unsigned long *page_mask) { return NULL; } @@ -759,7 +754,7 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, static struct page *follow_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { return NULL; } @@ -805,8 +800,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page, } static struct page *follow_page_pte(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, unsigned int flags, - struct dev_pagemap **pgmap) + unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; struct folio *folio; @@ -904,7 +898,7 @@ no_page: static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { pmd_t *pmd, pmdval; spinlock_t *ptl; @@ -918,7 +912,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, if (!pmd_present(pmdval)) return no_page_table(vma, flags, address); if (likely(!pmd_leaf(pmdval))) - return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + return follow_page_pte(vma, address, pmd, flags); if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags)) return no_page_table(vma, flags, address); @@ -931,16 +925,16 @@ static struct page *follow_pmd_mask(struct 
vm_area_struct *vma, } if (unlikely(!pmd_leaf(pmdval))) { spin_unlock(ptl); - return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + return follow_page_pte(vma, address, pmd, flags); } if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) { spin_unlock(ptl); split_huge_pmd(vma, pmd, address); /* If pmd was left empty, stuff a page table in there quickly */ return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) : - follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); + follow_page_pte(vma, address, pmd, flags); } - page = follow_huge_pmd(vma, address, pmd, flags, ctx); + page = follow_huge_pmd(vma, address, pmd, flags, page_mask); spin_unlock(ptl); return page; } @@ -948,7 +942,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, static struct page *follow_pud_mask(struct vm_area_struct *vma, unsigned long address, p4d_t *p4dp, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { pud_t *pudp, pud; spinlock_t *ptl; @@ -961,7 +955,7 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, return no_page_table(vma, flags, address); if (pud_leaf(pud)) { ptl = pud_lock(mm, pudp); - page = follow_huge_pud(vma, address, pudp, flags, ctx); + page = follow_huge_pud(vma, address, pudp, flags, page_mask); spin_unlock(ptl); if (page) return page; @@ -970,13 +964,13 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, if (unlikely(pud_bad(pud))) return no_page_table(vma, flags, address); - return follow_pmd_mask(vma, address, pudp, flags, ctx); + return follow_pmd_mask(vma, address, pudp, flags, page_mask); } static struct page *follow_p4d_mask(struct vm_area_struct *vma, unsigned long address, pgd_t *pgdp, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { p4d_t *p4dp, p4d; @@ -987,7 +981,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, if (!p4d_present(p4d) || p4d_bad(p4d)) return no_page_table(vma, flags, address); - return follow_pud_mask(vma, address, p4dp, flags, ctx); + return follow_pud_mask(vma, address, p4dp, flags, page_mask); } /** @@ -995,20 +989,16 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour - * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a - * pointer to output page_mask + * @page_mask: a pointer to output page_mask * * @flags can have FOLL_ flags set, defined in * - * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches - * the device's dev_pagemap metadata to avoid repeating expensive lookups. - * * When getting an anonymous page and the caller has to trigger unsharing * of a shared anonymous page first, -EMLINK is returned. The caller should * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only * relevant with FOLL_PIN and !FOLL_WRITE. * - * On output, the @ctx->page_mask is set according to the size of the page. + * On output, @page_mask is set according to the size of the page. 
* * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented @@ -1016,7 +1006,7 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, */ static struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, - struct follow_page_context *ctx) + unsigned long *page_mask) { pgd_t *pgd; struct mm_struct *mm = vma->vm_mm; @@ -1024,13 +1014,13 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, vma_pgtable_walk_begin(vma); - ctx->page_mask = 0; + *page_mask = 0; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) page = no_page_table(vma, flags, address); else - page = follow_p4d_mask(vma, address, pgd, flags, ctx); + page = follow_p4d_mask(vma, address, pgd, flags, page_mask); vma_pgtable_walk_end(vma); @@ -1368,7 +1358,7 @@ static long __get_user_pages(struct mm_struct *mm, { long ret = 0, i = 0; struct vm_area_struct *vma = NULL; - struct follow_page_context ctx = { NULL }; + unsigned long page_mask = 0; if (!nr_pages) return 0; @@ -1410,7 +1400,7 @@ static long __get_user_pages(struct mm_struct *mm, pages ? &page : NULL); if (ret) goto out; - ctx.page_mask = 0; + page_mask = 0; goto next_page; } @@ -1433,7 +1423,7 @@ retry: } cond_resched(); - page = follow_page_mask(vma, start, gup_flags, &ctx); + page = follow_page_mask(vma, start, gup_flags, &page_mask); if (!page || PTR_ERR(page) == -EMLINK) { ret = faultin_page(vma, start, gup_flags, PTR_ERR(page) == -EMLINK, locked); @@ -1466,7 +1456,7 @@ retry: goto out; } next_page: - page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); if (page_increm > nr_pages) page_increm = nr_pages; @@ -1516,8 +1506,6 @@ next_page: nr_pages -= page_increm; } while (nr_pages); out: - if (ctx.pgmap) - put_dev_pagemap(ctx.pgmap); return i ? i : ret; } @@ -2853,7 +2841,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { - struct dev_pagemap *pgmap = NULL; int ret = 0; pte_t *ptep, *ptem; @@ -2926,8 +2913,6 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, ret = 1; pte_unmap: - if (pgmap) - put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } From 614d850efda98e4455e4f2b55e64864f68a4e370 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Thu, 4 Sep 2025 08:59:26 +1000 Subject: [PATCH 268/372] mm/memremap: remove unused get_dev_pagemap() parameter GUP no longer uses get_dev_pagemap(). As it was the only user of the get_dev_pagemap() pgmap caching feature it can be removed. 
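The call-site change is mechanical, as the diff below shows; in sketch
form:

	/* Before: a cached pgmap could be passed in to skip the lookup. */
	pgmap = get_dev_pagemap(pfn, NULL);

	/* After: every lookup goes through the RCU-protected xarray. */
	pgmap = get_dev_pagemap(pfn);
	if (pgmap)
		put_dev_pagemap(pgmap);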
Link: https://lkml.kernel.org/r/20250903225926.34702-2-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Reviewed-by: Dan Williams Cc: John Hubbard Cc: Oscar Salvador Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/memremap.h | 6 ++---- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 2 +- mm/memremap.c | 22 ++++------------------ 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/include/linux/memremap.h b/include/linux/memremap.h index aa1b6aa877a0..e5951ba12a28 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -211,8 +211,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); -struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap); +struct dev_pagemap *get_dev_pagemap(unsigned long pfn); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long memremap_compat_align(void); @@ -234,8 +233,7 @@ static inline void devm_memunmap_pages(struct device *dev, { } -static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) +static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn) { return NULL; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2a95b41e0535..6d9134e3d115 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2194,7 +2194,7 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; if (pfn_valid(pfn)) { - pgmap = get_dev_pagemap(pfn, NULL); + pgmap = get_dev_pagemap(pfn); put_ref_page(pfn, flags); if (pgmap) { res = memory_failure_dev_pagemap(pfn, flags, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 74318c787715..883b8e4d51ba 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -375,7 +375,7 @@ struct page *pfn_to_online_page(unsigned long pfn) * the section may be 'offline' but 'valid'. Only * get_dev_pagemap() can determine sub-section online status. */ - pgmap = get_dev_pagemap(pfn, NULL); + pgmap = get_dev_pagemap(pfn); put_dev_pagemap(pgmap); /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */ diff --git a/mm/memremap.c b/mm/memremap.c index a2d4bb88f64b..46cb1b0b6f72 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -153,14 +153,14 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, "altmap not supported for multiple ranges\n")) return -EINVAL; - conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->start)); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); return -ENOMEM; } - conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end), NULL); + conflict_pgmap = get_dev_pagemap(PHYS_PFN(range->end)); if (conflict_pgmap) { WARN(1, "Conflicting mapping in same section\n"); put_dev_pagemap(conflict_pgmap); @@ -397,26 +397,12 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages); /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn * @pfn: page frame number to lookup page_map - * @pgmap: optional known pgmap that already has a reference - * - * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap - * is non-NULL but does not cover @pfn the reference to it will be released. 
 */
-struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
-		struct dev_pagemap *pgmap)
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn)
 {
+	struct dev_pagemap *pgmap;
 	resource_size_t phys = PFN_PHYS(pfn);
 
-	/*
-	 * In the cached case we're already holding a live reference.
-	 */
-	if (pgmap) {
-		if (phys >= pgmap->range.start && phys <= pgmap->range.end)
-			return pgmap;
-		put_dev_pagemap(pgmap);
-	}
-
-	/* fall back to slow path lookup */
 	rcu_read_lock();
 	pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
 	if (pgmap && !percpu_ref_tryget_live_rcu(&pgmap->ref))

From 87cc51571a7790f50151c4fdca10c43aa18a29ba Mon Sep 17 00:00:00 2001
From: Chris Li
Date: Wed, 17 Sep 2025 00:00:46 +0800
Subject: [PATCH 269/372] docs/mm: add document for swap table

Patch series "mm, swap: introduce swap table as swap cache (phase I)", v4.

This is the first phase of the bigger series implementing basic
infrastructures for the Swap Table idea proposed at the LSF/MM/BPF topic
"Integrate swap cache, swap maps with swap allocator" [1].

To give credit where it is due, this is based on Chris Li's idea and a
prototype of using cluster-sized atomic arrays to implement swap cache.

This phase I contains 15 patches; it introduces the swap table
infrastructure and uses it as the swap cache backend.  By doing so, we
have up to ~5-20% performance gain in throughput, RPS or build time for
benchmark and workload tests.  The speedup is due to less contention on
swap cache access and a shallower swap cache lookup path.  The cluster
size is much finer-grained than the 64M address space split, which is
removed in this phase I.  It also unifies and cleans up the swap code
base.

Each swap cluster will dynamically allocate the swap table, which is an
atomic array covering every swap slot in the cluster.  It replaces the
swap cache backed by XArray.  In phase I, the statically allocated
swap_map still co-exists with the swap table.  The memory usage is about
the same as the original on average.  A few exceptional test cases show
about 1% higher memory usage.  In the following phases of the series,
swap_map will merge into the swap table without additional memory
allocation.  It will result in a net memory reduction compared to the
original swap cache.

Testing has shown that phase I brings a significant performance
improvement, on everything from an 8c/1G ARM machine to 48c96t/128G
x86_64 servers, in many practical workloads.

The full picture with a summary can be found at [2].  An older, bigger
series of 28 patches is posted at [3].

vm-scalability test:
====================
Test with:
usemem --init-time -O -y -x -n 31 1G (4G memcg, PMEM as swap)

                            Before:         After:
System time:                219.12s         158.16s        (-27.82%)
Sum Throughput:             4767.13 MB/s    6128.59 MB/s   (+28.55%)
Single process Throughput:  150.21 MB/s     196.52 MB/s    (+30.83%)
Free latency:               175047.58 us    131411.87 us   (-24.92%)

usemem --init-time -O -y -x -n 32 1536M (16G memory, global pressure,
PMEM as swap)

                            Before:         After:
System time:                356.16s         284.68s        (-20.06%)
Sum Throughput:             4648.35 MB/s    5453.52 MB/s   (+17.32%)
Single process Throughput:  141.63 MB/s     168.35 MB/s    (+18.86%)
Free latency:               499907.71 us    484977.03 us    (-2.99%)

This shows an improvement of more than 20% in most readings.

Build kernel test:
==================
The following result matrix is from building the kernel with defconfig
on tmpfs with ZSWAP / ZRAM, using different memory pressure and setups.
Measuring sys and real time in seconds, less is better (user time is
almost identical as expected):

-j / Mem   | Sys before / after      | Real before / after
Using 16G ZRAM with memcg limit:
6  / 192M  | 9686 / 9472   -2.21%    | 2130 / 2096   -1.59%
12 / 256M  | 6610 / 6451   -2.41%    |  827 /  812   -1.81%
24 / 384M  | 5938 / 5701   -3.37%    |  414 /  405   -2.17%
48 / 768M  | 4696 / 4409   -6.11%    |  188 /  182   -3.19%
With 64k folio:
24 / 512M  | 4222 / 4162   -1.42%    |  326 /  321   -1.53%
48 / 1G    | 3688 / 3622   -1.79%    |  151 /  149   -1.32%
With ZSWAP with 3G memcg (using higher limit due to kmem account):
48 / 3G    |  603 /  581   -3.65%    |   81 /   80   -1.23%

Testing extremely high global memory and schedule pressure: using ZSWAP
with 32G NVMEs in a 48c VM that has 4G memory, no memcg limit, system
components take up about 1.5G already, using make -j48 to build
defconfig:

Before: sys time: 2069.53s           real time: 135.76s
After:  sys time: 2021.13s (-2.34%)  real time: 134.23s (-1.12%)

On another 48c 4G memory VM, using 16G ZRAM as swap, testing make -j48
with the same config:

Before: sys time: 1756.96s           real time: 111.01s
After:  sys time: 1715.90s (-2.34%)  real time: 109.51s (-1.35%)

All cases are more or less faster, with no regression even under
extremely heavy global memory pressure.

Redis / Valkey bench:
=====================
The test machine is an ARM64 VM with 1536M memory and 12 cores, Redis is
set to use 2500M memory, and ZRAM swap size is set to 5G:

Testing with: redis-benchmark -r 2000000 -n 2000000 -d 1024 -c 12 -P 32 -t get
        no BGSAVE                with BGSAVE
Before: 487576.06 RPS            280016.02 RPS
After:  487541.76 RPS (-0.01%)   300155.32 RPS (+7.19%)

Testing with: redis-benchmark -r 2500000 -n 2500000 -d 1024 -c 12 -P 32 -t get
        no BGSAVE                with BGSAVE
Before: 466789.59 RPS            281213.92 RPS
After:  466402.89 RPS (-0.08%)   298411.84 RPS (+6.12%)

With BGSAVE enabled, most Redis memory will have a swap count > 1, so
the swap cache is heavily in use.  We can see about a 6% performance
gain.  No BGSAVE is very slightly slower (<0.1%) due to the higher
memory pressure from the co-existence of swap_map and swap table.  This
will be optimized into a net gain, and up to a 20% gain in the BGSAVE
case, in the following phases.

HDD swap is also ~40% faster with usemem because we removed an old
contention workaround.

This patch (of 15):

Swap table is the new swap cache.
[chrisl@kernel.org: move swap table document, redo swap table size sentence]
Link: https://lkml.kernel.org/r/CACePvbXjaUyzB_9RSSSgR6BNvz+L9anvn0vcNf_J0jD7-4Yy6Q@mail.gmail.com
Link: https://lkml.kernel.org/r/20250916160100.31545-1-ryncsn@gmail.com
Link: https://lore.kernel.org/linux-mm/20250514201729.48420-1-ryncsn@gmail.com/ [3]
Link: https://lkml.kernel.org/r/20250916160100.31545-2-ryncsn@gmail.com
Link: https://lore.kernel.org/CAMgjq7BvQ0ZXvyLGp2YP96+i+6COCBBJCYmjXHGBnfisCAb8VA@mail.gmail.com [1]
Link: https://github.com/ryncsn/linux/tree/kasong/devel/swap-table [2]
Signed-off-by: Chris Li
Signed-off-by: Kairui Song
Suggested-by: Chris Li
Cc: Baolin Wang
Cc: Baoquan He
Cc: Barry Song
Cc: David Hildenbrand
Cc: "Huang, Ying"
Cc: Hugh Dickins
Cc: Johannes Weiner
Cc: Kemeng Shi
Cc: Lorenzo Stoakes
Cc: Matthew Wilcox (Oracle)
Cc: Nhat Pham
Cc: Yosry Ahmed
Cc: Zi Yan
Cc: kernel test robot
Cc: SeongJae Park
Signed-off-by: Andrew Morton
---
 Documentation/mm/index.rst      |  1 +
 Documentation/mm/swap-table.rst | 69 +++++++++++++++++++++++++++++++++
 MAINTAINERS                     |  1 +
 3 files changed, 71 insertions(+)
 create mode 100644 Documentation/mm/swap-table.rst

diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst
index fb45acba16ac..ba6a8872849b 100644
--- a/Documentation/mm/index.rst
+++ b/Documentation/mm/index.rst
@@ -20,6 +20,7 @@ see the :doc:`admin guide <../admin-guide/mm/index>`.
    highmem
    page_reclaim
    swap
+   swap-table
    page_cache
    shmfs
    oom
diff --git a/Documentation/mm/swap-table.rst b/Documentation/mm/swap-table.rst
new file mode 100644
index 000000000000..da10bb7a0dc3
--- /dev/null
+++ b/Documentation/mm/swap-table.rst
@@ -0,0 +1,69 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+:Author: Chris Li, Kairui Song
+
+==========
+Swap Table
+==========
+
+The swap table implements the swap cache as a per-cluster swap cache
+value array.
+
+Swap Entry
+----------
+
+A swap entry contains the information required to serve the anonymous page
+fault.
+
+A swap entry is encoded as two parts: the swap type and the swap offset.
+
+The swap type indicates which swap device to use.
+The swap offset is the offset of the swap file to read the page data from.
+
+Swap Cache
+----------
+
+The swap cache is a map for looking up folios using a swap entry as the
+key. The resulting value can have three possible types, depending on
+which stage the swap entry is in.
+
+1. NULL: This swap entry is not used.
+
+2. folio: A folio has been allocated and bound to this swap entry. This is
+   the transient state of swap out or swap in. The folio data can be in
+   the folio or the swap file, or both.
+
+3. shadow: The shadow contains the working set information of the swapped
+   out folio. This is the normal state for a swapped out page.
+
+Swap Table Internals
+--------------------
+
+The previous swap cache is implemented by XArray. The XArray is a tree
+structure. Each lookup will go through multiple nodes. Can we do better?
+
+Notice that most of the time when we look up the swap cache, we are either
+in a swap in or swap out path. We should already have the swap cluster,
+which contains the swap entry.
+
+If we have a per-cluster array to store the swap cache values in the
+cluster, swap cache lookup within the cluster can be a very simple array
+lookup.
+
+We give such a per-cluster swap cache value array a name: the swap table.
+
+A swap table is an array of pointers. Each pointer is the same size as a
+PTE. The size of a swap table for one swap cluster typically matches a PTE
+page table, which is one page on modern 64-bit systems.
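+
+As an illustration only (assumed names, not the kernel's actual
+definitions; SWAPFILE_CLUSTER is taken as the 256-slot default used when
+THP swap is disabled), the idea can be sketched as::
+
+    /* One pointer-sized slot per swap slot in the cluster. */
+    struct swap_table {
+            atomic_long_t entries[SWAPFILE_CLUSTER];
+    };
+
+    /* Lookup degenerates to plain array indexing, no tree walk. */
+    static void *swap_table_lookup(struct swap_table *table,
+                                   unsigned long offset)
+    {
+            return (void *)atomic_long_read(
+                    &table->entries[offset % SWAPFILE_CLUSTER]);
+    }
+
+The value read this way is then interpreted as NULL, a folio, or a
+shadow, as described above.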
+
+With the swap table, swap cache lookup can achieve great locality, and
+is simpler and faster.
+
+Locking
+-------
+
+Swap table modification requires taking the cluster lock. If a folio
+is being added to or removed from the swap table, the folio must be
+locked prior to the cluster lock. After adding or removing is done, the
+folio shall be unlocked.
+
+Swap table lookup is protected by RCU and atomic read. If the lookup
+returns a folio, the user must lock the folio before use.
diff --git a/MAINTAINERS b/MAINTAINERS
index 68d29f0220fc..3d113bfc3c82 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16225,6 +16225,7 @@ R: Barry Song
 R: Chris Li
 L: linux-mm@kvack.org
 S: Maintained
+F: Documentation/mm/swap-table.rst
 F: include/linux/swap.h
 F: include/linux/swapfile.h
 F: include/linux/swapops.h

From f28124617f34153b34d7716eaff084e25a2d71fa Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Wed, 17 Sep 2025 00:00:47 +0800
Subject: [PATCH 270/372] mm, swap: use unified helper for swap cache look up

The swap cache lookup helper swap_cache_get_folio currently does
readahead updates as well, so callers that are not doing swapin from any
VMA or mapping are forced to reuse filemap helpers instead, and have to
access the swap cache space directly.

So decouple the readahead update from the swap cache lookup.  Move the
readahead update part into a standalone helper, and let callers invoke
that helper themselves if they do readahead.  And convert all swap cache
lookups to use swap_cache_get_folio.

After this commit, there are only three special cases for accessing swap
cache space now: huge memory splitting, migration, and shmem replacing,
because they need to lock the XArray.  The following commits will wrap
their accesses to the swap cache too, with special helpers.

Worth noting, currently dropbehind is not supported for anon folios, and
we will never see a dropbehind folio in the swap cache.  The unified
helper can be updated later to handle that.

While at it, add proper kerneldoc for the touched helpers.

No functional change.
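As an illustrative sketch of the resulting calling convention
(simplified from the do_swap_page() hunk below), lookup and readahead
accounting are now two separate steps:

	folio = swap_cache_get_folio(entry);
	if (folio)
		/* Only swapin paths doing readahead opt in to this. */
		swap_update_readahead(folio, vma, vmf->address);

Callers such as mincore() that merely probe the swap cache simply skip
the swap_update_readahead() call.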
Link: https://lkml.kernel.org/r/20250916160100.31545-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Reviewed-by: Barry Song Acked-by: David Hildenbrand Acked-by: Chris Li Acked-by: Nhat Pham Suggested-by: Chris Li Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/memory.c | 6 ++- mm/mincore.c | 3 +- mm/shmem.c | 4 +- mm/swap.h | 13 ++++-- mm/swap_state.c | 105 +++++++++++++++++++++++++---------------------- mm/swapfile.c | 11 +++-- mm/userfaultfd.c | 5 +-- 7 files changed, 79 insertions(+), 68 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d9de6c056179..10ef528a5f44 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4660,9 +4660,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - folio = swap_cache_get_folio(entry, vma, vmf->address); - if (folio) + folio = swap_cache_get_folio(entry); + if (folio) { + swap_update_readahead(folio, vma, vmf->address); page = folio_file_page(folio, swp_offset(entry)); + } swapcache = folio; if (!folio) { diff --git a/mm/mincore.c b/mm/mincore.c index 2f3e1816a30d..8ec4719370e1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -76,8 +76,7 @@ static unsigned char mincore_swap(swp_entry_t entry, bool shmem) if (!si) return 0; } - folio = filemap_get_entry(swap_address_space(entry), - swap_cache_index(entry)); + folio = swap_cache_get_folio(entry); if (shmem) put_swap_device(si); /* The swap cache space contains either folio, shadow or NULL */ diff --git a/mm/shmem.c b/mm/shmem.c index 29e1eb690125..410f27bc4752 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2317,7 +2317,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } /* Look it up and read it in.. 
*/ - folio = swap_cache_get_folio(swap, NULL, 0); + folio = swap_cache_get_folio(swap); if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO)) { /* Direct swapin skipping swap cache & readahead */ @@ -2342,6 +2342,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_vm_event(PGMAJFAULT); count_memcg_event_mm(fault_mm, PGMAJFAULT); } + } else { + swap_update_readahead(folio, NULL, 0); } if (order > folio_order(folio)) { diff --git a/mm/swap.h b/mm/swap.h index 1ae44d4193b1..efb6d7ff9f30 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -62,8 +62,7 @@ void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); -struct folio *swap_cache_get_folio(swp_entry_t entry, - struct vm_area_struct *vma, unsigned long addr); +struct folio *swap_cache_get_folio(swp_entry_t entry); struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); @@ -74,6 +73,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, struct mempolicy *mpol, pgoff_t ilx); struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); +void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr); static inline unsigned int folio_swap_flags(struct folio *folio) { @@ -159,6 +160,11 @@ static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, return NULL; } +static inline void swap_update_readahead(struct folio *folio, + struct vm_area_struct *vma, unsigned long addr) +{ +} + static inline int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug) { @@ -169,8 +175,7 @@ static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entr { } -static inline struct folio *swap_cache_get_folio(swp_entry_t entry, - struct vm_area_struct *vma, unsigned long addr) +static inline struct folio *swap_cache_get_folio(swp_entry_t entry) { return NULL; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 99513b74b5d8..68ec531d0f2b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -69,6 +69,27 @@ void show_swap_cache_info(void) printk("Total swap = %lukB\n", K(total_swap_pages)); } +/** + * swap_cache_get_folio - Looks up a folio in the swap cache. + * @entry: swap entry used for the lookup. + * + * A found folio will be returned unlocked and with its refcount increased. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. + * Return: Returns the found folio on success, NULL otherwise. The caller + * must lock and check if the folio still matches the swap entry before + * use. + */ +struct folio *swap_cache_get_folio(swp_entry_t entry) +{ + struct folio *folio = filemap_get_folio(swap_address_space(entry), + swap_cache_index(entry)); + if (IS_ERR(folio)) + return NULL; + return folio; +} + void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); @@ -272,55 +293,43 @@ static inline bool swap_use_vma_readahead(void) return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); } -/* - * Lookup a swap entry in the swap cache. A found folio will be returned - * unlocked and with its refcount incremented - we rely on the kernel - * lock getting page table operations atomic even if we drop the folio - * lock before returning. 
- * - * Caller must lock the swap device or hold a reference to keep it valid. +/** + * swap_update_readahead - Update the readahead statistics of VMA or globally. + * @folio: the swap cache folio that just got hit. + * @vma: the VMA that should be updated, could be NULL for global update. + * @addr: the addr that triggered the swapin, ignored if @vma is NULL. */ -struct folio *swap_cache_get_folio(swp_entry_t entry, - struct vm_area_struct *vma, unsigned long addr) +void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr) { - struct folio *folio; + bool readahead, vma_ra = swap_use_vma_readahead(); - folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry)); - if (!IS_ERR(folio)) { - bool vma_ra = swap_use_vma_readahead(); - bool readahead; + /* + * At the moment, we don't support PG_readahead for anon THP + * so let's bail out rather than confusing the readahead stat. + */ + if (unlikely(folio_test_large(folio))) + return; - /* - * At the moment, we don't support PG_readahead for anon THP - * so let's bail out rather than confusing the readahead stat. - */ - if (unlikely(folio_test_large(folio))) - return folio; + readahead = folio_test_clear_readahead(folio); + if (vma && vma_ra) { + unsigned long ra_val; + int win, hits; - readahead = folio_test_clear_readahead(folio); - if (vma && vma_ra) { - unsigned long ra_val; - int win, hits; - - ra_val = GET_SWAP_RA_VAL(vma); - win = SWAP_RA_WIN(ra_val); - hits = SWAP_RA_HITS(ra_val); - if (readahead) - hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); - atomic_long_set(&vma->swap_readahead_info, - SWAP_RA_VAL(addr, win, hits)); - } - - if (readahead) { - count_vm_event(SWAP_RA_HIT); - if (!vma || !vma_ra) - atomic_inc(&swapin_readahead_hits); - } - } else { - folio = NULL; + ra_val = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); + if (readahead) + hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); + atomic_long_set(&vma->swap_readahead_info, + SWAP_RA_VAL(addr, win, hits)); } - return folio; + if (readahead) { + count_vm_event(SWAP_RA_HIT); + if (!vma || !vma_ra) + atomic_inc(&swapin_readahead_hits); + } } struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, @@ -336,14 +345,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, *new_page_allocated = false; for (;;) { int err; - /* - * First check the swap cache. Since this is normally - * called after swap_cache_get_folio() failed, re-calling - * that would confuse statistics. 
-	 */
-	folio = filemap_get_folio(swap_address_space(entry),
-					swap_cache_index(entry));
-	if (!IS_ERR(folio))
+
+		/* Check the swap cache in case the folio is already there */
+		folio = swap_cache_get_folio(entry);
+		if (folio)
 			goto got_folio;
 
 		/*
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a7ffabbe65ef..4b8ab2cb49ca 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -213,15 +213,14 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
 				 unsigned long offset, unsigned long flags)
 {
 	swp_entry_t entry = swp_entry(si->type, offset);
-	struct address_space *address_space = swap_address_space(entry);
 	struct swap_cluster_info *ci;
 	struct folio *folio;
 	int ret, nr_pages;
 	bool need_reclaim;
 
 again:
-	folio = filemap_get_folio(address_space, swap_cache_index(entry));
-	if (IS_ERR(folio))
+	folio = swap_cache_get_folio(entry);
+	if (!folio)
 		return 0;
 
 	nr_pages = folio_nr_pages(folio);
@@ -2131,7 +2130,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		pte_unmap(pte);
 		pte = NULL;
 
-		folio = swap_cache_get_folio(entry, vma, addr);
+		folio = swap_cache_get_folio(entry);
 		if (!folio) {
 			struct vm_fault vmf = {
 				.vma = vma,
@@ -2357,8 +2356,8 @@ retry:
 	       (i = find_next_to_unuse(si, i)) != 0) {
 
 		entry = swp_entry(type, i);
-		folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
-		if (IS_ERR(folio))
+		folio = swap_cache_get_folio(entry);
+		if (!folio)
 			continue;
 
 		/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 50aaa8dcd24c..af61b95c89e4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1489,9 +1489,8 @@ retry:
 		 * separately to allow proper handling.
 		 */
 		if (!src_folio)
-			folio = filemap_get_folio(swap_address_space(entry),
-					swap_cache_index(entry));
-		if (!IS_ERR_OR_NULL(folio)) {
+			folio = swap_cache_get_folio(entry);
+		if (folio) {
 			if (folio_test_large(folio)) {
 				ret = -EBUSY;
 				folio_put(folio);

From a733d8de7f1ccbf093ce2fde424616e529073876 Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Wed, 17 Sep 2025 00:00:48 +0800
Subject: [PATCH 271/372] mm, swap: fix swap cache index error when retrying
 reclaim

The allocator will reclaim cached slots while scanning.  Currently, it
will try again if reclaim finds a folio that has already been removed
from the swap cache due to a race.  But the following lookup will use
the wrong index.  It won't cause any OOB issue since the swap cache
index is truncated upon lookup, but it may lead to reclaiming an
irrelevant folio.

This should not cause a measurable issue, but we should fix it.
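A reduced sketch of the buggy retry flow (simplified from
__try_to_reclaim_swap(), with irrelevant details dropped):

	swp_entry_t entry = swp_entry(si->type, offset);
again:
	folio = swap_cache_get_folio(entry);
	...
	entry = folio->swap;	/* clobbers the slot being reclaimed */
	if (/* offset outside folio */) {
		...
		goto again;	/* now looks up the folio's entry, not ours */
	}

Keeping `entry` const and comparing against folio->swap directly ties
every retry to the original slot.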
Link: https://lkml.kernel.org/r/20250916160100.31545-4-ryncsn@gmail.com
Fixes: fae859550531 ("mm, swap: avoid reclaiming irrelevant swap cache")
Signed-off-by: Kairui Song
Reviewed-by: Baolin Wang
Acked-by: Nhat Pham
Acked-by: Chris Li
Acked-by: David Hildenbrand
Suggested-by: Chris Li
Cc: Baoquan He
Cc: Barry Song
Cc: "Huang, Ying"
Cc: Hugh Dickins
Cc: Johannes Weiner
Cc: Kemeng Shi
Cc: kernel test robot
Cc: Lorenzo Stoakes
Cc: Matthew Wilcox (Oracle)
Cc: Yosry Ahmed
Cc: Zi Yan
Cc: SeongJae Park
Signed-off-by: Andrew Morton
---
 mm/swapfile.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4b8ab2cb49ca..4baebd8b48f4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -212,7 +212,7 @@ static bool swap_is_last_map(struct swap_info_struct *si,
 static int __try_to_reclaim_swap(struct swap_info_struct *si,
 				 unsigned long offset, unsigned long flags)
 {
-	swp_entry_t entry = swp_entry(si->type, offset);
+	const swp_entry_t entry = swp_entry(si->type, offset);
 	struct swap_cluster_info *ci;
 	struct folio *folio;
 	int ret, nr_pages;
@@ -240,13 +240,13 @@ again:
 	 * Offset could point to the middle of a large folio, or folio
 	 * may no longer point to the expected offset before it's locked.
 	 */
-	entry = folio->swap;
-	if (offset < swp_offset(entry) || offset >= swp_offset(entry) + nr_pages) {
+	if (offset < swp_offset(folio->swap) ||
+	    offset >= swp_offset(folio->swap) + nr_pages) {
 		folio_unlock(folio);
 		folio_put(folio);
 		goto again;
 	}
-	offset = swp_offset(entry);
+	offset = swp_offset(folio->swap);
 
 	need_reclaim = ((flags & TTRS_ANYWAY) ||
 			((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||

From 3518b931df0cc57b1b97e5c3a307ad7d5fe17650 Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Wed, 17 Sep 2025 00:00:49 +0800
Subject: [PATCH 272/372] mm, swap: check page poison flag after locking it

Instead of checking the poison flag only in the fast swap cache lookup
path, always check the poison flag after locking a swap cache folio.

There are two reasons to do so.  The folio is unstable and could be
removed from the swap cache anytime, so it's totally possible that the
folio is no longer the backing folio of the swap entry, and could be an
irrelevant poisoned folio.  We might mistakenly kill a faulting process.

And it's totally possible, or even common, for the slow swapin path
(swapin_readahead) to bring in a cached folio.  The cached folio could
be poisoned, too.  Only checking the poison flag in the fast path will
miss such folios.

The race window is tiny, so it's very unlikely to happen, though.  While
at it, also add an unlikely prefix.
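The resulting order in do_swap_page(), sketched below (simplified from
the hunk that follows): only after the folio is locked and verified can
the poison flag be trusted.

	ret |= folio_lock_or_retry(folio, vmf);
	if (ret & VM_FAULT_RETRY)
		goto out_release;

	page = folio_file_page(folio, swp_offset(entry));
	/* First make sure this is still the folio backing the entry... */
	if (unlikely(!folio_test_swapcache(folio) ||
		     page_swap_entry(page).val != entry.val))
		goto out_page;
	/* ...then the poison check is known to apply to the right page. */
	if (unlikely(PageHWPoison(page))) {
		ret = VM_FAULT_HWPOISON;
		goto out_page;
	}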
Link: https://lkml.kernel.org/r/20250916160100.31545-5-ryncsn@gmail.com
Signed-off-by: Kairui Song
Acked-by: Chris Li
Acked-by: David Hildenbrand
Acked-by: Nhat Pham
Suggested-by: Chris Li
Cc: Baolin Wang
Cc: Baoquan He
Cc: Barry Song
Cc: "Huang, Ying"
Cc: Hugh Dickins
Cc: Johannes Weiner
Cc: Kemeng Shi
Cc: kernel test robot
Cc: Lorenzo Stoakes
Cc: Matthew Wilcox (Oracle)
Cc: Yosry Ahmed
Cc: Zi Yan
Cc: SeongJae Park
Signed-off-by: Andrew Morton
---
 mm/memory.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 10ef528a5f44..94a5928e8ace 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4661,10 +4661,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		goto out;
 
 	folio = swap_cache_get_folio(entry);
-	if (folio) {
+	if (folio)
 		swap_update_readahead(folio, vma, vmf->address);
-		page = folio_file_page(folio, swp_offset(entry));
-	}
 	swapcache = folio;
 
 	if (!folio) {
@@ -4735,20 +4733,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 		ret = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
 		count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
-		page = folio_file_page(folio, swp_offset(entry));
-	} else if (PageHWPoison(page)) {
-		/*
-		 * hwpoisoned dirty swapcache pages are kept for killing
-		 * owner processes (which may be unknown at hwpoison time)
-		 */
-		ret = VM_FAULT_HWPOISON;
-		goto out_release;
 	}
 
 	ret |= folio_lock_or_retry(folio, vmf);
 	if (ret & VM_FAULT_RETRY)
 		goto out_release;
 
+	page = folio_file_page(folio, swp_offset(entry));
 	if (swapcache) {
 		/*
 		 * Make sure folio_free_swap() or swapoff did not release the
@@ -4761,6 +4752,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			     page_swap_entry(page).val != entry.val))
 			goto out_page;
 
+		if (unlikely(PageHWPoison(page))) {
+			/*
+			 * hwpoisoned dirty swapcache pages are kept for killing
+			 * owner processes (which may be unknown at hwpoison time)
+			 */
+			ret = VM_FAULT_HWPOISON;
+			goto out_page;
+		}
+
 		/*
 		 * KSM sometimes has to copy on read faults, for example, if
 		 * folio->index of non-ksm folios would be nonlinear inside the

From ae38eb210590ed69fa4f70f29959df76a4bdc4da Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Wed, 17 Sep 2025 00:00:50 +0800
Subject: [PATCH 273/372] mm, swap: always lock and check the swap cache folio
 before use

Swap cache lookup only increases the reference count of the returned
folio.  That's not enough to ensure a folio is stable in the swap cache,
so the folio could be removed from the swap cache at any time.  The
caller should always lock and check the folio before using it.

We have just documented this in kerneldoc; now introduce a helper for
swap cache folio verification with proper sanity checks.

Also, sanitize a few current users to use this convention and the new
helper for easier debugging.  They have not shown observable problems
yet, only trivial issues like wasted CPU cycles on swapoff or reclaim.
They would fail in some other way, but it is still better to always
follow this convention to make things robust and to make later commits
easier.
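The convention, as a minimal usage sketch (the helper itself is defined
in the mm/swap.h hunk below):

	folio = swap_cache_get_folio(entry);
	if (!folio)
		return;
	folio_lock(folio);
	if (!folio_matches_swap_entry(folio, entry)) {
		/* Raced with removal or reuse: back off or retry the lookup. */
		folio_unlock(folio);
		folio_put(folio);
		return;
	}
	/* The folio is now stable and known to back @entry. */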
Link: https://lkml.kernel.org/r/20250916160100.31545-6-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: David Hildenbrand Acked-by: Chris Li Acked-by: Nhat Pham Suggested-by: Chris Li Reviewed-by: Barry Song Cc: Baolin Wang Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/memory.c | 3 +-- mm/swap.h | 27 +++++++++++++++++++++++++++ mm/swap_state.c | 7 +++++-- mm/swapfile.c | 10 ++++++++-- 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 94a5928e8ace..5808c4ef21b3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4748,8 +4748,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache, we need to check that the page's swap has not * changed. */ - if (unlikely(!folio_test_swapcache(folio) || - page_swap_entry(page).val != entry.val)) + if (unlikely(!folio_matches_swap_entry(folio, entry))) goto out_page; if (unlikely(PageHWPoison(page))) { diff --git a/mm/swap.h b/mm/swap.h index efb6d7ff9f30..7d868f8de696 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -52,6 +52,28 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; } +/** + * folio_matches_swap_entry - Check if a folio matches a given swap entry. + * @folio: The folio. + * @entry: The swap entry to check against. + * + * Context: The caller should have the folio locked to ensure it's stable + * and nothing will move it in or out of the swap cache. + * Return: true or false. + */ +static inline bool folio_matches_swap_entry(const struct folio *folio, + swp_entry_t entry) +{ + swp_entry_t folio_entry = folio->swap; + long nr_pages = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + if (!folio_test_swapcache(folio)) + return false; + VM_WARN_ON_ONCE_FOLIO(!IS_ALIGNED(folio_entry.val, nr_pages), folio); + return folio_entry.val == round_down(entry.val, nr_pages); +} + void show_swap_cache_info(void); void *get_shadow_from_swap_cache(swp_entry_t entry); int add_to_swap_cache(struct folio *folio, swp_entry_t entry, @@ -144,6 +166,11 @@ static inline pgoff_t swap_cache_index(swp_entry_t entry) return 0; } +static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) +{ + return false; +} + static inline void show_swap_cache_info(void) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 68ec531d0f2b..9225d6b695ad 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -79,7 +79,7 @@ void show_swap_cache_info(void) * with reference count or locks. * Return: Returns the found folio on success, NULL otherwise. The caller * must lock and check if the folio still matches the swap entry before - * use. + * use (e.g. with folio_matches_swap_entry). */ struct folio *swap_cache_get_folio(swp_entry_t entry) { @@ -346,7 +346,10 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, for (;;) { int err; - /* Check the swap cache in case the folio is already there */ + /* + * Check the swap cache first, if a cached folio is found, + * return it unlocked. The caller will lock and check it. 
+ */ folio = swap_cache_get_folio(entry); if (folio) goto got_folio; diff --git a/mm/swapfile.c b/mm/swapfile.c index 4baebd8b48f4..c3c3364cb42e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -240,8 +240,7 @@ again: * Offset could point to the middle of a large folio, or folio * may no longer point to the expected offset before it's locked. */ - if (offset < swp_offset(folio->swap) || - offset >= swp_offset(folio->swap) + nr_pages) { + if (!folio_matches_swap_entry(folio, entry)) { folio_unlock(folio); folio_put(folio); goto again; @@ -2004,6 +2003,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, bool hwpoisoned = false; int ret = 1; + /* + * If the folio is removed from swap cache by others, continue to + * unuse other PTEs. try_to_unuse may try again if we missed this one. + */ + if (!folio_matches_swap_entry(folio, entry)) + return 0; + swapcache = folio; folio = ksm_might_need_to_copy(folio, vma, addr); if (unlikely(!folio)) From 4522aed4fffbbd18ab3581d733d0572d45780d07 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:51 +0800 Subject: [PATCH 274/372] mm, swap: rename and move some swap cluster definition and helpers No feature change, move cluster related definitions and helpers to mm/swap.h, also tidy up and add a "swap_" prefix for cluster lock/unlock helpers, so they can be used outside of swap files. And while at it, add kerneldoc. Link: https://lkml.kernel.org/r/20250916160100.31545-7-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Reviewed-by: Barry Song Acked-by: Chris Li Acked-by: David Hildenbrand Suggested-by: Chris Li Acked-by: Nhat Pham Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/swap.h | 34 ---------------- mm/swap.h | 70 ++++++++++++++++++++++++++++++++ mm/swapfile.c | 97 +++++++++++++------------------------------- 3 files changed, 99 insertions(+), 102 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index a2bb20841616..78cc48a65512 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,40 +235,6 @@ enum { /* Special value in each swap_map continuation */ #define SWAP_CONT_MAX 0x7f /* Max count */ -/* - * We use this to track usage of a cluster. A cluster is a block of swap disk - * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All - * free clusters are organized into a list. We fetch an entry from the list to - * get a free cluster. - * - * The flags field determines if a cluster is free. This is - * protected by cluster lock. - */ -struct swap_cluster_info { - spinlock_t lock; /* - * Protect swap_cluster_info fields - * other than list, and swap_info_struct->swap_map - * elements corresponding to the swap cluster. - */ - u16 count; - u8 flags; - u8 order; - struct list_head list; -}; - -/* All on-list cluster must have a non-zero flag. */ -enum swap_cluster_flags { - CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ - CLUSTER_FLAG_FREE, - CLUSTER_FLAG_NONFULL, - CLUSTER_FLAG_FRAG, - /* Clusters with flags above are allocatable */ - CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, - CLUSTER_FLAG_FULL, - CLUSTER_FLAG_DISCARD, - CLUSTER_FLAG_MAX, -}; - /* * The first page in the swap file is the swap header, which is always marked * bad to prevent it from being allocated as an entry. 
This also prevents the diff --git a/mm/swap.h b/mm/swap.h index 7d868f8de696..138b5197c35e 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -7,10 +7,80 @@ struct swap_iocb; extern int page_cluster; +#ifdef CONFIG_THP_SWAP +#define SWAPFILE_CLUSTER HPAGE_PMD_NR +#define swap_entry_order(order) (order) +#else +#define SWAPFILE_CLUSTER 256 +#define swap_entry_order(order) 0 +#endif + +/* + * We use this to track usage of a cluster. A cluster is a block of swap disk + * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * + * The flags field determines if a cluster is free. This is + * protected by cluster lock. + */ +struct swap_cluster_info { + spinlock_t lock; /* + * Protect swap_cluster_info fields + * other than list, and swap_info_struct->swap_map + * elements corresponding to the swap cluster. + */ + u16 count; + u8 flags; + u8 order; + struct list_head list; +}; + +/* All on-list cluster must have a non-zero flag. */ +enum swap_cluster_flags { + CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */ + CLUSTER_FLAG_FREE, + CLUSTER_FLAG_NONFULL, + CLUSTER_FLAG_FRAG, + /* Clusters with flags above are allocatable */ + CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG, + CLUSTER_FLAG_FULL, + CLUSTER_FLAG_DISCARD, + CLUSTER_FLAG_MAX, +}; + #ifdef CONFIG_SWAP #include /* for swp_offset */ #include /* for bio_end_io_t */ +static inline struct swap_cluster_info *swp_offset_cluster( + struct swap_info_struct *si, pgoff_t offset) +{ + return &si->cluster_info[offset / SWAPFILE_CLUSTER]; +} + +/** + * swap_cluster_lock - Lock and return the swap cluster of given offset. + * @si: swap device the cluster belongs to. + * @offset: the swap entry offset, pointing to a valid slot. + * + * Context: The caller must ensure the offset is in the valid range and + * protect the swap device with reference count or locks. + */ +static inline struct swap_cluster_info *swap_cluster_lock( + struct swap_info_struct *si, unsigned long offset) +{ + struct swap_cluster_info *ci = swp_offset_cluster(si, offset); + + spin_lock(&ci->lock); + return ci; +} + +static inline void swap_cluster_unlock(struct swap_cluster_info *ci) +{ + spin_unlock(&ci->lock); +} + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; diff --git a/mm/swapfile.c b/mm/swapfile.c index c3c3364cb42e..700e07cb1cbd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -58,9 +58,6 @@ static void swap_entries_free(struct swap_info_struct *si, static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset); -static inline void unlock_cluster(struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -258,9 +255,9 @@ again: * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. 
*/ - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); need_reclaim = swap_only_has_cache(si, offset, nr_pages); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (!need_reclaim) goto out_unlock; @@ -385,19 +382,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -#ifdef CONFIG_THP_SWAP -#define SWAPFILE_CLUSTER HPAGE_PMD_NR - -#define swap_entry_order(order) (order) -#else -#define SWAPFILE_CLUSTER 256 - -/* - * Define swap_entry_order() as constant to let compiler to optimize - * out some code if !CONFIG_THP_SWAP - */ -#define swap_entry_order(order) 0 -#endif #define LATENCY_LIMIT 256 static inline bool cluster_is_empty(struct swap_cluster_info *info) @@ -425,34 +409,12 @@ static inline unsigned int cluster_index(struct swap_info_struct *si, return ci - si->cluster_info; } -static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - return &si->cluster_info[offset / SWAPFILE_CLUSTER]; -} - static inline unsigned int cluster_offset(struct swap_info_struct *si, struct swap_cluster_info *ci) { return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, - unsigned long offset) -{ - struct swap_cluster_info *ci; - - ci = offset_to_cluster(si, offset); - spin_lock(&ci->lock); - - return ci; -} - -static inline void unlock_cluster(struct swap_cluster_info *ci) -{ - spin_unlock(&ci->lock); -} - static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) @@ -808,7 +770,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, } out: relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (si->flags & SWP_SOLIDSTATE) { this_cpu_write(percpu_swap_cluster.offset[order], next); this_cpu_write(percpu_swap_cluster.si[order], si); @@ -875,7 +837,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (ci->flags == CLUSTER_FLAG_NONE) relocate_cluster(si, ci); - unlock_cluster(ci); + swap_cluster_unlock(ci); if (to_scan <= 0) break; } @@ -914,7 +876,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o if (offset == SWAP_ENTRY_INVALID) goto new_cluster; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); /* Cluster could have been used by another order */ if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) @@ -922,7 +884,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o found = alloc_swap_scan_cluster(si, ci, offset, order, usage); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } if (found) goto done; @@ -1203,7 +1165,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (!si || !offset || !get_swap_device_info(si)) return false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (cluster_is_usable(ci, order)) { if (cluster_is_empty(ci)) offset = cluster_offset(si, ci); @@ -1211,7 +1173,7 @@ static bool swap_alloc_fast(swp_entry_t *entry, if (found) *entry = swp_entry(si->type, found); } else { - unlock_cluster(ci); + swap_cluster_unlock(ci); } put_swap_device(si); @@ -1479,14 +1441,14 @@ static void swap_entries_put_cache(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); struct swap_cluster_info *ci; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, nr)) + ci = swap_cluster_lock(si, offset); + if 
(swap_only_has_cache(si, offset, nr)) { swap_entries_free(si, ci, entry, nr); - else { + } else { for (int i = 0; i < nr; i++, entry.val++) swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); } - unlock_cluster(ci); + swap_cluster_unlock(ci); } static bool swap_entries_put_map(struct swap_info_struct *si, @@ -1504,7 +1466,7 @@ static bool swap_entries_put_map(struct swap_info_struct *si, if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { goto locked_fallback; } @@ -1513,21 +1475,20 @@ static bool swap_entries_put_map(struct swap_info_struct *si, else for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; fallback: - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); locked_fallback: for (i = 0; i < nr; i++, entry.val++) { count = swap_entry_put_locked(si, ci, entry, 1); if (count == SWAP_HAS_CACHE) has_cache = true; } - unlock_cluster(ci); + swap_cluster_unlock(ci); return has_cache; - } /* @@ -1577,7 +1538,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != swp_offset_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1652,9 +1613,9 @@ bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster(ci); + swap_cluster_unlock(ci); return !!count; } @@ -1677,7 +1638,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1700,7 +1661,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return count; } @@ -1715,7 +1676,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; @@ -1728,7 +1689,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return ret; } @@ -2662,8 +2623,8 @@ static void wait_for_allocation(struct swap_info_struct *si) BUG_ON(si->flags & SWP_WRITEOK); for (offset = 0; offset < end; offset += SWAPFILE_CLUSTER) { - ci = lock_cluster(si, offset); - unlock_cluster(ci); + ci = swap_cluster_lock(si, offset); + swap_cluster_unlock(ci); } } @@ -3579,7 +3540,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster(si, offset); + ci = swap_cluster_lock(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3634,7 +3595,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster(ci); + swap_cluster_unlock(ci); return err; } @@ -3733,7 +3694,7 @@ int 
add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 
 	offset = swp_offset(entry);
 
-	ci = lock_cluster(si, offset);
+	ci = swap_cluster_lock(si, offset);
 
 	count = swap_count(si->swap_map[offset]);
 
@@ -3793,7 +3754,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
 out_unlock_cont:
 	spin_unlock(&si->cont_lock);
 out:
-	unlock_cluster(ci);
+	swap_cluster_unlock(ci);
 	put_swap_device(si);
 outer:
 	if (page)

From 0fcf8ef4fdab8e5c91d1bce39c7fe6565974ffad Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Wed, 17 Sep 2025 00:00:52 +0800
Subject: [PATCH 275/372] mm, swap: tidy up swap device and cluster info
 helpers

swp_swap_info is the most commonly used helper for retrieving swap info.
It has an internal check that may lead to a NULL return value, but
almost none of its callers check the return value, making the internal
check pointless.  In fact, most of these callers already ensured the
entry is valid and never expect a NULL value.

Tidy this up and improve the function names.  If the caller can make
sure the swap entry/type is valid and the device is pinned, use the
newly introduced __swap_entry_to_info/__swap_type_to_info instead.  They
have more debug sanity checks and lower overhead as they are inlined.
Callers that may expect a NULL value should use
swap_entry_to_info/swap_type_to_info instead.

No feature change.  The rearranged code should have no effect; otherwise,
it would already have been hitting NULL-deref bugs.  Only some new
sanity checks are added, so potential issues may show up in debug
builds.

The new helpers will be frequently used with the swap table later when
working with swap cache folios.  A locked swap cache folio ensures the
entries are valid and stable, so these helpers are very helpful.

Link: https://lkml.kernel.org/r/20250916160100.31545-8-ryncsn@gmail.com
Signed-off-by: Kairui Song
Acked-by: Chris Li
Reviewed-by: Barry Song
Acked-by: David Hildenbrand
Suggested-by: Chris Li
Cc: Baolin Wang
Cc: Baoquan He
Cc: "Huang, Ying"
Cc: Hugh Dickins
Cc: Johannes Weiner
Cc: Kemeng Shi
Cc: kernel test robot
Cc: Lorenzo Stoakes
Cc: Matthew Wilcox (Oracle)
Cc: Nhat Pham
Cc: Yosry Ahmed
Cc: Zi Yan
Cc: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/linux/swap.h |  6 ++----
 mm/page_io.c         | 12 ++++++------
 mm/swap.h            | 38 +++++++++++++++++++++++++++++++++-----
 mm/swap_state.c      |  4 ++--
 mm/swapfile.c        | 37 +++++++++++++++++++------------------
 5 files changed, 60 insertions(+), 37 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 78cc48a65512..762f8db0e811 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -479,7 +479,6 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern int __swap_count(swp_entry_t entry);
 extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry);
 extern int swp_swapcount(swp_entry_t entry);
-struct swap_info_struct *swp_swap_info(swp_entry_t entry);
 struct backing_dev_info;
 extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
 extern void exit_swap_address_space(unsigned int type);
@@ -492,11 +491,6 @@ static inline void put_swap_device(struct swap_info_struct *si)
 }
 
 #else /* CONFIG_SWAP */
-static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
-{
-	return NULL;
-}
-
 static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
 {
 	return NULL;
 }
diff --git a/mm/page_io.c b/mm/page_io.c
index a2056a5ecb13..3c342db77ce3 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -204,7 +204,7 @@ static bool is_folio_zero_filled(struct folio *folio)
 static
void swap_zeromap_folio_set(struct folio *folio) { struct obj_cgroup *objcg = get_obj_cgroup_from_folio(folio); - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); int nr_pages = folio_nr_pages(folio); swp_entry_t entry; unsigned int i; @@ -223,7 +223,7 @@ static void swap_zeromap_folio_set(struct folio *folio) static void swap_zeromap_folio_clear(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); swp_entry_t entry; unsigned int i; @@ -374,7 +374,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) static void swap_writepage_fs(struct folio *folio, struct swap_iocb **swap_plug) { struct swap_iocb *sio = swap_plug ? *swap_plug : NULL; - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct file *swap_file = sis->swap_file; loff_t pos = swap_dev_pos(folio->swap); @@ -446,7 +446,7 @@ static void swap_writepage_bdev_async(struct folio *folio, void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); /* @@ -537,7 +537,7 @@ static bool swap_read_folio_zeromap(struct folio *folio) static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_iocb *sio = NULL; loff_t pos = swap_dev_pos(folio->swap); @@ -608,7 +608,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); bool synchronous = sis->flags & SWP_SYNCHRONOUS_IO; bool workingset = folio_test_workingset(folio); unsigned long pflags; diff --git a/mm/swap.h b/mm/swap.h index 138b5197c35e..30b1039c27fe 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -15,6 +15,8 @@ extern int page_cluster; #define swap_entry_order(order) 0 #endif +extern struct swap_info_struct *swap_info[]; + /* * We use this to track usage of a cluster. A cluster is a block of swap disk * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All @@ -53,9 +55,29 @@ enum swap_cluster_flags { #include /* for swp_offset */ #include /* for bio_end_io_t */ -static inline struct swap_cluster_info *swp_offset_cluster( +/* + * Callers of all helpers below must ensure the entry, type, or offset is + * valid, and protect the swap device with reference count or locks. 
+ */ +static inline struct swap_info_struct *__swap_type_to_info(int type) +{ + struct swap_info_struct *si; + + si = READ_ONCE(swap_info[type]); /* rcu_dereference() */ + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + return si; +} + +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return __swap_type_to_info(swp_type(entry)); +} + +static inline struct swap_cluster_info *__swap_offset_to_cluster( struct swap_info_struct *si, pgoff_t offset) { + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + VM_WARN_ON_ONCE(offset >= si->max); return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } @@ -70,8 +92,9 @@ static inline struct swap_cluster_info *swp_offset_cluster( static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { - struct swap_cluster_info *ci = swp_offset_cluster(si, offset); + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ spin_lock(&ci->lock); return ci; } @@ -170,7 +193,7 @@ void swap_update_readahead(struct folio *folio, struct vm_area_struct *vma, static inline unsigned int folio_swap_flags(struct folio *folio) { - return swp_swap_info(folio->swap)->flags; + return __swap_entry_to_info(folio->swap)->flags; } /* @@ -181,7 +204,7 @@ static inline unsigned int folio_swap_flags(struct folio *folio) static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, bool *is_zeromap) { - struct swap_info_struct *sis = swp_swap_info(entry); + struct swap_info_struct *sis = __swap_entry_to_info(entry); unsigned long start = swp_offset(entry); unsigned long end = start + max_nr; bool first_bit; @@ -200,7 +223,7 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr, static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); int i; @@ -219,6 +242,11 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) #else /* CONFIG_SWAP */ struct swap_iocb; +static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) +{ + return NULL; +} + static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 9225d6b695ad..0ad4f3b41f1b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -336,7 +336,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated, bool skip_if_exists) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct folio *folio; struct folio *new_folio = NULL; struct folio *result = NULL; @@ -560,7 +560,7 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; diff --git a/mm/swapfile.c b/mm/swapfile.c index 700e07cb1cbd..6f7a8c98d14d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -102,7 +102,7 @@ static PLIST_HEAD(swap_active_head); static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); 
-static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -124,14 +124,20 @@ static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = { .lock = INIT_LOCAL_LOCK(), }; -static struct swap_info_struct *swap_type_to_swap_info(int type) +/* May return NULL on invalid type, caller must check for NULL return */ +static struct swap_info_struct *swap_type_to_info(int type) { if (type >= MAX_SWAPFILES) return NULL; - return READ_ONCE(swap_info[type]); /* rcu_dereference() */ } +/* May return NULL on invalid entry, caller must check for NULL return */ +static struct swap_info_struct *swap_entry_to_info(swp_entry_t entry) +{ + return swap_type_to_info(swp_type(entry)); +} + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ @@ -342,7 +348,7 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset) sector_t swap_folio_sector(struct folio *folio) { - struct swap_info_struct *sis = swp_swap_info(folio->swap); + struct swap_info_struct *sis = __swap_entry_to_info(folio->swap); struct swap_extent *se; sector_t sector; pgoff_t offset; @@ -1300,7 +1306,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (data_race(!(si->flags & SWP_USED))) @@ -1415,7 +1421,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry) if (!entry.val) goto out; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (!si) goto bad_nofile; if (!get_swap_device_info(si)) @@ -1538,7 +1544,7 @@ static void swap_entries_free(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; /* It should never free entries across different clusters */ - VM_BUG_ON(ci != swp_offset_cluster(si, offset + nr_pages - 1)); + VM_BUG_ON(ci != __swap_offset_to_cluster(si, offset + nr_pages - 1)); VM_BUG_ON(cluster_is_empty(ci)); VM_BUG_ON(ci->count < nr_pages); @@ -1596,7 +1602,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) int __swap_count(swp_entry_t entry) { - struct swap_info_struct *si = swp_swap_info(entry); + struct swap_info_struct *si = __swap_entry_to_info(entry); pgoff_t offset = swp_offset(entry); return swap_count(si->swap_map[offset]); @@ -1827,7 +1833,7 @@ out: swp_entry_t get_swap_page_of_type(int type) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); unsigned long offset; swp_entry_t entry = {0}; @@ -1908,7 +1914,7 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_info_struct *si = swap_type_to_info(type); struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) @@ -2837,7 +2843,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (type = 0; (si = swap_type_to_swap_info(type)); type++) { + for (type = 0; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) @@ -2858,7 +2864,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) type = si->type + 1; ++(*pos); - for (; (si = swap_type_to_swap_info(type)); type++) { + for (; (si = swap_type_to_info(type)); type++) { if (!(si->flags & SWP_USED) || !si->swap_map) continue; return si; @@ -3531,7 
+3537,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) unsigned char has_cache; int err, i; - si = swp_swap_info(entry); + si = swap_entry_to_info(entry); if (WARN_ON_ONCE(!si)) { pr_err("%s%08lx\n", Bad_file, entry.val); return -EINVAL; @@ -3646,11 +3652,6 @@ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) swap_entries_put_cache(si, entry, nr); } -struct swap_info_struct *swp_swap_info(swp_entry_t entry) -{ - return swap_type_to_swap_info(swp_type(entry)); -} - /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's From fd8d4f862f8c278fd1f5b61cef20056e88d8dfa5 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:53 +0800 Subject: [PATCH 276/372] mm, swap: cleanup swap cache API and add kerneldoc In preparation for replacing the swap cache backend with the swap table, clean up and add proper kernel doc for all swap cache APIs. Now all swap cache APIs are well-defined with consistent names. No feature change, only renaming and documenting. Link: https://lkml.kernel.org/r/20250916160100.31545-9-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- mm/memory-failure.c | 2 +- mm/memory.c | 2 +- mm/shmem.c | 10 +++--- mm/swap.h | 50 ++++++++++++++------------ mm/swap_state.c | 86 ++++++++++++++++++++++++++++++++------------- mm/swapfile.c | 8 ++--- mm/vmscan.c | 2 +- mm/zswap.c | 2 +- 9 files changed, 104 insertions(+), 60 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 8d078aa2738a..2a05b1fdd445 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4525,7 +4525,7 @@ static void filemap_cachestat(struct address_space *mapping, * invalidation, so there might not be * a shadow in the swapcache (yet). */ - shadow = get_shadow_from_swap_cache(swp); + shadow = swap_cache_get_shadow(swp); if (!shadow) goto resched; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6d9134e3d115..3edebb0cda30 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1127,7 +1127,7 @@ static int me_swapcache_clean(struct page_state *ps, struct page *p) struct folio *folio = page_folio(p); int ret; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); ret = delete_from_lru_cache(folio) ? MF_FAILED : MF_RECOVERED; folio_unlock(folio); diff --git a/mm/memory.c b/mm/memory.c index 5808c4ef21b3..41e641823558 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4699,7 +4699,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) memcg1_swapin(entry, nr_pages); - shadow = get_shadow_from_swap_cache(entry); + shadow = swap_cache_get_shadow(entry); if (shadow) workingset_refault(folio, shadow); diff --git a/mm/shmem.c b/mm/shmem.c index 410f27bc4752..077744a9e9da 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1661,13 +1661,13 @@ try_split: } /* - * The delete_from_swap_cache() below could be left for + * The swap_cache_del_folio() below could be left for * shrink_folio_list()'s folio_free_swap() to dispose of; * but I'm a little nervous about letting this folio out of * shmem_writeout() in a hybrid half-tmpfs-half-swap state * e.g. 
folio_mapping(folio) might give an unexpected answer. */ - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); goto redirty; } if (nr_pages > 1) @@ -2045,7 +2045,7 @@ retry: new->swap = entry; memcg1_swapin(entry, nr_pages); - shadow = get_shadow_from_swap_cache(entry); + shadow = swap_cache_get_shadow(entry); if (shadow) workingset_refault(new, shadow); folio_add_lru(new); @@ -2183,7 +2183,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, nr_pages = folio_nr_pages(folio); folio_wait_writeback(folio); if (!skip_swapcache) - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); /* * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks * won't be 0 when inode is released and thus trigger WARN_ON(i_blocks) @@ -2422,7 +2422,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, folio->swap.val = 0; swapcache_clear(si, swap, nr_pages); } else { - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); } folio_mark_dirty(folio); swap_free_nr(swap, nr_pages); diff --git a/mm/swap.h b/mm/swap.h index 30b1039c27fe..6c4acb549bec 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -167,17 +167,29 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, return folio_entry.val == round_down(entry.val, nr_pages); } -void show_swap_cache_info(void); -void *get_shadow_from_swap_cache(swp_entry_t entry); -int add_to_swap_cache(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp); -void __delete_from_swap_cache(struct folio *folio, - swp_entry_t entry, void *shadow); -void delete_from_swap_cache(struct folio *folio); -void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end); -void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); +/* + * All swap cache helpers below require the caller to ensure the swap entries + * used are valid and stabilize the device by any of the following ways: + * - Hold a reference by get_swap_device(): this ensures a single entry is + * valid and increases the swap device's refcount. + * - Locking a folio in the swap cache: this ensures the folio's swap entries + * are valid and pinned, also implies reference to the device. + * - Locking anything referencing the swap entry: e.g. PTL that protects + * swap entries in the page table, similar to locking swap cache folio. + * - See the comment of get_swap_device() for more complex usage. 
+ */ struct folio *swap_cache_get_folio(swp_entry_t entry); +void *swap_cache_get_shadow(swp_entry_t entry); +int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + gfp_t gfp, void **shadow); +void swap_cache_del_folio(struct folio *folio); +void __swap_cache_del_folio(struct folio *folio, + swp_entry_t entry, void *shadow); +void swap_cache_clear_shadow(int type, unsigned long begin, + unsigned long end); + +void show_swap_cache_info(void); +void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug); @@ -305,28 +317,22 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry) return NULL; } -static inline void *get_shadow_from_swap_cache(swp_entry_t entry) +static inline void *swap_cache_get_shadow(swp_entry_t entry) { return NULL; } -static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry, - gfp_t gfp_mask, void **shadowp) +static inline int swap_cache_add_folio(swp_entry_t entry, struct folio *folio, + gfp_t gfp, void **shadow) { - return -1; + return -EINVAL; } -static inline void __delete_from_swap_cache(struct folio *folio, - swp_entry_t entry, void *shadow) +static inline void swap_cache_del_folio(struct folio *folio) { } -static inline void delete_from_swap_cache(struct folio *folio) -{ -} - -static inline void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end) +static inline void __swap_cache_del_folio(struct folio *folio, swp_entry_t entry, void *shadow) { } diff --git a/mm/swap_state.c b/mm/swap_state.c index 0ad4f3b41f1b..f3a32a06a950 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -78,8 +78,8 @@ void show_swap_cache_info(void) * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. * Return: Returns the found folio on success, NULL otherwise. The caller - * must lock and check if the folio still matches the swap entry before - * use (e.g. with folio_matches_swap_entry). + * must lock and check if the folio still matches the swap entry before + * use (e.g., folio_matches_swap_entry). */ struct folio *swap_cache_get_folio(swp_entry_t entry) { @@ -90,7 +90,15 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) return folio; } -void *get_shadow_from_swap_cache(swp_entry_t entry) +/** + * swap_cache_get_shadow - Looks up a shadow in the swap cache. + * @entry: swap entry used for the lookup. + * + * Context: Caller must ensure @entry is valid and protect the swap device + * with reference count or locks. 
+ * The caller also needs to mark the corresponding swap_map slots with + * SWAP_HAS_CACHE to avoid race or conflict. + * Return: Returns 0 on success, error code otherwise. */ -int add_to_swap_cache(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp) +int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, + gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swap_cache_index(entry); @@ -155,12 +172,20 @@ unlock: return xas_error(&xas); } -/* - * This must be called only on folios that have - * been verified to be in the swap cache. +/** + * __swap_cache_del_folio - Removes a folio from the swap cache. + * @folio: The folio. + * @entry: The first swap entry that the folio corresponds to. + * @shadow: shadow value to be filled in the swap cache. + * + * Removes a folio from the swap cache and fills a shadow in place. + * This won't put the folio's refcount. The caller has to do that. + * + * Context: Caller must hold the xa_lock, ensure the folio is + * locked and in the swap cache, using the index of @entry. */ -void __delete_from_swap_cache(struct folio *folio, - swp_entry_t entry, void *shadow) +void __swap_cache_del_folio(struct folio *folio, + swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); int i; @@ -186,27 +211,40 @@ void __delete_from_swap_cache(struct folio *folio, __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); } -/* - * This must be called only on folios that have - * been verified to be in the swap cache and locked. - * It will never put the folio into the free list, - * the caller has a reference on the folio. +/** + * swap_cache_del_folio - Removes a folio from the swap cache. + * @folio: The folio. + * + * Same as __swap_cache_del_folio, but handles lock and refcount. The + * caller must ensure the folio is either clean or has a swap count + * equal to zero, or it may cause data loss. + * + * Context: Caller must ensure the folio is locked and in the swap cache. */ -void delete_from_swap_cache(struct folio *folio) +void swap_cache_del_folio(struct folio *folio) { swp_entry_t entry = folio->swap; struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); - __delete_from_swap_cache(folio, entry, NULL); + __swap_cache_del_folio(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } -void clear_shadow_from_swap_cache(int type, unsigned long begin, - unsigned long end) +/** + * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. + * @type: Indicates the swap device. + * @begin: Beginning offset of the range. + * @end: Ending offset of the range. + * + * Context: Caller must ensure the range is valid and hold a reference to + * the swap device. + */ +void swap_cache_clear_shadow(int type, unsigned long begin, + unsigned long end) { unsigned long curr = begin; void *old; @@ -393,7 +431,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto put_and_return; /* - * We might race against __delete_from_swap_cache(), and + * We might race against __swap_cache_del_folio(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. 
Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE @@ -412,7 +450,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (swap_cache_add_folio(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; memcg1_swapin(entry, 1); diff --git a/mm/swapfile.c b/mm/swapfile.c index 6f7a8c98d14d..51f781c43537 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -267,7 +267,7 @@ again: if (!need_reclaim) goto out_unlock; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_set_dirty(folio); ret = nr_pages; out_unlock: @@ -1124,7 +1124,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - clear_shadow_from_swap_cache(si->type, begin, end); + swap_cache_clear_shadow(si->type, begin, end); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 @@ -1289,7 +1289,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) * TODO: this could cause a theoretical memory reclaim * deadlock in the swap out path. */ - if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) + if (swap_cache_add_folio(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) goto out_free; return 0; @@ -1759,7 +1759,7 @@ bool folio_free_swap(struct folio *folio) if (folio_swapped(folio)) return false; - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_set_dirty(folio); return true; } diff --git a/mm/vmscan.c b/mm/vmscan.c index ca9e1cd3cd68..c79c6806560b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -776,7 +776,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __delete_from_swap_cache(folio, swap, shadow); + __swap_cache_del_folio(folio, swap, shadow); memcg1_swapout(folio, swap); xa_unlock_irq(&mapping->i_pages); put_swap_folio(folio, swap); diff --git a/mm/zswap.c b/mm/zswap.c index 63045e3fb1f5..1b1edecde6a7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1069,7 +1069,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, out: if (ret && ret != -EEXIST) { - delete_from_swap_cache(folio); + swap_cache_del_folio(folio); folio_unlock(folio); } folio_put(folio); From 84a7a9823e73fe3c0adcc4780fa7a091981048ef Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:54 +0800 Subject: [PATCH 277/372] mm/shmem, swap: remove redundant error handling for replacing folio Shmem may replace a folio in the swap cache if the cached one doesn't fit the swapin's GFP zone. When doing so, shmem has already double checked that the swap cache folio is locked, still has the swap cache flag set, and contains the wanted swap entry. So it is impossible to fail due to an XArray mismatch. There is even a comment for that. Delete the defensive error handling path, and add a WARN_ON instead: if that ever happens, something has broken the basic principle of how the swap cache works; we should catch and fix it. 
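For illustration, the invariant being relied on can be written out as follows; this is a minimal sketch, not part of the patch, using the names from shmem_replace_folio() in the diff below:

	/*
	 * shmem has already verified, under the folio lock, conditions
	 * equivalent to the following before the replacement loop runs:
	 */
	VM_WARN_ON_ONCE(!folio_test_locked(old));
	VM_WARN_ON_ONCE(!folio_test_swapcache(old));
	VM_WARN_ON_ONCE(old->swap.val != entry.val);
	/*
	 * A locked folio in the swap cache pins its slots, so xas_store()
	 * can only ever find 'old' there -- hence WARN_ON_ONCE, not -ENOENT.
	 */
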
Link: https://lkml.kernel.org/r/20250916160100.31545-10-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/shmem.c | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 077744a9e9da..dc17717e5631 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2121,35 +2121,17 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); for (i = 0; i < nr_pages; i++) { - void *item = xas_load(&xas); - - if (item != old) { - error = -ENOENT; - break; - } - - xas_store(&xas, new); + WARN_ON_ONCE(xas_store(&xas, new) != old); xas_next(&xas); } - if (!error) { - mem_cgroup_replace_folio(old, new); - shmem_update_stats(new, nr_pages); - shmem_update_stats(old, -nr_pages); - } + + mem_cgroup_replace_folio(old, new); + shmem_update_stats(new, nr_pages); + shmem_update_stats(old, -nr_pages); xa_unlock_irq(&swap_mapping->i_pages); - if (unlikely(error)) { - /* - * Is this possible? I think not, now that our callers - * check both the swapcache flag and folio->private - * after getting the folio lock; but be defensive. - * Reverse old to newpage for clear and free. - */ - old = new; - } else { - folio_add_lru(new); - *foliop = new; - } + folio_add_lru(new); + *foliop = new; folio_clear_swapcache(old); old->private = NULL; From 094dc8b059b11eef0888f43eeb0f3ac53ade5c87 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:55 +0800 Subject: [PATCH 278/372] mm, swap: wrap swap cache replacement with a helper There are currently three swap cache users that are trying to replace an existing folio with a new one: huge memory splitting, migration, and shmem replacement. What they are doing is quite similar. Introduce a common helper for this. In later commits, this can be easily switched to use the swap table by updating this helper. The newly added helper also makes the swap cache API better defined, and makes debugging easier by adding a few more debug checks. Migration and shmem replacement are meant to clone the folio, including content, swap entry value, and flags. And splitting will adjust each sub folio's swap entry according to order, which could be non-uniform in the future. So document it clearly that it's the caller's responsibility to set up the new folio's swap entries and flags before calling the helper. The helper will just follow the new folio's entry value. This also prepares for replacing high-order folios in the swap cache. Currently, only splitting to order 0 is allowed for swap cache folios. Using the new helper, we can handle high-order folio splitting better. 
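For illustration, a migration-style caller of the new helper is expected to follow roughly this pattern (a minimal sketch; the two-argument form and the names match the mm/migrate.c hunk below):

	/*
	 * Clone the swapcache flag and the swap entry value first; the
	 * helper follows the new folio's entry to find the slots to
	 * override.
	 */
	folio_set_swapcache(newfolio);
	newfolio->private = folio_get_private(folio);
	__swap_cache_replace_folio(folio, newfolio);
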
Link: https://lkml.kernel.org/r/20250916160100.31545-11-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Acked-by: Chris Li Suggested-by: Chris Li Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 +--- mm/migrate.c | 11 +++-------- mm/shmem.c | 11 ++--------- mm/swap.h | 5 +++++ mm/swap_state.c | 33 +++++++++++++++++++++++++++++++++ 5 files changed, 44 insertions(+), 20 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 26cedfcd7418..4c66e358685b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3798,9 +3798,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, * NOTE: shmem in swap cache is not supported yet. */ if (swap_cache) { - __xa_store(&swap_cache->i_pages, - swap_cache_index(new_folio->swap), - new_folio, 0); + __swap_cache_replace_folio(folio, new_folio); continue; } diff --git a/mm/migrate.c b/mm/migrate.c index 8e435a078fc3..c69cc13db692 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -566,7 +566,6 @@ static int __folio_migrate_mapping(struct address_space *mapping, struct zone *oldzone, *newzone; int dirty; long nr = folio_nr_pages(folio); - long entries, i; if (!mapping) { /* Take off deferred split queue while frozen and memcg set */ @@ -615,9 +614,6 @@ static int __folio_migrate_mapping(struct address_space *mapping, if (folio_test_swapcache(folio)) { folio_set_swapcache(newfolio); newfolio->private = folio_get_private(folio); - entries = nr; - } else { - entries = 1; } /* Move dirty while folio refs frozen and newfolio not yet exposed */ @@ -627,11 +623,10 @@ static int __folio_migrate_mapping(struct address_space *mapping, folio_set_dirty(newfolio); } - /* Swap cache still stores N entries instead of a high-order entry */ - for (i = 0; i < entries; i++) { + if (folio_test_swapcache(folio)) + __swap_cache_replace_folio(folio, newfolio); + else xas_store(&xas, newfolio); - xas_next(&xas); - } /* * Drop cache reference from old folio by unfreezing diff --git a/mm/shmem.c b/mm/shmem.c index dc17717e5631..bbfbbc1bc4d6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2086,10 +2086,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; struct address_space *swap_mapping = swap_address_space(entry); - pgoff_t swap_index = swap_cache_index(entry); - XA_STATE(xas, &swap_mapping->i_pages, swap_index); int nr_pages = folio_nr_pages(old); - int error = 0, i; + int error = 0; /* * We have arrived here because our zones are constrained, so don't @@ -2118,13 +2116,8 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - /* Swap cache still stores N entries instead of a high-order entry */ xa_lock_irq(&swap_mapping->i_pages); - for (i = 0; i < nr_pages; i++) { - WARN_ON_ONCE(xas_store(&xas, new) != old); - xas_next(&xas); - } - + __swap_cache_replace_folio(old, new); mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); diff --git a/mm/swap.h b/mm/swap.h index 6c4acb549bec..fe579c81c6c4 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -185,6 +185,7 @@ int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void swap_cache_del_folio(struct folio *folio); void 
__swap_cache_del_folio(struct folio *folio, swp_entry_t entry, void *shadow); +void __swap_cache_replace_folio(struct folio *old, struct folio *new); void swap_cache_clear_shadow(int type, unsigned long begin, unsigned long end); @@ -336,6 +337,10 @@ static inline void __swap_cache_del_folio(struct folio *folio, swp_entry_t entry { } +static inline void __swap_cache_replace_folio(struct folio *old, struct folio *new) +{ +} + static inline unsigned int folio_swap_flags(struct folio *folio) { return 0; } diff --git a/mm/swap_state.c b/mm/swap_state.c index f3a32a06a950..d1f5b8fa52fc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -234,6 +234,39 @@ void swap_cache_del_folio(struct folio *folio) folio_ref_sub(folio, folio_nr_pages(folio)); } +/** + * __swap_cache_replace_folio - Replace a folio in the swap cache. + * @old: The old folio to be replaced. + * @new: The new folio. + * + * Replace an existing folio in the swap cache with a new folio. The + * caller is responsible for setting up the new folio's flag and swap + * entries. Replacement will take the new folio's swap entry value as + * the starting offset to override all slots covered by the new folio. + * + * Context: Caller must ensure both folios are locked, also lock the + * swap address_space that holds the old folio to avoid races. + */ +void __swap_cache_replace_folio(struct folio *old, struct folio *new) +{ + swp_entry_t entry = new->swap; + unsigned long nr_pages = folio_nr_pages(new); + unsigned long offset = swap_cache_index(entry); + unsigned long end = offset + nr_pages; + + XA_STATE(xas, &swap_address_space(entry)->i_pages, offset); + + VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); + VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); + VM_WARN_ON_ONCE(!entry.val); + + /* Swap cache still stores N entries instead of a high-order entry */ + do { + WARN_ON_ONCE(xas_store(&xas, new) != old); + xas_next(&xas); + } while (++offset < end); +} + /** * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. * @type: Indicates the swap device. From 8578e0c00dcf0c58fbc32d4904ecaf8e802a6590 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:56 +0800 Subject: [PATCH 279/372] mm, swap: use the swap table for the swap cache and switch API Introduce basic swap table infrastructures, which are now just a fixed-size flat array inside each swap cluster, with access wrappers. Each cluster contains a swap table of 512 entries. Each table entry is an opaque atomic long. It can be one of three types: a shadow type (XA_VALUE), a folio type (pointer), or NULL. In this first step, it only supports storing a folio or shadow, and it is a drop-in replacement for the current swap cache. Convert all swap cache users to use the new set of APIs. Chris Li has been suggesting using a new infrastructure for swap cache for better performance, and that idea combined well with the swap table as the new backing structure. Now the lock contention range is reduced to 2M clusters, which is much smaller than the 64M address_space. And we can also drop the multiple address_space design. All the internal work is done with the swap_cache_get_* helpers. Swap cache lookup is still lock-less like before, and the helpers' context requirements are the same as those of the original swap cache helpers. They still require a pin on the swap device to prevent the backing data from being freed. Swap cache updates are now protected by the swap cluster lock instead of the XArray lock. 
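For illustration, an update now nests under the per-cluster lock roughly like this (a minimal sketch mirroring the mm/shmem.c hunk below; both folios must be locked):

	/* The 2M cluster lock replaces the old 64M address_space xa_lock. */
	ci = swap_cluster_get_and_lock_irq(old);
	__swap_cache_replace_folio(ci, old, new);
	swap_cluster_unlock_irq(ci);
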
This is mostly handled internally, but new __swap_cache_* helpers require the caller to lock the cluster. So, a few new cluster access and locking helpers are also introduced. A fully cluster-based unified swap table can be implemented on top of this to take care of all count tracking and synchronization work, with dynamic allocation. It should reduce the memory usage while making the performance even better. Link: https://lkml.kernel.org/r/20250916160100.31545-12-ryncsn@gmail.com Co-developed-by: Chris Li Signed-off-by: Chris Li Signed-off-by: Kairui Song Acked-by: Chris Li Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + include/linux/swap.h | 2 - mm/huge_memory.c | 13 +- mm/migrate.c | 19 ++- mm/shmem.c | 8 +- mm/swap.h | 154 +++++++++++++++++------ mm/swap_state.c | 283 +++++++++++++++++++------------------------ mm/swap_table.h | 97 +++++++++++++++ mm/swapfile.c | 100 +++++++++++---- mm/vmscan.c | 20 ++- 10 files changed, 454 insertions(+), 243 deletions(-) create mode 100644 mm/swap_table.h diff --git a/MAINTAINERS b/MAINTAINERS index 3d113bfc3c82..4c8bbf70a3c7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16232,6 +16232,7 @@ F: include/linux/swapops.h F: mm/page_io.c F: mm/swap.c F: mm/swap.h +F: mm/swap_table.h F: mm/swap_state.c F: mm/swapfile.c diff --git a/include/linux/swap.h b/include/linux/swap.h index 762f8db0e811..e818fbade1e2 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -480,8 +480,6 @@ extern int __swap_count(swp_entry_t entry); extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); struct backing_dev_info; -extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); -extern void exit_swap_address_space(unsigned int type); extern struct swap_info_struct *get_swap_device(swp_entry_t entry); sector_t swap_folio_sector(struct folio *folio); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4c66e358685b..a9fc7a09167a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3720,7 +3720,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - struct address_space *swap_cache = NULL; + struct swap_cluster_info *ci = NULL; struct lruvec *lruvec; int expected_refs; @@ -3764,8 +3764,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order, goto fail; } - swap_cache = swap_address_space(folio->swap); - xa_lock(&swap_cache->i_pages); + ci = swap_cluster_get_and_lock(folio); } /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ @@ -3797,8 +3796,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, * Anonymous folio with swap cache. * NOTE: shmem in swap cache is not supported yet. 
*/ - if (swap_cache) { - __swap_cache_replace_folio(folio, new_folio); + if (ci) { + __swap_cache_replace_folio(ci, folio, new_folio); continue; } @@ -3833,8 +3832,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order, unlock_page_lruvec(lruvec); - if (swap_cache) - xa_unlock(&swap_cache->i_pages); + if (ci) + swap_cluster_unlock(ci); } else { spin_unlock(&ds_queue->split_queue_lock); ret = -EAGAIN; diff --git a/mm/migrate.c b/mm/migrate.c index c69cc13db692..aee61a980374 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -563,6 +563,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, struct folio *newfolio, struct folio *folio, int expected_count) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); + struct swap_cluster_info *ci = NULL; struct zone *oldzone, *newzone; int dirty; long nr = folio_nr_pages(folio); @@ -591,9 +592,16 @@ static int __folio_migrate_mapping(struct address_space *mapping, oldzone = folio_zone(folio); newzone = folio_zone(newfolio); - xas_lock_irq(&xas); + if (folio_test_swapcache(folio)) + ci = swap_cluster_get_and_lock_irq(folio); + else + xas_lock_irq(&xas); + if (!folio_ref_freeze(folio, expected_count)) { - xas_unlock_irq(&xas); + if (ci) + swap_cluster_unlock_irq(ci); + else + xas_unlock_irq(&xas); return -EAGAIN; } @@ -624,7 +632,7 @@ static int __folio_migrate_mapping(struct address_space *mapping, } if (folio_test_swapcache(folio)) - __swap_cache_replace_folio(folio, newfolio); + __swap_cache_replace_folio(ci, folio, newfolio); else xas_store(&xas, newfolio); @@ -635,8 +643,11 @@ static int __folio_migrate_mapping(struct address_space *mapping, */ folio_ref_unfreeze(folio, expected_count - nr); - xas_unlock(&xas); /* Leave irq disabled to prevent preemption while updating stats */ + if (ci) + swap_cluster_unlock(ci); + else + xas_unlock(&xas); /* * If moved to a different zone then also account diff --git a/mm/shmem.c b/mm/shmem.c index bbfbbc1bc4d6..cf0171a72e47 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2083,9 +2083,9 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index, struct vm_area_struct *vma) { + struct swap_cluster_info *ci; struct folio *new, *old = *foliop; swp_entry_t entry = old->swap; - struct address_space *swap_mapping = swap_address_space(entry); int nr_pages = folio_nr_pages(old); int error = 0; @@ -2116,12 +2116,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, new->swap = entry; folio_set_swapcache(new); - xa_lock_irq(&swap_mapping->i_pages); - __swap_cache_replace_folio(old, new); + ci = swap_cluster_get_and_lock_irq(old); + __swap_cache_replace_folio(ci, old, new); mem_cgroup_replace_folio(old, new); shmem_update_stats(new, nr_pages); shmem_update_stats(old, -nr_pages); - xa_unlock_irq(&swap_mapping->i_pages); + swap_cluster_unlock_irq(ci); folio_add_lru(new); *foliop = new; diff --git a/mm/swap.h b/mm/swap.h index fe579c81c6c4..742db4d46d23 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,6 +2,7 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +#include /* for atomic_long_t */ struct mempolicy; struct swap_iocb; @@ -35,6 +36,7 @@ struct swap_cluster_info { u16 count; u8 flags; u8 order; + atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */ struct list_head list; }; @@ -55,6 +57,11 @@ enum swap_cluster_flags { #include /* for swp_offset */ #include /* for bio_end_io_t */ +static inline unsigned int swp_cluster_offset(swp_entry_t entry) +{ + return swp_offset(entry) % SWAPFILE_CLUSTER; +} + /* * Callers of all 
helpers below must ensure the entry, type, or offset is * valid, and protect the swap device with reference count or locks. @@ -81,6 +88,25 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( return &si->cluster_info[offset / SWAPFILE_CLUSTER]; } +static inline struct swap_cluster_info *__swap_entry_to_cluster(swp_entry_t entry) +{ + return __swap_offset_to_cluster(__swap_entry_to_info(entry), + swp_offset(entry)); +} + +static __always_inline struct swap_cluster_info *__swap_cluster_lock( + struct swap_info_struct *si, unsigned long offset, bool irq) +{ + struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + + VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ + if (irq) + spin_lock_irq(&ci->lock); + else + spin_lock(&ci->lock); + return ci; +} + /** * swap_cluster_lock - Lock and return the swap cluster of given offset. * @si: swap device the cluster belongs to. @@ -92,11 +118,49 @@ static inline struct swap_cluster_info *__swap_offset_to_cluster( static inline struct swap_cluster_info *swap_cluster_lock( struct swap_info_struct *si, unsigned long offset) { - struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + return __swap_cluster_lock(si, offset, false); +} - VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ - spin_lock(&ci->lock); - return ci; +static inline struct swap_cluster_info *__swap_cluster_get_and_lock( + const struct folio *folio, bool irq) +{ + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + return __swap_cluster_lock(__swap_entry_to_info(folio->swap), + swp_offset(folio->swap), irq); +} + +/* + * swap_cluster_get_and_lock - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * This locks and returns the swap cluster that contains a folio's swap + * entries. The swap entries of a folio are always in one single cluster. + * The folio has to be locked so its swap entries won't change and the + * cluster won't be freed. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. + */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + const struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, false); +} + +/* + * swap_cluster_get_and_lock_irq - Locks the cluster that holds a folio's entries. + * @folio: The folio. + * + * Same as swap_cluster_get_and_lock but also disable IRQ. + * + * Context: Caller must ensure the folio is locked and in the swap cache. + * Return: Pointer to the swap cluster. 
+ */ +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + const struct folio *folio) +{ + return __swap_cluster_get_and_lock(folio, true); } static inline void swap_cluster_unlock(struct swap_cluster_info *ci) @@ -104,6 +168,11 @@ static inline void swap_cluster_unlock(struct swap_cluster_info *ci) spin_unlock(&ci->lock); } +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ + spin_unlock_irq(&ci->lock); +} + /* linux/mm/page_io.c */ int sio_pool_init(void); struct swap_iocb; @@ -123,10 +192,11 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) #define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) -extern struct address_space *swapper_spaces[]; -#define swap_address_space(entry) \ - (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ - >> SWAP_ADDRESS_SPACE_SHIFT]) +extern struct address_space swap_space; +static inline struct address_space *swap_address_space(swp_entry_t entry) +{ + return &swap_space; +} /* * Return the swap device position of the swap entry. @@ -136,15 +206,6 @@ static inline loff_t swap_dev_pos(swp_entry_t entry) return ((loff_t)swp_offset(entry)) << PAGE_SHIFT; } -/* - * Return the swap cache index of the swap entry. - */ -static inline pgoff_t swap_cache_index(swp_entry_t entry) -{ - BUILD_BUG_ON((SWP_OFFSET_MASK | SWAP_ADDRESS_SPACE_MASK) != SWP_OFFSET_MASK); - return swp_offset(entry) & SWAP_ADDRESS_SPACE_MASK; -} - /** * folio_matches_swap_entry - Check if a folio matches a given swap entry. * @folio: The folio. @@ -180,14 +241,14 @@ static inline bool folio_matches_swap_entry(const struct folio *folio, */ struct folio *swap_cache_get_folio(swp_entry_t entry); void *swap_cache_get_shadow(swp_entry_t entry); -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadow); +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow); void swap_cache_del_folio(struct folio *folio); -void __swap_cache_del_folio(struct folio *folio, - swp_entry_t entry, void *shadow); -void __swap_cache_replace_folio(struct folio *old, struct folio *new); -void swap_cache_clear_shadow(int type, unsigned long begin, - unsigned long end); +/* Below helpers require the caller to lock and pass in the swap cluster. 
*/ +void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow); +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new); +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents); void show_swap_cache_info(void); void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr); @@ -255,6 +316,32 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) #else /* CONFIG_SWAP */ struct swap_iocb; +static inline struct swap_cluster_info *swap_cluster_lock( + struct swap_info_struct *si, pgoff_t offset, bool irq) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock( + struct folio *folio) +{ + return NULL; +} + +static inline struct swap_cluster_info *swap_cluster_get_and_lock_irq( + struct folio *folio) +{ + return NULL; +} + +static inline void swap_cluster_unlock(struct swap_cluster_info *ci) +{ +} + +static inline void swap_cluster_unlock_irq(struct swap_cluster_info *ci) +{ +} + static inline struct swap_info_struct *__swap_entry_to_info(swp_entry_t entry) { return NULL; @@ -272,11 +359,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry) return NULL; } -static inline pgoff_t swap_cache_index(swp_entry_t entry) -{ - return 0; -} - static inline bool folio_matches_swap_entry(const struct folio *folio, swp_entry_t entry) { return false; @@ -323,21 +405,21 @@ static inline void *swap_cache_get_shadow(swp_entry_t entry) return NULL; } -static inline int swap_cache_add_folio(swp_entry_t entry, struct folio *folio, - gfp_t gfp, void **shadow) +static inline void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadow) { - return -EINVAL; } static inline void swap_cache_del_folio(struct folio *folio) { } -static inline void __swap_cache_del_folio(struct folio *folio, swp_entry_t entry, void *shadow) +static inline void __swap_cache_del_folio(struct swap_cluster_info *ci, + struct folio *folio, swp_entry_t entry, void *shadow) { } -static inline void __swap_cache_replace_folio(struct folio *old, struct folio *new) +static inline void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { } @@ -371,8 +453,10 @@ static inline int non_swapcache_batch(swp_entry_t entry, int max_nr) */ static inline pgoff_t folio_index(struct folio *folio) { +#ifdef CONFIG_SWAP if (unlikely(folio_test_swapcache(folio))) - return swap_cache_index(folio->swap); + return swp_offset(folio->swap); +#endif return folio->index; } diff --git a/mm/swap_state.c b/mm/swap_state.c index d1f5b8fa52fc..2558a648d671 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -23,6 +23,7 @@ #include #include #include "internal.h" +#include "swap_table.h" #include "swap.h" /* @@ -36,8 +37,10 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; -static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +struct address_space swap_space __read_mostly = { + .a_ops = &swap_aops, +}; + static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_ORDER_CEILING 5 @@ -83,11 +86,20 @@ void show_swap_cache_info(void) */ struct folio *swap_cache_get_folio(swp_entry_t entry) { - struct folio *folio = filemap_get_folio(swap_address_space(entry), - swap_cache_index(entry)); - if (IS_ERR(folio)) - return NULL; - return folio; + unsigned long swp_tb; + struct folio *folio; + + for (;;) { + swp_tb = 
__swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (!swp_tb_is_folio(swp_tb)) + return NULL; + folio = swp_tb_to_folio(swp_tb); + if (likely(folio_try_get(folio))) + return folio; + } + + return NULL; } /** @@ -100,13 +112,13 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) */ void *swap_cache_get_shadow(swp_entry_t entry) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - void *shadow; + unsigned long swp_tb; + + swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); + if (swp_tb_is_shadow(swp_tb)) + return swp_tb_to_shadow(swp_tb); - shadow = xa_load(&address_space->i_pages, idx); - if (xa_is_value(shadow)) - return shadow; return NULL; } @@ -119,61 +131,48 @@ void *swap_cache_get_shadow(swp_entry_t entry) * * Context: Caller must ensure @entry is valid and protect the swap device * with reference count or locks. - * The caller also needs to mark the corresponding swap_map slots with - * SWAP_HAS_CACHE to avoid race or conflict. - * Return: Returns 0 on success, error code otherwise. + * The caller also needs to update the corresponding swap_map slots with + * SWAP_HAS_CACHE bit to avoid race or conflict. */ -int swap_cache_add_folio(struct folio *folio, swp_entry_t entry, - gfp_t gfp, void **shadowp) +void swap_cache_add_folio(struct folio *folio, swp_entry_t entry, void **shadowp) { - struct address_space *address_space = swap_address_space(entry); - pgoff_t idx = swap_cache_index(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); - unsigned long i, nr = folio_nr_pages(folio); - void *old; + void *shadow = NULL; + unsigned long old_tb, new_tb; + struct swap_cluster_info *ci; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); - xas_set_update(&xas, workingset_update_node); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapbacked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); + new_tb = folio_to_swp_tb(folio); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + do { + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(swp_tb_is_folio(old_tb)); + if (swp_tb_is_shadow(old_tb)) + shadow = swp_tb_to_shadow(old_tb); + } while (++ci_off < ci_end); - folio_ref_add(folio, nr); + folio_ref_add(folio, nr_pages); folio_set_swapcache(folio); folio->swap = entry; + swap_cluster_unlock(ci); - do { - xas_lock_irq(&xas); - xas_create_range(&xas); - if (xas_error(&xas)) - goto unlock; - for (i = 0; i < nr; i++) { - VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); - if (shadowp) { - old = xas_load(&xas); - if (xa_is_value(old)) - *shadowp = old; - } - xas_store(&xas, folio); - xas_next(&xas); - } - address_space->nrpages += nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); -unlock: - xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); + node_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr_pages); - if (!xas_error(&xas)) - return 0; - - folio_clear_swapcache(folio); - folio_ref_sub(folio, nr); - return xas_error(&xas); + if 
(shadowp) + *shadowp = shadow; } /** * __swap_cache_del_folio - Removes a folio from the swap cache. + * @ci: The locked swap cluster. * @folio: The folio. * @entry: The first swap entry that the folio corresponds to. * @shadow: shadow value to be filled in the swap cache. @@ -181,34 +180,36 @@ unlock: * Removes a folio from the swap cache and fills a shadow in place. * This won't put the folio's refcount. The caller has to do that. * - * Context: Caller must hold the xa_lock, ensure the folio is - * locked and in the swap cache, using the index of @entry. + * Context: Caller must ensure the folio is locked and in the swap cache + * using the index of @entry, and lock the cluster that holds the entries. */ -void __swap_cache_del_folio(struct folio *folio, +void __swap_cache_del_folio(struct swap_cluster_info *ci, struct folio *folio, swp_entry_t entry, void *shadow) { - struct address_space *address_space = swap_address_space(entry); - int i; - long nr = folio_nr_pages(folio); - pgoff_t idx = swap_cache_index(entry); - XA_STATE(xas, &address_space->i_pages, idx); + unsigned long old_tb, new_tb; + unsigned int ci_start, ci_off, ci_end; + unsigned long nr_pages = folio_nr_pages(folio); - xas_set_update(&xas, workingset_update_node); + VM_WARN_ON_ONCE(__swap_entry_to_cluster(entry) != ci); + VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio); + VM_WARN_ON_ONCE_FOLIO(!folio_test_swapcache(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_writeback(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio); - VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); + new_tb = shadow_swp_to_tb(shadow); + ci_start = swp_cluster_offset(entry); + ci_end = ci_start + nr_pages; + ci_off = ci_start; + do { + /* If shadow is NULL, we set an empty shadow */ + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || + swp_tb_to_folio(old_tb) != folio); + } while (++ci_off < ci_end); - for (i = 0; i < nr; i++) { - void *entry = xas_store(&xas, shadow); - VM_BUG_ON_PAGE(entry != folio, entry); - xas_next(&xas); - } folio->swap.val = 0; folio_clear_swapcache(folio); - address_space->nrpages -= nr; - __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); - __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr); + node_stat_mod_folio(folio, NR_FILE_PAGES, -nr_pages); + lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr_pages); } /** @@ -223,12 +224,12 @@ void __swap_cache_del_folio(struct folio *folio, */ void swap_cache_del_folio(struct folio *folio) { + struct swap_cluster_info *ci; swp_entry_t entry = folio->swap; - struct address_space *address_space = swap_address_space(entry); - xa_lock_irq(&address_space->i_pages); - __swap_cache_del_folio(folio, entry, NULL); - xa_unlock_irq(&address_space->i_pages); + ci = swap_cluster_lock(__swap_entry_to_info(entry), swp_offset(entry)); + __swap_cache_del_folio(ci, folio, entry, NULL); + swap_cluster_unlock(ci); put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); @@ -236,6 +237,7 @@ void swap_cache_del_folio(struct folio *folio) /** * __swap_cache_replace_folio - Replace a folio in the swap cache. + * @ci: The locked swap cluster. * @old: The old folio to be replaced. * @new: The new folio. * @@ -244,65 +246,62 @@ void swap_cache_del_folio(struct folio *folio) * Replace an existing folio in the swap cache with a new folio. The * caller is responsible for setting up the new folio's flag and swap * entries. Replacement will take the new folio's swap entry value as * the starting offset to override all slots covered by the new folio. 
* - * Context: Caller must ensure both folios are locked, also lock the - * swap address_space that holds the old folio to avoid races. + * Context: Caller must ensure both folios are locked, and lock the + * cluster that holds the old folio to be replaced. */ -void __swap_cache_replace_folio(struct folio *old, struct folio *new) +void __swap_cache_replace_folio(struct swap_cluster_info *ci, + struct folio *old, struct folio *new) { swp_entry_t entry = new->swap; unsigned long nr_pages = folio_nr_pages(new); - unsigned long offset = swap_cache_index(entry); - unsigned long end = offset + nr_pages; - - XA_STATE(xas, &swap_address_space(entry)->i_pages, offset); + unsigned int ci_off = swp_cluster_offset(entry); + unsigned int ci_end = ci_off + nr_pages; + unsigned long old_tb, new_tb; VM_WARN_ON_ONCE(!folio_test_swapcache(old) || !folio_test_swapcache(new)); VM_WARN_ON_ONCE(!folio_test_locked(old) || !folio_test_locked(new)); VM_WARN_ON_ONCE(!entry.val); /* Swap cache still stores N entries instead of a high-order entry */ + new_tb = folio_to_swp_tb(new); do { - WARN_ON_ONCE(xas_store(&xas, new) != old); - xas_next(&xas); - } while (++offset < end); + old_tb = __swap_table_xchg(ci, ci_off, new_tb); + WARN_ON_ONCE(!swp_tb_is_folio(old_tb) || swp_tb_to_folio(old_tb) != old); + } while (++ci_off < ci_end); + + /* + * If the old folio is partially replaced (e.g., splitting a large + * folio, the old folio is shrunk, and new split sub folios replace + * the shrunk part), ensure the new folio doesn't overlap it. + */ + if (IS_ENABLED(CONFIG_DEBUG_VM) && + folio_order(old) != folio_order(new)) { + ci_off = swp_cluster_offset(old->swap); + ci_end = ci_off + folio_nr_pages(old); + while (ci_off++ < ci_end) + WARN_ON_ONCE(swp_tb_to_folio(__swap_table_get(ci, ci_off)) != old); + } } /** * swap_cache_clear_shadow - Clears a set of shadows in the swap cache. - * @type: Indicates the swap device. - * @begin: Beginning offset of the range. - * @end: Ending offset of the range. + * @entry: The starting index entry. + * @nr_ents: How many slots need to be cleared. * - * Context: Caller must ensure the range is valid and hold a reference to - * the swap device. + * Context: Caller must ensure the range is valid, all in one single cluster, + * not occupied by any folio, and lock the cluster. 
*/ -void swap_cache_clear_shadow(int type, unsigned long begin, - unsigned long end) +void __swap_cache_clear_shadow(swp_entry_t entry, int nr_ents) { - unsigned long curr = begin; - void *old; + struct swap_cluster_info *ci = __swap_entry_to_cluster(entry); + unsigned int ci_off = swp_cluster_offset(entry), ci_end; + unsigned long old; - for (;;) { - swp_entry_t entry = swp_entry(type, curr); - unsigned long index = curr & SWAP_ADDRESS_SPACE_MASK; - struct address_space *address_space = swap_address_space(entry); - XA_STATE(xas, &address_space->i_pages, index); - - xas_set_update(&xas, workingset_update_node); - - xa_lock_irq(&address_space->i_pages); - xas_for_each(&xas, old, min(index + (end - curr), SWAP_ADDRESS_SPACE_PAGES)) { - if (!xa_is_value(old)) - continue; - xas_store(&xas, NULL); - } - xa_unlock_irq(&address_space->i_pages); - - /* search the next swapcache until we meet end */ - curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); - if (curr > end) - break; - } + ci_end = ci_off + nr_ents; + do { + old = __swap_table_xchg(ci, ci_off, null_to_swp_tb()); + WARN_ON_ONCE(swp_tb_is_folio(old)); + } while (++ci_off < ci_end); } /* @@ -482,10 +481,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry)) goto fail_unlock; - /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (swap_cache_add_folio(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) - goto fail_unlock; - + swap_cache_add_folio(new_folio, entry, &shadow); memcg1_swapin(entry, 1); if (shadow) @@ -677,41 +673,6 @@ skip: return folio; } -int init_swap_address_space(unsigned int type, unsigned long nr_pages) -{ - struct address_space *spaces, *space; - unsigned int i, nr; - - nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); - spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL); - if (!spaces) - return -ENOMEM; - for (i = 0; i < nr; i++) { - space = spaces + i; - xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ); - atomic_set(&space->i_mmap_writable, 0); - space->a_ops = &swap_aops; - /* swap cache doesn't use writeback related tags */ - mapping_set_no_writeback_tags(space); - } - nr_swapper_spaces[type] = nr; - swapper_spaces[type] = spaces; - - return 0; -} - -void exit_swap_address_space(unsigned int type) -{ - int i; - struct address_space *spaces = swapper_spaces[type]; - - for (i = 0; i < nr_swapper_spaces[type]; i++) - VM_WARN_ON_ONCE(!mapping_empty(&spaces[i])); - kvfree(spaces); - nr_swapper_spaces[type] = 0; - swapper_spaces[type] = NULL; -} - static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start, unsigned long *end) { @@ -884,7 +845,7 @@ static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; -static int __init swap_init_sysfs(void) +static int __init swap_init(void) { int err; struct kobject *swap_kobj; @@ -899,11 +860,13 @@ static int __init swap_init_sysfs(void) pr_err("failed to register swap group\n"); goto delete_obj; } + /* Swap cache writeback is LRU based, no tags for it */ + mapping_set_no_writeback_tags(&swap_space); return 0; delete_obj: kobject_put(swap_kobj); return err; } -subsys_initcall(swap_init_sysfs); +subsys_initcall(swap_init); #endif diff --git a/mm/swap_table.h b/mm/swap_table.h new file mode 100644 index 000000000000..e1f7cc009701 --- /dev/null +++ b/mm/swap_table.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_SWAP_TABLE_H +#define _MM_SWAP_TABLE_H + +#include "swap.h" + +/* + * A swap table entry 
represents the status of a swap slot on a swap + * (physical or virtual) device. The swap table in each cluster is a + * 1:1 map of the swap slots in this cluster. + * + * Each swap table entry could be a pointer (folio), a XA_VALUE + * (shadow), or NULL. + */ + +/* + * Helpers for casting one type of info into a swap table entry. + */ +static inline unsigned long null_to_swp_tb(void) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(atomic_long_t)); + return 0; +} + +static inline unsigned long folio_to_swp_tb(struct folio *folio) +{ + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); + return (unsigned long)folio; +} + +static inline unsigned long shadow_swp_to_tb(void *shadow) +{ + BUILD_BUG_ON((BITS_PER_XA_VALUE + 1) != + BITS_PER_BYTE * sizeof(unsigned long)); + VM_WARN_ON_ONCE(shadow && !xa_is_value(shadow)); + return (unsigned long)shadow; +} + +/* + * Helpers for swap table entry type checking. + */ +static inline bool swp_tb_is_null(unsigned long swp_tb) +{ + return !swp_tb; +} + +static inline bool swp_tb_is_folio(unsigned long swp_tb) +{ + return !xa_is_value((void *)swp_tb) && !swp_tb_is_null(swp_tb); +} + +static inline bool swp_tb_is_shadow(unsigned long swp_tb) +{ + return xa_is_value((void *)swp_tb); +} + +/* + * Helpers for retrieving info from swap table. + */ +static inline struct folio *swp_tb_to_folio(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_folio(swp_tb)); + return (void *)swp_tb; +} + +static inline void *swp_tb_to_shadow(unsigned long swp_tb) +{ + VM_WARN_ON(!swp_tb_is_shadow(swp_tb)); + return (void *)swp_tb; +} + +/* + * Helpers for accessing or modifying the swap table of a cluster, + * the swap cluster must be locked. + */ +static inline void __swap_table_set(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + atomic_long_set(&ci->table[off], swp_tb); +} + +static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, + unsigned int off, unsigned long swp_tb) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + /* Ordering is guaranteed by cluster lock, relax */ + return atomic_long_xchg_relaxed(&ci->table[off], swp_tb); +} + +static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, + unsigned int off) +{ + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); + return atomic_long_read(&ci->table[off]); +} +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 51f781c43537..b183e96be289 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -46,6 +46,7 @@ #include #include #include +#include "swap_table.h" #include "internal.h" #include "swap.h" @@ -421,6 +422,34 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } +static int swap_cluster_alloc_table(struct swap_cluster_info *ci) +{ + WARN_ON(ci->table); + ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL); + if (!ci->table) + return -ENOMEM; + return 0; +} + +static void swap_cluster_free_table(struct swap_cluster_info *ci) +{ + unsigned int ci_off; + unsigned long swp_tb; + + if (!ci->table) + return; + + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) { + swp_tb = __swap_table_get(ci, ci_off); + if (!swp_tb_is_null(swp_tb)) + pr_err_once("swap: unclean swap space on swapoff: 0x%lx", + swp_tb); + } + + kfree(ci->table); + ci->table = NULL; +} + static void move_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci, struct list_head *list, enum swap_cluster_flags new_flags) @@ -703,6 +732,26 @@ static bool 
cluster_scan_range(struct swap_info_struct *si, return true; } +/* + * Currently, the swap table is not used for count tracking, just + * do a sanity check here to ensure nothing leaked, so the swap + * table should be empty upon freeing. + */ +static void swap_cluster_assert_table_empty(struct swap_cluster_info *ci, + unsigned int start, unsigned int nr) +{ + unsigned int ci_off = start % SWAPFILE_CLUSTER; + unsigned int ci_end = ci_off + nr; + unsigned long swp_tb; + + if (IS_ENABLED(CONFIG_DEBUG_VM)) { + do { + swp_tb = __swap_table_get(ci, ci_off); + VM_WARN_ON_ONCE(!swp_tb_is_null(swp_tb)); + } while (++ci_off < ci_end); + } +} + static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci, unsigned int start, unsigned char usage, unsigned int order) @@ -722,6 +771,7 @@ static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster ci->order = order; memset(si->swap_map + start, usage, nr_pages); + swap_cluster_assert_table_empty(ci, start, nr_pages); swap_range_alloc(si, nr_pages); ci->count += nr_pages; @@ -1124,7 +1174,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify(si->bdev, offset); offset++; } - swap_cache_clear_shadow(si->type, begin, end); + __swap_cache_clear_shadow(swp_entry(si->type, begin), nr_entries); /* * Make sure that try_to_unuse() observes si->inuse_pages reaching 0 @@ -1281,16 +1331,7 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp) if (!entry.val) return -ENOMEM; - /* - * XArray node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - if (swap_cache_add_folio(folio, entry, gfp | __GFP_NOMEMALLOC, NULL)) - goto out_free; + swap_cache_add_folio(folio, entry, NULL); return 0; @@ -1556,6 +1597,7 @@ static void swap_entries_free(struct swap_info_struct *si, mem_cgroup_uncharge_swap(entry, nr_pages); swap_range_free(si, offset, nr_pages); + swap_cluster_assert_table_empty(ci, offset, nr_pages); if (!ci->count) free_cluster(si, ci); @@ -2634,6 +2676,18 @@ static void wait_for_allocation(struct swap_info_struct *si) } } +static void free_cluster_info(struct swap_cluster_info *cluster_info, + unsigned long maxpages) +{ + int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + + if (!cluster_info) + return; + for (i = 0; i < nr_clusters; i++) + swap_cluster_free_table(&cluster_info[i]); + kvfree(cluster_info); +} + /* * Called after swap device's reference count is dead, so * neither scan nor allocation will use it. 
@@ -2768,12 +2822,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_file = p->swap_file; p->swap_file = NULL; - p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; cluster_info = p->cluster_info; + free_cluster_info(cluster_info, p->max); + p->max = 0; p->cluster_info = NULL; spin_unlock(&p->lock); spin_unlock(&swap_lock); @@ -2784,10 +2839,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); - exit_swap_address_space(p->type); inode = mapping->host; @@ -3171,8 +3224,11 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, if (!cluster_info) goto err; - for (i = 0; i < nr_clusters; i++) + for (i = 0; i < nr_clusters; i++) { spin_lock_init(&cluster_info[i].lock); + if (swap_cluster_alloc_table(&cluster_info[i])) + goto err_free; + } if (!(si->flags & SWP_SOLIDSTATE)) { si->global_cluster = kmalloc(sizeof(*si->global_cluster), @@ -3233,9 +3289,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, } return cluster_info; - err_free: - kvfree(cluster_info); + free_cluster_info(cluster_info, maxpages); err: return ERR_PTR(err); } @@ -3429,13 +3484,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } } - error = init_swap_address_space(si->type, maxpages); - if (error) - goto bad_swap_unlock_inode; - error = zswap_swapon(si->type, maxpages); if (error) - goto free_swap_address_space; + goto bad_swap_unlock_inode; /* * Flush any pending IO and dirty mappings before we start using this @@ -3470,8 +3521,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto out; free_swap_zswap: zswap_swapoff(si->type); -free_swap_address_space: - exit_swap_address_space(si->type); bad_swap_unlock_inode: inode_unlock(inode); bad_swap: @@ -3486,7 +3535,8 @@ bad_swap: spin_unlock(&swap_lock); vfree(swap_map); kvfree(zeromap); - kvfree(cluster_info); + if (cluster_info) + free_cluster_info(cluster_info, maxpages); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) diff --git a/mm/vmscan.c b/mm/vmscan.c index c79c6806560b..e170c12e2065 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -730,13 +730,18 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, { int refcount; void *shadow = NULL; + struct swap_cluster_info *ci; BUG_ON(!folio_test_locked(folio)); BUG_ON(mapping != folio_mapping(folio)); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + ci = swap_cluster_get_and_lock_irq(folio); + } else { spin_lock(&mapping->host->i_lock); - xa_lock_irq(&mapping->i_pages); + xa_lock_irq(&mapping->i_pages); + } + /* * The non racy check for a busy folio. 
* @@ -776,9 +781,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - __swap_cache_del_folio(folio, swap, shadow); + __swap_cache_del_folio(ci, folio, swap, shadow); memcg1_swapout(folio, swap); - xa_unlock_irq(&mapping->i_pages); + swap_cluster_unlock_irq(ci); put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); @@ -816,9 +821,12 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, return 1; cannot_free: - xa_unlock_irq(&mapping->i_pages); - if (!folio_test_swapcache(folio)) + if (folio_test_swapcache(folio)) { + swap_cluster_unlock_irq(ci); + } else { + xa_unlock_irq(&mapping->i_pages); spin_unlock(&mapping->host->i_lock); + } return 0; } From 8b47299a411a178d572aaac31ff7ab33a8bd27e2 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:57 +0800 Subject: [PATCH 280/372] mm, swap: mark swap address space ro and add context debug check The swap cache is now backed by the swap table, and the address space is not holding any mutable data anymore. The swap cache is now protected by the swap cluster lock instead of the XArray lock. All access to the swap cache is wrapped by swap cache helpers. Locking is mostly handled internally by the swap cache helpers; only a few __swap_cache_* helpers require the caller to lock the cluster themselves. It is worth noting that, unlike the XArray, the cluster lock is not IRQ safe. The swap cache was already very different from filemap, and it is now completely separated from filemap. Nothing needs to mark or change anything, or do a writeback callback, in an IRQ context. So explicitly document this and add a debug check to avoid further potential misuse. Also mark the swap cache space as read-only, to keep users from wrongly mixing unexpected filemap helpers with the swap cache. Link: https://lkml.kernel.org/r/20250916160100.31545-13-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Chris Li Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/swap.h | 12 +++++++++++- mm/swap_state.c | 3 ++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 742db4d46d23..adcd85fa8538 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -99,6 +99,16 @@ static __always_inline struct swap_cluster_info *__swap_cluster_lock( { struct swap_cluster_info *ci = __swap_offset_to_cluster(si, offset); + /* + * Nothing modifies swap cache in an IRQ context. All access to + * swap cache is wrapped by swap_cache_* helpers, and swap cache + * writeback is handled outside of IRQs. Swapin or swapout never + * occurs in IRQ, and neither does in-place split or replace. + * + * Besides, modifying swap cache requires synchronization with + * swap_map, which was never IRQ safe.
+ */ + VM_WARN_ON_ONCE(!in_task()); VM_WARN_ON_ONCE(percpu_ref_is_zero(&si->users)); /* race with swapoff */ if (irq) spin_lock_irq(&ci->lock); @@ -192,7 +202,7 @@ void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); #define SWAP_ADDRESS_SPACE_SHIFT 14 #define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) #define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) -extern struct address_space swap_space; +extern struct address_space swap_space __ro_after_init; static inline struct address_space *swap_address_space(swp_entry_t entry) { return &swap_space; diff --git a/mm/swap_state.c b/mm/swap_state.c index 2558a648d671..a1478cbff384 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,7 +37,8 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space swap_space __read_mostly = { +/* Set swap_space as read only as swap cache is handled by swap table */ +struct address_space swap_space __ro_after_init = { .a_ops = &swap_aops, }; From 685a17fbd35e66ae9b6440979b438caa2ae540cd Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:58 +0800 Subject: [PATCH 281/372] mm, swap: remove contention workaround for swap cache Swap cluster setup will try to shuffle the clusters on initialization. It was helpful to avoid contention for the swap cache space. The cluster size (2M) was much smaller than each swap cache space (64M), so shuffling the clusters means the allocator will try to allocate swap slots that are in different swap cache spaces for each CPU, reducing the chance of two CPUs using the same swap cache space, and hence reducing the contention. Now that the swap cache is managed by swap clusters, this shuffling is pointless. Just remove it, and clean up the related macros. This also improves HDD swap performance, since shuffled IO is a bad idea for HDDs, and the shuffling is now gone.
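For reference, the sizes mentioned above follow directly from the constants used in this series (assuming the usual 4K page size): a cluster covers SWAPFILE_CLUSTER * PAGE_SIZE = 512 * 4K = 2M, while a swap cache space covers SWAP_ADDRESS_SPACE_PAGES * PAGE_SIZE = (1 << 14) * 4K = 64M, i.e. 32 clusters per swap cache space.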
Tests have shown a ~40% performance gain for HDD [1]: doing sequential swap-in of 8G of data using 8 processes with usemem, averaged over 3 test runs: Before: 1270.91 KB/s per process After: 1849.54 KB/s per process Link: https://lore.kernel.org/linux-mm/CAMgjq7AdauQ8=X0zeih2r21QoV=-WWj1hyBxLWRzq74n-C=-Ng@mail.gmail.com/ [1] Link: https://lkml.kernel.org/r/20250916160100.31545-14-ryncsn@gmail.com Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202504241621.f27743ec-lkp@intel.com Signed-off-by: Kairui Song Acked-by: Chris Li Reviewed-by: Barry Song Acked-by: David Hildenbrand Suggested-by: Chris Li Cc: Baolin Wang Cc: Baoquan He Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/swap.h | 4 ---- mm/swapfile.c | 32 ++++++++------------------------ mm/zswap.c | 7 +++++-- 3 files changed, 13 insertions(+), 30 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index adcd85fa8538..fe5c20922082 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -198,10 +198,6 @@ int swap_writeout(struct folio *folio, struct swap_iocb **swap_plug); void __swap_writepage(struct folio *folio, struct swap_iocb **swap_plug); /* linux/mm/swap_state.c */ -/* One swap address space for each 64M swap space */ -#define SWAP_ADDRESS_SPACE_SHIFT 14 -#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) -#define SWAP_ADDRESS_SPACE_MASK (SWAP_ADDRESS_SPACE_PAGES - 1) extern struct address_space swap_space __ro_after_init; static inline struct address_space *swap_address_space(swp_entry_t entry) { return &swap_space; } diff --git a/mm/swapfile.c b/mm/swapfile.c index b183e96be289..314c5c10d3bd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3204,21 +3204,14 @@ static int setup_swap_map(struct swap_info_struct *si, return 0; } -#define SWAP_CLUSTER_INFO_COLS \ - DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info)) -#define SWAP_CLUSTER_SPACE_COLS \ - DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER) -#define SWAP_CLUSTER_COLS \ - max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS) - static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, union swap_header *swap_header, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); struct swap_cluster_info *cluster_info; - unsigned long i, j, idx; int err = -ENOMEM; + unsigned long i; cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL); if (!cluster_info) @@ -3267,22 +3260,13 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, INIT_LIST_HEAD(&si->frag_clusters[i]); } - /* - * Reduce false cache line sharing between cluster_info and - * sharing same address space.
- */ - for (j = 0; j < SWAP_CLUSTER_COLS; j++) { - for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { - struct swap_cluster_info *ci; - idx = i * SWAP_CLUSTER_COLS + j; - ci = cluster_info + idx; - if (idx >= nr_clusters) - continue; - if (ci->count) { - ci->flags = CLUSTER_FLAG_NONFULL; - list_add_tail(&ci->list, &si->nonfull_clusters[0]); - continue; - } + for (i = 0; i < nr_clusters; i++) { + struct swap_cluster_info *ci = &cluster_info[i]; + + if (ci->count) { + ci->flags = CLUSTER_FLAG_NONFULL; + list_add_tail(&ci->list, &si->nonfull_clusters[0]); + } else { ci->flags = CLUSTER_FLAG_FREE; list_add_tail(&ci->list, &si->free_clusters); } diff --git a/mm/zswap.c b/mm/zswap.c index 1b1edecde6a7..c1af782e54ec 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -225,10 +225,13 @@ static bool zswap_has_pool; * helpers and fwd declarations **********************************/ +/* One swap address space for each 64M swap space */ +#define ZSWAP_ADDRESS_SPACE_SHIFT 14 +#define ZSWAP_ADDRESS_SPACE_PAGES (1 << ZSWAP_ADDRESS_SPACE_SHIFT) static inline struct xarray *swap_zswap_tree(swp_entry_t swp) { return &zswap_trees[swp_type(swp)][swp_offset(swp) - >> SWAP_ADDRESS_SPACE_SHIFT]; + >> ZSWAP_ADDRESS_SPACE_SHIFT]; } #define zswap_pool_debug(msg, p) \ @@ -1674,7 +1677,7 @@ int zswap_swapon(int type, unsigned long nr_pages) struct xarray *trees, *tree; unsigned int nr, i; - nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); + nr = DIV_ROUND_UP(nr_pages, ZSWAP_ADDRESS_SPACE_PAGES); trees = kvcalloc(nr, sizeof(*tree), GFP_KERNEL); if (!trees) { pr_err("alloc failed, zswap disabled for swap type %d\n", type); From 07adc4cf1ecd316e7b6f4a142e5f5e96ce697e65 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:00:59 +0800 Subject: [PATCH 282/372] mm, swap: implement dynamic allocation of swap table Now that the swap table is cluster based, a free cluster can free its table, since no one should be modifying it. There could still be speculative readers, such as swap cache lookups; protect against them by making the table RCU protected. A swap table must be filled with null entries before being freed, so such readers will either see a NULL pointer or a null-filled table being lazily freed. On the allocation side, allocate the table when a cluster is first used, by any order. This way, we can reduce the memory usage of large swap devices significantly. This idea of dynamically releasing unused swap cluster data was initially suggested by Chris Li while proposing the cluster swap allocator, and it suits the swap table idea very well.
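To make the lifecycle above concrete, the reader side can be sketched as below. This is a simplified model of the swap_table_get() helper introduced by this patch (the sketch omits the entry-type helpers and any post-read verification the real callers perform):

	/*
	 * Speculative, lockless read of one swap table entry. The RCU read
	 * lock keeps the table from being freed under us; a free cluster
	 * reads as a NULL table pointer, and a table about to be lazily
	 * freed is already null-filled, so both cases yield a null entry.
	 */
	static unsigned long swap_table_read(struct swap_cluster_info *ci,
					     unsigned int off)
	{
		atomic_long_t *table;
		unsigned long swp_tb = 0;	/* a null swap table entry */

		rcu_read_lock();
		table = rcu_dereference(ci->table);
		if (table)
			swp_tb = atomic_long_read(&table[off]);
		rcu_read_unlock();

		return swp_tb;
	}

Callers such as swap_cache_get_folio() must still verify what they read (e.g. re-check the entry after grabbing a folio reference), since the slot can be reused as soon as the RCU read section ends.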
Link: https://lkml.kernel.org/r/20250916160100.31545-15-ryncsn@gmail.com Co-developed-by: Chris Li Signed-off-by: Chris Li Signed-off-by: Kairui Song Suggested-by: Chris Li Reviewed-by: Barry Song Cc: Baolin Wang Cc: Baoquan He Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/swap.h | 2 +- mm/swap_state.c | 9 +-- mm/swap_table.h | 37 ++++++++- mm/swapfile.c | 199 +++++++++++++++++++++++++++++++++++++----------- 4 files changed, 195 insertions(+), 52 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index fe5c20922082..8d8efdf1297a 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -36,7 +36,7 @@ struct swap_cluster_info { u16 count; u8 flags; u8 order; - atomic_long_t *table; /* Swap table entries, see mm/swap_table.h */ + atomic_long_t __rcu *table; /* Swap table entries, see mm/swap_table.h */ struct list_head list; }; diff --git a/mm/swap_state.c b/mm/swap_state.c index a1478cbff384..b13e9c4baa90 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -91,8 +91,8 @@ struct folio *swap_cache_get_folio(swp_entry_t entry) struct folio *folio; for (;;) { - swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), - swp_cluster_offset(entry)); + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); if (!swp_tb_is_folio(swp_tb)) return NULL; folio = swp_tb_to_folio(swp_tb); @@ -115,11 +115,10 @@ void *swap_cache_get_shadow(swp_entry_t entry) { unsigned long swp_tb; - swp_tb = __swap_table_get(__swap_entry_to_cluster(entry), - swp_cluster_offset(entry)); + swp_tb = swap_table_get(__swap_entry_to_cluster(entry), + swp_cluster_offset(entry)); if (swp_tb_is_shadow(swp_tb)) return swp_tb_to_shadow(swp_tb); - return NULL; } diff --git a/mm/swap_table.h b/mm/swap_table.h index e1f7cc009701..52254e455304 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -2,8 +2,15 @@ #ifndef _MM_SWAP_TABLE_H #define _MM_SWAP_TABLE_H +#include +#include #include "swap.h" +/* A typical flat array in each cluster as swap table */ +struct swap_table { + atomic_long_t entries[SWAPFILE_CLUSTER]; +}; + /* * A swap table entry represents the status of a swap slot on a swap * (physical or virtual) device. 
The swap table in each cluster is a @@ -76,22 +83,46 @@ static inline void *swp_tb_to_shadow(unsigned long swp_tb) static inline void __swap_table_set(struct swap_cluster_info *ci, unsigned int off, unsigned long swp_tb) { + atomic_long_t *table = rcu_dereference_protected(ci->table, true); + + lockdep_assert_held(&ci->lock); VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); - atomic_long_set(&ci->table[off], swp_tb); + atomic_long_set(&table[off], swp_tb); } static inline unsigned long __swap_table_xchg(struct swap_cluster_info *ci, unsigned int off, unsigned long swp_tb) { + atomic_long_t *table = rcu_dereference_protected(ci->table, true); + + lockdep_assert_held(&ci->lock); VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); /* Ordering is guaranteed by cluster lock, relax */ - return atomic_long_xchg_relaxed(&ci->table[off], swp_tb); + return atomic_long_xchg_relaxed(&table[off], swp_tb); } static inline unsigned long __swap_table_get(struct swap_cluster_info *ci, unsigned int off) { + atomic_long_t *table; + VM_WARN_ON_ONCE(off >= SWAPFILE_CLUSTER); - return atomic_long_read(&ci->table[off]); + table = rcu_dereference_check(ci->table, lockdep_is_held(&ci->lock)); + + return atomic_long_read(&table[off]); +} + +static inline unsigned long swap_table_get(struct swap_cluster_info *ci, + unsigned int off) +{ + atomic_long_t *table; + unsigned long swp_tb; + + rcu_read_lock(); + table = rcu_dereference(ci->table); + swp_tb = table ? atomic_long_read(&table[off]) : null_to_swp_tb(); + rcu_read_unlock(); + + return swp_tb; } #endif diff --git a/mm/swapfile.c b/mm/swapfile.c index 314c5c10d3bd..094e3e75849f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -59,6 +59,9 @@ static void swap_entries_free(struct swap_info_struct *si, static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); +static void move_cluster(struct swap_info_struct *si, + struct swap_cluster_info *ci, struct list_head *list, + enum swap_cluster_flags new_flags); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -105,6 +108,8 @@ static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; +static struct kmem_cache *swap_table_cachep; + static DEFINE_MUTEX(swapon_mutex); static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); @@ -401,10 +406,17 @@ static inline bool cluster_is_discard(struct swap_cluster_info *info) return info->flags == CLUSTER_FLAG_DISCARD; } +static inline bool cluster_table_is_alloced(struct swap_cluster_info *ci) +{ + return rcu_dereference_protected(ci->table, lockdep_is_held(&ci->lock)); +} + static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order) { if (unlikely(ci->flags > CLUSTER_FLAG_USABLE)) return false; + if (!cluster_table_is_alloced(ci)) + return false; if (!order) return true; return cluster_is_empty(ci) || order == ci->order; @@ -422,32 +434,90 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } -static int swap_cluster_alloc_table(struct swap_cluster_info *ci) -{ - WARN_ON(ci->table); - ci->table = kzalloc(sizeof(unsigned long) * SWAPFILE_CLUSTER, GFP_KERNEL); - if (!ci->table) - return -ENOMEM; - return 0; -} - static void swap_cluster_free_table(struct swap_cluster_info *ci) { unsigned int ci_off; - unsigned long swp_tb; + struct swap_table *table; - if (!ci->table) - return; + /* Only empty cluster's table is allow to be freed */ + lockdep_assert_held(&ci->lock); + 
VM_WARN_ON_ONCE(!cluster_is_empty(ci)); + for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) + VM_WARN_ON_ONCE(!swp_tb_is_null(__swap_table_get(ci, ci_off))); + table = (void *)rcu_dereference_protected(ci->table, true); + rcu_assign_pointer(ci->table, NULL); - for (ci_off = 0; ci_off < SWAPFILE_CLUSTER; ci_off++) { - swp_tb = __swap_table_get(ci, ci_off); - if (!swp_tb_is_null(swp_tb)) - pr_err_once("swap: unclean swap space on swapoff: 0x%lx", - swp_tb); + kmem_cache_free(swap_table_cachep, table); +} + +/* + * Allocate swap table for one cluster. Attempt an atomic allocation first, + * then fallback to sleeping allocation. + */ +static struct swap_cluster_info * +swap_cluster_alloc_table(struct swap_info_struct *si, + struct swap_cluster_info *ci) +{ + struct swap_table *table; + + /* + * Only cluster isolation from the allocator does table allocation. + * Swap allocator uses percpu clusters and holds the local lock. + */ + lockdep_assert_held(&ci->lock); + lockdep_assert_held(&this_cpu_ptr(&percpu_swap_cluster)->lock); + + /* The cluster must be free and was just isolated from the free list. */ + VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); + + table = kmem_cache_zalloc(swap_table_cachep, + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + if (table) { + rcu_assign_pointer(ci->table, table); + return ci; } - kfree(ci->table); - ci->table = NULL; + /* + * Try a sleep allocation. Each isolated free cluster may cause + * a sleep allocation, but there is a limited number of them, so + * the potential recursive allocation is limited. + */ + spin_unlock(&ci->lock); + if (!(si->flags & SWP_SOLIDSTATE)) + spin_unlock(&si->global_cluster_lock); + local_unlock(&percpu_swap_cluster.lock); + + table = kmem_cache_zalloc(swap_table_cachep, + __GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); + + /* + * Back to atomic context. We might have migrated to a new CPU with a + * usable percpu cluster. But just keep using the isolated cluster to + * make things easier. Migration indicates a slight change of workload + * so using a new free cluster might not be a bad idea, and the worst + * could happen with ignoring the percpu cluster is fragmentation, + * which is acceptable since this fallback and race is rare. + */ + local_lock(&percpu_swap_cluster.lock); + if (!(si->flags & SWP_SOLIDSTATE)) + spin_lock(&si->global_cluster_lock); + spin_lock(&ci->lock); + + /* Nothing except this helper should touch a dangling empty cluster. */ + if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { + if (table) + kmem_cache_free(swap_table_cachep, table); + return ci; + } + + if (!table) { + move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); + spin_unlock(&ci->lock); + return NULL; + } + + rcu_assign_pointer(ci->table, table); + return ci; } static void move_cluster(struct swap_info_struct *si, @@ -479,7 +549,7 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si, static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci) { - lockdep_assert_held(&ci->lock); + swap_cluster_free_table(ci); move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE); ci->order = 0; } @@ -494,15 +564,11 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info * this returns NULL for an non-empty list. 
*/ static struct swap_cluster_info *isolate_lock_cluster( - struct swap_info_struct *si, struct list_head *list) + struct swap_info_struct *si, struct list_head *list, int order) { - struct swap_cluster_info *ci, *ret = NULL; + struct swap_cluster_info *ci, *found = NULL; spin_lock(&si->lock); - - if (unlikely(!(si->flags & SWP_WRITEOK))) - goto out; - list_for_each_entry(ci, list, list) { if (!spin_trylock(&ci->lock)) continue; @@ -514,13 +580,19 @@ static struct swap_cluster_info *isolate_lock_cluster( list_del(&ci->list); ci->flags = CLUSTER_FLAG_NONE; - ret = ci; + found = ci; break; } -out: spin_unlock(&si->lock); - return ret; + if (found && !cluster_table_is_alloced(found)) { + /* Only an empty free cluster's swap table can be freed. */ + VM_WARN_ON_ONCE(list != &si->free_clusters); + VM_WARN_ON_ONCE(!cluster_is_empty(found)); + return swap_cluster_alloc_table(si, found); + } + + return found; } /* @@ -653,17 +725,27 @@ static void relocate_cluster(struct swap_info_struct *si, * added to free cluster list and its usage counter will be increased by 1. * Only used for initialization. */ -static void inc_cluster_info_page(struct swap_info_struct *si, +static int inc_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *cluster_info, unsigned long page_nr) { unsigned long idx = page_nr / SWAPFILE_CLUSTER; + struct swap_table *table; struct swap_cluster_info *ci; ci = cluster_info + idx; + if (!ci->table) { + table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL); + if (!table) + return -ENOMEM; + rcu_assign_pointer(ci->table, table); + } + ci->count++; VM_BUG_ON(ci->count > SWAPFILE_CLUSTER); VM_BUG_ON(ci->flags); + + return 0; } static bool cluster_reclaim_range(struct swap_info_struct *si, @@ -845,7 +927,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si, unsigned int found = SWAP_ENTRY_INVALID; do { - struct swap_cluster_info *ci = isolate_lock_cluster(si, list); + struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order); unsigned long offset; if (!ci) @@ -870,7 +952,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force) if (force) to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER; - while ((ci = isolate_lock_cluster(si, &si->full_clusters))) { + while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) { offset = cluster_offset(si, ci); end = min(si->max, offset + SWAPFILE_CLUSTER); to_scan--; @@ -1018,6 +1100,7 @@ new_cluster: done: if (!(si->flags & SWP_SOLIDSTATE)) spin_unlock(&si->global_cluster_lock); + return found; } @@ -1885,7 +1968,13 @@ swp_entry_t get_swap_page_of_type(int type) /* This is called for allocating swap entry, not cache */ if (get_swap_device_info(si)) { if (si->flags & SWP_WRITEOK) { + /* + * Grab the local lock to be compliant + * with swap table allocation.
+ */ + local_lock(&percpu_swap_cluster.lock); + offset = cluster_alloc_swap_entry(si, 0, 1); + local_unlock(&percpu_swap_cluster.lock); if (offset) { entry = swp_entry(si->type, offset); atomic_long_dec(&nr_swap_pages); @@ -2679,12 +2768,21 @@ static void wait_for_allocation(struct swap_info_struct *si) static void free_cluster_info(struct swap_cluster_info *cluster_info, unsigned long maxpages) { + struct swap_cluster_info *ci; int i, nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); if (!cluster_info) return; - for (i = 0; i < nr_clusters; i++) - swap_cluster_free_table(&cluster_info[i]); + for (i = 0; i < nr_clusters; i++) { + ci = cluster_info + i; + /* Cluster with bad marks count will have a remaining table */ + spin_lock(&ci->lock); + if (rcu_dereference_protected(ci->table, true)) { + ci->count = 0; + swap_cluster_free_table(ci); + } + spin_unlock(&ci->lock); + } kvfree(cluster_info); } @@ -2720,6 +2818,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; + unsigned int maxpages; int err, found = 0; if (!capable(CAP_SYS_ADMIN)) @@ -2826,8 +2925,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; zeromap = p->zeromap; p->zeromap = NULL; + maxpages = p->max; cluster_info = p->cluster_info; - free_cluster_info(cluster_info, p->max); p->max = 0; p->cluster_info = NULL; spin_unlock(&p->lock); @@ -2839,6 +2938,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->global_cluster = NULL; vfree(swap_map); kvfree(zeromap); + free_cluster_info(cluster_info, maxpages); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); @@ -3217,11 +3317,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, if (!cluster_info) goto err; - for (i = 0; i < nr_clusters; i++) { + for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); - if (swap_cluster_alloc_table(&cluster_info[i])) - goto err_free; - } if (!(si->flags & SWP_SOLIDSTATE)) { si->global_cluster = kmalloc(sizeof(*si->global_cluster), @@ -3240,16 +3337,23 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * See setup_swap_map(): header page, bad pages, * and the EOF part of the last cluster. */ - inc_cluster_info_page(si, cluster_info, 0); + err = inc_cluster_info_page(si, cluster_info, 0); + if (err) + goto err; for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr >= maxpages) continue; - inc_cluster_info_page(si, cluster_info, page_nr); + err = inc_cluster_info_page(si, cluster_info, page_nr); + if (err) + goto err; + } + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) { + err = inc_cluster_info_page(si, cluster_info, i); + if (err) + goto err; } - for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) - inc_cluster_info_page(si, cluster_info, i); INIT_LIST_HEAD(&si->free_clusters); INIT_LIST_HEAD(&si->full_clusters); @@ -3963,6 +4067,15 @@ static int __init swapfile_init(void) swapfile_maximum_size = arch_max_swapfile_size(); + /* + * Once a cluster is freed, its swap table content is read + * only, and all swap cache readers (swap_cache_*) verify + * the content before use. So it's safe to use RCU slab here.
+ */ + swap_table_cachep = kmem_cache_create("swap_table", + sizeof(struct swap_table), + 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); + #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) swap_migration_ad_supported = true; From f83938e4188c44b535c18903a9761759366aa626 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 17 Sep 2025 00:01:00 +0800 Subject: [PATCH 283/372] mm, swap: use a single page for swap table when the size fits We have a cluster size of 512 slots. Each slot consumes 8 bytes in the swap table, so the swap table size of each cluster is exactly one page (4K). When that is the case, allocate one page directly and disable the slab cache, to reduce the memory usage of the swap table and avoid fragmentation. Link: https://lkml.kernel.org/r/20250916160100.31545-16-ryncsn@gmail.com Co-developed-by: Chris Li Signed-off-by: Chris Li Signed-off-by: Kairui Song Acked-by: Chris Li Suggested-by: Chris Li Reviewed-by: Barry Song Cc: Baolin Wang Cc: Baoquan He Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kemeng Shi Cc: kernel test robot Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Yosry Ahmed Cc: Zi Yan Cc: SeongJae Park Signed-off-by: Andrew Morton --- mm/swap_table.h | 2 ++ mm/swapfile.c | 51 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/mm/swap_table.h b/mm/swap_table.h index 52254e455304..ea244a57a5b7 100644 --- a/mm/swap_table.h +++ b/mm/swap_table.h @@ -11,6 +11,8 @@ struct swap_table { atomic_long_t entries[SWAPFILE_CLUSTER]; }; +#define SWP_TABLE_USE_PAGE (sizeof(struct swap_table) == PAGE_SIZE) + /* * A swap table entry represents the status of a swap slot on a swap * (physical or virtual) device. The swap table in each cluster is a diff --git a/mm/swapfile.c b/mm/swapfile.c index 094e3e75849f..890b410d77b6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -434,6 +434,38 @@ static inline unsigned int cluster_offset(struct swap_info_struct *si, return cluster_index(si, ci) * SWAPFILE_CLUSTER; } +static struct swap_table *swap_table_alloc(gfp_t gfp) +{ + struct folio *folio; + + if (!SWP_TABLE_USE_PAGE) + return kmem_cache_zalloc(swap_table_cachep, gfp); + + folio = folio_alloc(gfp | __GFP_ZERO, 0); + if (folio) + return folio_address(folio); + return NULL; +} + +static void swap_table_free_folio_rcu_cb(struct rcu_head *head) +{ + struct folio *folio; + + folio = page_folio(container_of(head, struct page, rcu_head)); + folio_put(folio); +} + +static void swap_table_free(struct swap_table *table) +{ + if (!SWP_TABLE_USE_PAGE) { + kmem_cache_free(swap_table_cachep, table); + return; + } + + call_rcu(&(folio_page(virt_to_folio(table), 0)->rcu_head), + swap_table_free_folio_rcu_cb); +} + static void swap_cluster_free_table(struct swap_cluster_info *ci) { unsigned int ci_off; @@ -447,7 +479,7 @@ static void swap_cluster_free_table(struct swap_cluster_info *ci) table = (void *)rcu_dereference_protected(ci->table, true); rcu_assign_pointer(ci->table, NULL); - kmem_cache_free(swap_table_cachep, table); + swap_table_free(table); } /* @@ -470,8 +502,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si, /* The cluster must be free and was just isolated from the free list.
*/ VM_WARN_ON_ONCE(ci->flags || !cluster_is_empty(ci)); - table = kmem_cache_zalloc(swap_table_cachep, - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); + table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN); if (table) { rcu_assign_pointer(ci->table, table); return ci; @@ -487,8 +518,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si, spin_unlock(&si->global_cluster_lock); local_unlock(&percpu_swap_cluster.lock); - table = kmem_cache_zalloc(swap_table_cachep, - __GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); + table = swap_table_alloc(__GFP_HIGH | __GFP_NOMEMALLOC | GFP_KERNEL); /* * Back to atomic context. We might have migrated to a new CPU with a @@ -506,7 +536,7 @@ swap_cluster_alloc_table(struct swap_info_struct *si, /* Nothing except this helper should touch a dangling empty cluster. */ if (WARN_ON_ONCE(cluster_table_is_alloced(ci))) { if (table) - kmem_cache_free(swap_table_cachep, table); + swap_table_free(table); return ci; } @@ -734,7 +764,7 @@ static int inc_cluster_info_page(struct swap_info_struct *si, ci = cluster_info + idx; if (!ci->table) { - table = kmem_cache_zalloc(swap_table_cachep, GFP_KERNEL); + table = swap_table_alloc(GFP_KERNEL); if (!table) return -ENOMEM; rcu_assign_pointer(ci->table, table); @@ -4072,9 +4102,10 @@ static int __init swapfile_init(void) * only, and all swap cache readers (swap_cache_*) verifies * the content before use. So it's safe to use RCU slab here. */ - swap_table_cachep = kmem_cache_create("swap_table", - sizeof(struct swap_table), - 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); + if (!SWP_TABLE_USE_PAGE) + swap_table_cachep = kmem_cache_create("swap_table", + sizeof(struct swap_table), + 0, SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, NULL); #ifdef CONFIG_MIGRATION if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS)) From 72797d218b430c59f9e30be44aeb7632e0a2f211 Mon Sep 17 00:00:00 2001 From: Stanislav Fort Date: Fri, 5 Sep 2025 12:38:51 +0300 Subject: [PATCH 284/372] mm/memcg: v1: account event registrations and drop world-writable cgroup.event_control In cgroup v1, the legacy cgroup.event_control file is world-writable and allows unprivileged users to register unbounded events and thresholds. Each registration allocates kernel memory without capping or memcg charging, which can be abused to exhaust kernel memory in affected configurations. Make the following minimal changes: - Account allocations with __GFP_ACCOUNT in event and threshold registration. - Remove CFTYPE_WORLD_WRITABLE from cgroup.event_control to make it owner-writable. This does not affect cgroup v2. Allocations are still subject to kmem accounting being enabled, but this reduces unbounded global growth. Link: https://lkml.kernel.org/r/20250905093851.80596-1-disclosure@aisle.com Signed-off-by: Stanislav Fort Acked-by: Roman Gushchin Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol-v1.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 4b94731305b9..6eed14bff742 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -761,7 +761,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, size = thresholds->primary ? 
thresholds->primary->size + 1 : 1; /* Allocate memory for new array of thresholds */ - new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); + new = kmalloc(struct_size(new, entries, size), GFP_KERNEL_ACCOUNT); if (!new) { ret = -ENOMEM; goto unlock; @@ -924,7 +924,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, { struct mem_cgroup_eventfd_list *event; - event = kmalloc(sizeof(*event), GFP_KERNEL); + event = kmalloc(sizeof(*event), GFP_KERNEL_ACCOUNT); if (!event) return -ENOMEM; @@ -1087,7 +1087,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, CLASS(fd, cfile)(cfd); - event = kzalloc(sizeof(*event), GFP_KERNEL); + event = kzalloc(sizeof(*event), GFP_KERNEL_ACCOUNT); if (!event) return -ENOMEM; @@ -2053,7 +2053,7 @@ struct cftype mem_cgroup_legacy_files[] = { { .name = "cgroup.event_control", /* XXX: for compat */ .write = memcg_write_event_control, - .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, + .flags = CFTYPE_NO_PREFIX, }, { .name = "swappiness", From 152d42584a98cb2e542932e63b25cc563224129f Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Wed, 3 Sep 2025 15:30:59 +0800 Subject: [PATCH 285/372] samples/cgroup: rm unused MEMCG_EVENTS macro MEMCG_EVENTS is never referenced in the code. Just remove it. Link: https://lkml.kernel.org/r/20250903073100.2477-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: zhang jiao Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- samples/cgroup/memcg_event_listener.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/samples/cgroup/memcg_event_listener.c b/samples/cgroup/memcg_event_listener.c index a1667fe2489a..41425edbd88a 100644 --- a/samples/cgroup/memcg_event_listener.c +++ b/samples/cgroup/memcg_event_listener.c @@ -18,8 +18,6 @@ #include #include -#define MEMCG_EVENTS "memory.events" - /* Size of buffer to use when reading inotify events */ #define INOTIFY_BUFFER_SIZE 8192 From 6106864b878e1ce5ecab4b8ffffff85e9ec69b78 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 2 Sep 2025 08:36:11 +0000 Subject: [PATCH 286/372] maple_tree: remove lockdep_map_p typedef Having the ma_external_lock field exist when CONFIG_LOCKDEP=n isn't used anywhere, so just get rid of it. This also avoids generating a typedef called lockdep_map_p that could overlap with typedefs in other header files. Link: https://lkml.kernel.org/r/20250902-maple-lockdep-p-v1-1-3ae5a398a379@google.com Signed-off-by: Alice Ryhl Reviewed-by: Danilo Krummrich Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 05730171d201..47f9002ae92d 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -194,7 +194,6 @@ enum store_type { #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP -typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) @@ -207,7 +206,6 @@ typedef struct lockdep_map *lockdep_map_p; #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else -typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) @@ -230,8 +228,10 @@ typedef struct { /* nothing */ } lockdep_map_p; */ struct maple_tree { union { - spinlock_t ma_lock; - lockdep_map_p ma_external_lock; + spinlock_t ma_lock; +#ifdef CONFIG_LOCKDEP + struct lockdep_map *ma_external_lock; +#endif }; unsigned int ma_flags; void __rcu *ma_root; From 522abd92279a8ea55bcc687f77697d4c0aaba6c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Sep 2025 18:11:00 +0100 Subject: [PATCH 287/372] ptdesc: convert __page_flags to pt_flags Patch series "Some ptdesc cleanups". The first two patches here are preparation for splitting struct ptdesc from struct page and struct folio. I think their only dependency is on the memdesc_flags_t patches from August which is in mm-new. The third patch is just something I noticed while working on the code. This patch (of 3): Use the new memdesc_flags_t type to show that these are the same bits as page/folio/slab and thesefore have the zone/node/section information in them. Remove a use of ptdesc_folio() by converting pagetable_is_reserved() to use test_bit() directly. Link: https://lkml.kernel.org/r/20250908171104.2409217-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250908171104.2409217-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 ++++++- include/linux/mm_types.h | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a6bfa46937a8..8dd71392eba7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2934,6 +2934,11 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU */ +enum pt_flags { + PT_reserved = PG_reserved, + /* High bits are used for zone/node/section */ +}; + static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); @@ -2951,7 +2956,7 @@ static inline void *ptdesc_address(const struct ptdesc *pt) static inline bool pagetable_is_reserved(struct ptdesc *pt) { - return folio_test_reserved(ptdesc_folio(pt)); + return test_bit(PT_reserved, &pt->pt_flags.f); } /** diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ff2b4e13215f..f048dc80646e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -524,7 +524,7 @@ FOLIO_MATCH(compound_head, _head_3); /** * struct ptdesc - Memory descriptor for page tables. - * @__page_flags: Same as page flags. Powerpc only. + * @pt_flags: enum pt_flags plus zone/node/section. * @pt_rcu_head: For freeing page table pages. * @pt_list: List of used page tables. 
Used for s390 gmap shadow pages * (which are not linked into the user page tables) and x86 @@ -546,7 +546,7 @@ FOLIO_MATCH(compound_head, _head_3); * understanding of the issues. */ struct ptdesc { - unsigned long __page_flags; + memdesc_flags_t pt_flags; union { struct rcu_head pt_rcu_head; @@ -584,7 +584,7 @@ struct ptdesc { #define TABLE_MATCH(pg, pt) \ static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) -TABLE_MATCH(flags, __page_flags); +TABLE_MATCH(flags, pt_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); TABLE_MATCH(mapping, __page_mapping); From f0c92726e89f5c6c092526787465617a68af154f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Sep 2025 18:11:01 +0100 Subject: [PATCH 288/372] ptdesc: remove references to folios from __pagetable_ctor() and pagetable_dtor() In preparation for splitting struct ptdesc from struct page and struct folio, remove mentions of struct folio from these functions. Introduce ptdesc_nr_pages() to avoid using lruvec_stat_add/sub_folio() Link: https://lkml.kernel.org/r/20250908171104.2409217-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8dd71392eba7..25f56e209ec8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2097,9 +2097,9 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline long compound_nr(struct page *page) +static inline long compound_nr(const struct page *page) { - struct folio *folio = (struct folio *)page; + const struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags.f)) return 1; @@ -3066,21 +3066,26 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ +static inline unsigned long ptdesc_nr_pages(const struct ptdesc *ptdesc) +{ + return compound_nr(ptdesc_page(ptdesc)); +} + static inline void __pagetable_ctor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); - __folio_set_pgtable(folio); - lruvec_stat_add_folio(folio, NR_PAGETABLE); + __SetPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { - struct folio *folio = ptdesc_folio(ptdesc); + pg_data_t *pgdat = NODE_DATA(memdesc_nid(ptdesc->pt_flags)); ptlock_free(ptdesc); - __folio_clear_pgtable(folio); - lruvec_stat_sub_folio(folio, NR_PAGETABLE); + __ClearPageTable(ptdesc_page(ptdesc)); + mod_node_page_state(pgdat, NR_PAGETABLE, -ptdesc_nr_pages(ptdesc)); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) From 90ec2df9dd31653ceac4a35d2440b108bdf27550 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 8 Sep 2025 18:11:02 +0100 Subject: [PATCH 289/372] ptdesc: remove ptdesc_to_virt() This has the same effect as ptdesc_address() so convert the callers to use that and delete the function. Add kernel-doc for ptdesc_address(). 
Link: https://lkml.kernel.org/r/20250908171104.2409217-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- arch/arm/mm/mmu.c | 2 +- arch/s390/mm/pgalloc.c | 6 +++--- include/linux/mm.h | 11 ++++++----- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index edb7f56b7c91..8bac96e205ac 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -737,7 +737,7 @@ static void *__init late_alloc(unsigned long sz) if (!ptdesc || !pagetable_pte_ctor(NULL, ptdesc)) BUG(); - return ptdesc_to_virt(ptdesc); + return ptdesc_address(ptdesc); } static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr, diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d2f6f1f6d2fc..f56ee9aeac83 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -21,7 +21,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (!ptdesc) return NULL; - table = ptdesc_to_virt(ptdesc); + table = ptdesc_address(ptdesc); __arch_set_page_dat(table, 1UL << CRST_ALLOC_ORDER); return table; } @@ -119,7 +119,7 @@ struct ptdesc *page_table_alloc_pgste(struct mm_struct *mm) ptdesc = pagetable_alloc(GFP_KERNEL, 0); if (ptdesc) { - table = (u64 *)ptdesc_to_virt(ptdesc); + table = (u64 *)ptdesc_address(ptdesc); __arch_set_page_dat(table, 1); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); @@ -146,7 +146,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) pagetable_free(ptdesc); return NULL; } - table = ptdesc_to_virt(ptdesc); + table = ptdesc_address(ptdesc); __arch_set_page_dat(table, 1); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); diff --git a/include/linux/mm.h b/include/linux/mm.h index 25f56e209ec8..da6e0abad2cb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2944,11 +2944,12 @@ static inline struct ptdesc *virt_to_ptdesc(const void *x) return page_ptdesc(virt_to_page(x)); } -static inline void *ptdesc_to_virt(const struct ptdesc *pt) -{ - return page_to_virt(ptdesc_page(pt)); -} - +/** + * ptdesc_address - Virtual address of page table. + * @pt: Page table descriptor. + * + * Return: The first byte of the page table described by @pt. + */ static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); From d322f6a24ee5964a58294f61bf96a1b6404c676d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 8 Sep 2025 17:41:57 +0200 Subject: [PATCH 290/372] scripts/decode_stacktrace.sh: symbol: avoid trailing whitespaces A few patches slightly improving the output generated by decode_stacktrace.sh. This patch (of 3): Lines having a symbol to decode might not always have info after this symbol. It means ${info_str} might not be set, but it will always be printed after a space, causing trailing whitespaces. That's a detail, but when the output is opened with an editor marking these trailing whitespaces, that's a bit disturbing. It is easy to remove them by printing this variable with a space only if it is set. While at it, do the same with ${module} and print everything in one line. 
Link: https://lkml.kernel.org/r/20250908-decode_strace_indent-v1-0-28e5e4758080@kernel.org Link: https://lkml.kernel.org/r/20250908-decode_strace_indent-v1-1-28e5e4758080@kernel.org Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Carlos Llamas Reviewed-by: Breno Leitao Reviewed-by: Luca Ceresoli Cc: Carlos Llamas Cc: Elliot Berman Cc: Stephen Boyd Signed-off-by: Andrew Morton --- scripts/decode_stacktrace.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 17abc4e7a985..c6b5c14412f0 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -323,12 +323,7 @@ handle_line() { parse_symbol # modifies $symbol # Add up the line number to the symbol - if [[ -z ${module} ]] - then - echo "${words[@]}" "$symbol ${info_str}" - else - echo "${words[@]}" "$symbol $module ${info_str}" - fi + echo "${words[@]}" "${symbol}${module:+ ${module}}${info_str:+ ${info_str}}" } while read line; do From 4a2fc4897b5e0ca1e7a3cb4e32f44c7db3367dee Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 8 Sep 2025 17:41:58 +0200 Subject: [PATCH 291/372] scripts/decode_stacktrace.sh: symbol: preserve alignment With lines having a symbol to decode, the script was only trying to preserve the alignment for the timestamps, but not the rest, nor when the caller was set (CONFIG_PRINTK_CALLER=y). With this sample ... [ 52.080924] Call Trace: [ 52.080926] [ 52.080931] dump_stack_lvl+0x6f/0xb0 ... the script was producing the following output: [ 52.080924] Call Trace: [ 52.080926] [ 52.080931] dump_stack_lvl (arch/x86/include/asm/irqflags.h:19) (dump_stack_lvl is no longer aligned with : one missing space) With this other sample ... [ 52.080924][ T48] Call Trace: [ 52.080926][ T48] [ 52.080931][ T48] dump_stack_lvl+0x6f/0xb0 ... the script was producing the following output: [ 52.080924][ T48] Call Trace: [ 52.080926][ T48] [ 52.080931][ T48] dump_stack_lvl (arch/x86/include/asm/irqflags.h:19) (the misalignment is clearer here) That's because the script had a workaround for CONFIG_PRINTK_TIME=y only, see the previous comment called "Format timestamps with tabs". To always preserve spaces, they need to be recorded along the words. That is what is now done with the new 'spaces' array. Some notes: - 'extglob' is needed only for this operation, and that's why it is set in a dedicated subshell. - 'read' is used with '-r' not to treat a character in any special way, e.g. when followed by a space. - When a word is removed from the 'words' array, the corresponding space needs to be removed from the 'spaces' array as well. 
With the last sample, we now have: [ 52.080924][ T48] Call Trace: [ 52.080926][ T48] [ 52.080931][ T48] dump_stack_lvl (arch/x86/include/asm/irqflags.h:19) (the alignment is preserved) Link: https://lkml.kernel.org/r/20250908-decode_strace_indent-v1-2-28e5e4758080@kernel.org Signed-off-by: Matthieu Baerts (NGI0) Tested-by: Carlos Llamas Cc: Breno Leitao Cc: Elliot Berman Cc: Luca Ceresoli Cc: Stephen Boyd Signed-off-by: Andrew Morton --- scripts/decode_stacktrace.sh | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index c6b5c14412f0..0c92d6a7f777 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -255,10 +255,11 @@ handle_line() { basepath=${basepath%/init/main.c:*)} fi - local words + local words spaces - # Tokenize - read -a words <<<"$1" + # Tokenize: words and spaces to preserve the alignment + read -ra words <<<"$1" + IFS='#' read -ra spaces <<<"$(shopt -s extglob; echo "${1//+([^[:space:]])/#}")" # Remove hex numbers. Do it ourselves until it happens in the # kernel @@ -270,19 +271,13 @@ handle_line() { for i in "${!words[@]}"; do # Remove the address if [[ ${words[$i]} =~ \[\<([^]]+)\>\] ]]; then - unset words[$i] - fi - - # Format timestamps with tabs - if [[ ${words[$i]} == \[ && ${words[$i+1]} == *\] ]]; then - unset words[$i] - words[$i+1]=$(printf "[%13s\n" "${words[$i+1]}") + unset words[$i] spaces[$i] fi done if [[ ${words[$last]} =~ ^[0-9a-f]+\] ]]; then words[$last-1]="${words[$last-1]} ${words[$last]}" - unset words[$last] + unset words[$last] spaces[$last] last=$(( $last - 1 )) fi @@ -294,7 +289,7 @@ handle_line() { local info_str="" if [[ ${words[$last]} =~ \([A-Z]*\) ]]; then info_str=${words[$last]} - unset words[$last] + unset words[$last] spaces[$last] last=$(( $last - 1 )) fi @@ -311,7 +306,7 @@ handle_line() { modbuildid= fi symbol=${words[$last-1]} - unset words[$last-1] + unset words[$last-1] spaces[$last-1] else # The symbol is the last element, process it symbol=${words[$last]} @@ -323,7 +318,10 @@ handle_line() { parse_symbol # modifies $symbol # Add up the line number to the symbol - echo "${words[@]}" "${symbol}${module:+ ${module}}${info_str:+ ${info_str}}" + for i in "${!words[@]}"; do + echo -n "${spaces[i]}${words[i]}" + done + echo "${spaces[$last]}${symbol}${module:+ ${module}}${info_str:+ ${info_str}}" } while read line; do From e1831e8dd1c839088692c09c97031ce467db1a2a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 8 Sep 2025 17:41:59 +0200 Subject: [PATCH 292/372] scripts/decode_stacktrace.sh: code: preserve alignment With lines having a code to decode, the alignment was not preserved for the first line. With this sample ... [ 52.238089][ T55] RIP: 0010:__ip_queue_xmit+0x127c/0x1820 [ 52.238401][ T55] Code: c1 83 e0 07 48 c1 e9 03 83 c0 03 (...) ... the script was producing the following output: [ 52.238089][ T55] RIP: 0010:__ip_queue_xmit (...) [ 52.238401][ T55] Code: c1 83 e0 07 48 c1 e9 03 83 c0 03 (...) That's because scripts/decodecode doesn't preserve the alignment. No need to modify it, it is enough to give only the "Code: (...)" part to this script, and print the prefix without modifications. With the same sample, we now have: [ 52.238089][ T55] RIP: 0010:__ip_queue_xmit (...) [ 52.238401][ T55] Code: c1 83 e0 07 48 c1 e9 03 83 c0 03 (...) 
Link: https://lkml.kernel.org/r/20250908-decode_strace_indent-v1-3-28e5e4758080@kernel.org Signed-off-by: Matthieu Baerts (NGI0) Tested-by: Carlos Llamas Cc: Breno Leitao Cc: Elliot Berman Cc: Luca Ceresoli Cc: Stephen Boyd Signed-off-by: Andrew Morton --- scripts/decode_stacktrace.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 0c92d6a7f777..c73cb802a0a3 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -242,8 +242,10 @@ debuginfod_get_vmlinux() { decode_code() { local scripts=`dirname "${BASH_SOURCE[0]}"` + local lim="Code: " - echo "$1" | $scripts/decodecode + echo -n "${1%%${lim}*}" + echo "${lim}${1##*${lim}}" | $scripts/decodecode } handle_line() { From 602837268999912b3c0e0db21b67818ffbde7141 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 8 Sep 2025 16:55:34 +0200 Subject: [PATCH 293/372] readahead: add trace points Add a couple of trace points to make debugging readahead logic easier. [jack@suse.cz: v2] Link: https://lkml.kernel.org/r/20250909145849.5090-2-jack@suse.cz Link: https://lkml.kernel.org/r/20250908145533.31528-2-jack@suse.cz Signed-off-by: Jan Kara Tested-by: Pankaj Raghav Signed-off-by: Andrew Morton --- include/trace/events/readahead.h | 132 +++++++++++++++++++++++++++++++ mm/readahead.c | 8 ++ 2 files changed, 140 insertions(+) create mode 100644 include/trace/events/readahead.h diff --git a/include/trace/events/readahead.h b/include/trace/events/readahead.h new file mode 100644 index 000000000000..0997ac5eceab --- /dev/null +++ b/include/trace/events/readahead.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM readahead + +#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_READAHEAD_H + +#include +#include +#include +#include +#include + +TRACE_EVENT(page_cache_ra_unbounded, + TP_PROTO(struct inode *inode, pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_size), + + TP_ARGS(inode, index, nr_to_read, lookahead_size), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(pgoff_t, index) + __field(unsigned long, nr_to_read) + __field(unsigned long, lookahead_size) + ), + + TP_fast_assign( + __entry->i_ino = inode->i_ino; + __entry->s_dev = inode->i_sb->s_dev; + __entry->index = index; + __entry->nr_to_read = nr_to_read; + __entry->lookahead_size = lookahead_size; + ), + + TP_printk( + "dev=%d:%d ino=%lx index=%lu nr_to_read=%lu lookahead_size=%lu", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, + __entry->index, __entry->nr_to_read, __entry->lookahead_size + ) +); + +TRACE_EVENT(page_cache_ra_order, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra), + + TP_ARGS(inode, index, ra), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(pgoff_t, index) + __field(unsigned int, order) + __field(unsigned int, size) + __field(unsigned int, async_size) + __field(unsigned int, ra_pages) + ), + + TP_fast_assign( + __entry->i_ino = inode->i_ino; + __entry->s_dev = inode->i_sb->s_dev; + __entry->index = index; + __entry->order = ra->order; + __entry->size = ra->size; + __entry->async_size = ra->async_size; + __entry->ra_pages = ra->ra_pages; + ), + + TP_printk( + "dev=%d:%d ino=%lx index=%lu order=%u size=%u async_size=%u ra_pages=%u", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, + __entry->index, __entry->order, __entry->size, + 
__entry->async_size, __entry->ra_pages + ) +); + +DECLARE_EVENT_CLASS(page_cache_ra_op, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, + unsigned long req_count), + + TP_ARGS(inode, index, ra, req_count), + + TP_STRUCT__entry( + __field(unsigned long, i_ino) + __field(dev_t, s_dev) + __field(pgoff_t, index) + __field(unsigned int, order) + __field(unsigned int, size) + __field(unsigned int, async_size) + __field(unsigned int, ra_pages) + __field(unsigned int, mmap_miss) + __field(loff_t, prev_pos) + __field(unsigned long, req_count) + ), + + TP_fast_assign( + __entry->i_ino = inode->i_ino; + __entry->s_dev = inode->i_sb->s_dev; + __entry->index = index; + __entry->order = ra->order; + __entry->size = ra->size; + __entry->async_size = ra->async_size; + __entry->ra_pages = ra->ra_pages; + __entry->mmap_miss = ra->mmap_miss; + __entry->prev_pos = ra->prev_pos; + __entry->req_count = req_count; + ), + + TP_printk( + "dev=%d:%d ino=%lx index=%lu req_count=%lu order=%u size=%u async_size=%u ra_pages=%u mmap_miss=%u prev_pos=%lld", + MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, + __entry->index, __entry->req_count, __entry->order, + __entry->size, __entry->async_size, __entry->ra_pages, + __entry->mmap_miss, __entry->prev_pos + ) +); + +DEFINE_EVENT(page_cache_ra_op, page_cache_sync_ra, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, + unsigned long req_count), + TP_ARGS(inode, index, ra, req_count) +); + +DEFINE_EVENT(page_cache_ra_op, page_cache_async_ra, + TP_PROTO(struct inode *inode, pgoff_t index, struct file_ra_state *ra, + unsigned long req_count), + TP_ARGS(inode, index, ra, req_count) +); + +#endif /* _TRACE_FILEMAP_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/readahead.c b/mm/readahead.c index 406756d34309..3a4b5d58eeb6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -129,6 +129,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "internal.h" /* @@ -225,6 +228,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, */ unsigned int nofs = memalloc_nofs_save(); + trace_page_cache_ra_unbounded(mapping->host, index, nr_to_read, + lookahead_size); filemap_invalidate_lock_shared(mapping); index = mapping_align_index(mapping, index); @@ -470,6 +475,7 @@ void page_cache_ra_order(struct readahead_control *ractl, gfp_t gfp = readahead_gfp_mask(mapping); unsigned int new_order = ra->order; + trace_page_cache_ra_order(mapping->host, start, ra); if (!mapping_large_folio_support(mapping)) { ra->order = 0; goto fallback; @@ -554,6 +560,7 @@ void page_cache_sync_ra(struct readahead_control *ractl, unsigned long max_pages, contig_count; pgoff_t prev_index, miss; + trace_page_cache_sync_ra(ractl->mapping->host, index, ra, req_count); /* * Even if readahead is disabled, issue this request as readahead * as we'll need it to satisfy the requested range. The forced @@ -638,6 +645,7 @@ void page_cache_async_ra(struct readahead_control *ractl, if (folio_test_writeback(folio)) return; + trace_page_cache_async_ra(ractl->mapping->host, index, ra, req_count); folio_clear_readahead(folio); if (blk_cgroup_congested()) From 6e296bcf294ee9e8e024158ed1d279cf823eae63 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Fri, 12 Sep 2025 09:37:09 +0800 Subject: [PATCH 294/372] selftests/mm: fix hugepages cleanup too early Patch series "Fix va_high_addr_switch.sh test failure", v3. These three patches fix the va_high_addr_switch.sh test failure on x86_64. 
Patch 1 fixes the hugepage setup issue where nr_hugepages is reset too early in run_vmtests.sh, breaking the later va_high_addr_switch testing.

Patch 2 adds hugepage setup to the va_high_addr_switch test itself, so that it keeps working if run_vmtests.sh changes the hugepage setup someday.

Patch 3 fixes the test failure caused by the hint addr align method change in hugetlb_get_unmapped_area().

This patch (of 3):

The nr_hugepgs variable is used to keep the original nr_hugepages at the hugepage setup step at the beginning of the test run. After the userfaultfd test, a cleanup is executed: both /sys/kernel/mm/hugepages/hugepages-*/nr_hugepages and /proc/sys/vm/nr_hugepages are reset to the 'original' value from before the userfaultfd test started.

The issue is that the value used to restore /proc/sys/vm/nr_hugepages is nr_hugepgs, which is the initial value from before run_vmtests.sh ran, not the value from before the userfaultfd test started. The va_high_addr_switch.sh tests that run after that may see no hugepages available for testing and get EINVAL from mmap(HUGETLB), making the result invalid.

Also, before the pkey tests, nr_hugepgs is reused as a temporary variable to save nr_hugepages and restore it after the pkey tests finish. The original nr_hugepages value is no longer tracked, so there is no way to restore it after all tests finish.

Add a new variable orig_nr_hugepgs to save the original nr_hugepages, and restore it to nr_hugepages after all tests finish. Also change the nr_hugepgs variable to save /proc/sys/vm/nr_hugepages after hugepage setup; this is the value in effect before the userfaultfd test starts, and the correct value to restore after userfaultfd finishes. This resolves the va_high_addr_switch.sh breakage.

Link: https://lkml.kernel.org/r/20250912013711.3002969-1-chuhu@redhat.com
Link: https://lkml.kernel.org/r/20250912013711.3002969-2-chuhu@redhat.com
Signed-off-by: Chunyu Hu
Acked-by: David Hildenbrand
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/mm/run_vmtests.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 9e88cc25b9df..8115fc4526ed 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -174,13 +174,13 @@ fi
 # set proper nr_hugepages
 if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
-	nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+	orig_nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
 	needpgs=$((needmem_KB / hpgsize_KB))
 	tries=2
 	while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do
 		lackpgs=$((needpgs - freepgs))
 		echo 3 > /proc/sys/vm/drop_caches
-		if ! echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then
+		if !
echo $((lackpgs + orig_nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then
 			echo "Please run this test as root"
 			exit $ksft_skip
 		fi
 		while read -r name size unit; do
 			if [ "$name" = "HugePages_Free:" ]; then
 				freepgs=$size
 			fi
 		done < /proc/meminfo
 		tries=$((tries - 1))
 	done
+	nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
 	if [ "$freepgs" -lt "$needpgs" ]; then
 		printf "Not enough huge pages available (%d < %d)\n" \
 		       "$freepgs" "$needpgs"
@@ -540,6 +541,10 @@ CATEGORY="page_frag" run_test ./test_page_frag.sh nonaligned
 
 CATEGORY="rmap" run_test ./rmap
 
+if [ "${HAVE_HUGEPAGES}" = 1 ]; then
+	echo "$orig_nr_hugepgs" > /proc/sys/vm/nr_hugepages
+fi
+
 echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}" | tap_prefix
 echo "1..${count_total}" | tap_output

From d9d957bd7b61175db0723b6238d8f5a9740868a4 Mon Sep 17 00:00:00 2001
From: Chunyu Hu
Date: Fri, 12 Sep 2025 09:37:10 +0800
Subject: [PATCH 295/372] selftests/mm: alloc hugepages in va_high_addr_switch
 test

Allocate hugepages in the test internally, so we do not fully rely on run_vmtests.sh. If run_vmtests.sh has already set them up and the free hugepages are enough to run the test, leave the setup as it is; otherwise, set up the hugepages in the test. Save the original nr_hugepages value and restore it after the test finishes, leaving a stable test environment behind.

Link: https://lkml.kernel.org/r/20250912013711.3002969-3-chuhu@redhat.com
Signed-off-by: Chunyu Hu
Cc: David Hildenbrand
Signed-off-by: Andrew Morton
---
 .../selftests/mm/va_high_addr_switch.sh | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh
index 325de53966b6..a7d4b02b21dd 100755
--- a/tools/testing/selftests/mm/va_high_addr_switch.sh
+++ b/tools/testing/selftests/mm/va_high_addr_switch.sh
@@ -9,6 +9,7 @@
 # Kselftest framework requirement - SKIP code is 4.
 ksft_skip=4
+orig_nr_hugepages=0
 
 skip()
 {
@@ -76,5 +77,41 @@ check_test_requirements
 	esac
 }
 
+save_nr_hugepages()
+{
+	orig_nr_hugepages=$(cat /proc/sys/vm/nr_hugepages)
+}
+
+restore_nr_hugepages()
+{
+	echo "$orig_nr_hugepages" > /proc/sys/vm/nr_hugepages
+}
+
+setup_nr_hugepages()
+{
+	local needpgs=$1
+	while read -r name size unit; do
+		if [ "$name" = "HugePages_Free:" ]; then
+			freepgs="$size"
+			break
+		fi
+	done < /proc/meminfo
+	if [ "$freepgs" -ge "$needpgs" ]; then
+		return
+	fi
+	local hpgs=$((orig_nr_hugepages + needpgs))
+	echo $hpgs > /proc/sys/vm/nr_hugepages
+
+	local nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+	if [ "$nr_hugepgs" != "$hpgs" ]; then
+		restore_nr_hugepages
+		skip "$0: not enough hugepages for testing"
+	fi
+}
+
 check_test_requirements
+save_nr_hugepages
+# 4 keep_mapped pages, and one for tmp usage
+setup_nr_hugepages 5
 ./va_high_addr_switch --run-hugetlb
+restore_nr_hugepages

From c56325259abc026205c98964616dcc0df5648912 Mon Sep 17 00:00:00 2001
From: Chunyu Hu
Date: Fri, 12 Sep 2025 09:37:11 +0800
Subject: [PATCH 296/372] selftests/mm: fix va_high_addr_switch.sh failure on
 x86_64

The test will fail as below on x86_64 with CPU la57 support (it will skip if there is no la57 support). Note, the test requires nr_hugepages to be set first.
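For example, mirroring the setup that patch 2 of this series performs (5 pages: 4 keep_mapped pages plus one for tmp usage), something like:

    echo 5 > /proc/sys/vm/nr_hugepages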
# running bash ./va_high_addr_switch.sh
# -------------------------------------
# mmap(addr_switch_hint - pagesize, pagesize): 0x7f55b60fa000 - OK
# mmap(addr_switch_hint - pagesize, (2 * pagesize)): 0x7f55b60f9000 - OK
# mmap(addr_switch_hint, pagesize): 0x800000000000 - OK
# mmap(addr_switch_hint, 2 * pagesize, MAP_FIXED): 0x800000000000 - OK
# mmap(NULL): 0x7f55b60f9000 - OK
# mmap(low_addr): 0x40000000 - OK
# mmap(high_addr): 0x1000000000000 - OK
# mmap(high_addr) again: 0xffff55b6136000 - OK
# mmap(high_addr, MAP_FIXED): 0x1000000000000 - OK
# mmap(-1): 0xffff55b6134000 - OK
# mmap(-1) again: 0xffff55b6132000 - OK
# mmap(addr_switch_hint - pagesize, pagesize): 0x7f55b60fa000 - OK
# mmap(addr_switch_hint - pagesize, 2 * pagesize): 0x7f55b60f9000 - OK
# mmap(addr_switch_hint - pagesize/2 , 2 * pagesize): 0x7f55b60f7000 - OK
# mmap(addr_switch_hint, pagesize): 0x800000000000 - OK
# mmap(addr_switch_hint, 2 * pagesize, MAP_FIXED): 0x800000000000 - OK
# mmap(NULL, MAP_HUGETLB): 0x7f55b5c00000 - OK
# mmap(low_addr, MAP_HUGETLB): 0x40000000 - OK
# mmap(high_addr, MAP_HUGETLB): 0x1000000000000 - OK
# mmap(high_addr, MAP_HUGETLB) again: 0xffff55b5e00000 - OK
# mmap(high_addr, MAP_FIXED | MAP_HUGETLB): 0x1000000000000 - OK
# mmap(-1, MAP_HUGETLB): 0x7f55b5c00000 - OK
# mmap(-1, MAP_HUGETLB) again: 0x7f55b5a00000 - OK
# mmap(addr_switch_hint - pagesize, 2*hugepagesize, MAP_HUGETLB): 0x800000000000 - FAILED
# mmap(addr_switch_hint , 2*hugepagesize, MAP_FIXED | MAP_HUGETLB): 0x800000000000 - OK
# [FAIL]

addr_switch_hint is defined as DEFAULT_MAP_WINDOW in the failing test (on 64-bit x86_64, DEFAULT_MAP_WINDOW is defined as (1UL<<47) - pagesize).

Before commit cc92882ee218 ("mm: drop hugetlb_get_unmapped_area{_*} functions"), hugetlb_get_unmapped_area() for x86_64 was handled in arch code, arch/x86/mm/hugetlbpage.c, and addr was checked with map_address_hint_valid() after being aligned with 'addr &= huge_page_mask(h)', which rounds down. The check failed because addr is within the DEFAULT_MAP_WINDOW but (addr + len) is above the DEFAULT_MAP_WINDOW, so the code went through hugetlb_get_unmapped_area_topdown() to find an area within the DEFAULT_MAP_WINDOW.

After commit cc92882ee218 ("mm: drop hugetlb_get_unmapped_area{_*} functions"), the addr hint for hugetlb_get_unmapped_area() is rounded up and aligned to the hugepage size with ALIGN() for all arches. After the align, the addr is above the DEFAULT_MAP_WINDOW, and the map_address_hint_valid() check passes because both the aligned addr (addr0) and (addr + len) are above the DEFAULT_MAP_WINDOW; the aligned hint address (0x800000000000) is returned because a suitable gap is found there, in arch_get_unmapped_area_topdown().

To still cover the case where addr is within the DEFAULT_MAP_WINDOW and addr + len is above the DEFAULT_MAP_WINDOW, change the test to choose the last hugepage-aligned address within the DEFAULT_MAP_WINDOW as the hint addr, so that addr + len (2 hugepages) ends one hugepage above the DEFAULT_MAP_WINDOW. An already-aligned address is not affected by the kernel's page round up or round down, so it is deterministic.
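A small userspace sketch of the boundary arithmetic (the constants are illustrative stand-ins, assuming 2M hugepages and a 1UL<<47 window boundary; this is not the test code itself):

    #include <stdio.h>

    int main(void)
    {
        unsigned long hugepagesize = 2UL << 20;  /* assumed 2M huge pages */
        unsigned long window = 1UL << 47;        /* stand-in for the DEFAULT_MAP_WINDOW boundary */
        /* last hugepage-aligned address below the window (round down first
         * in case the boundary itself is not aligned) */
        unsigned long hint = (window & ~(hugepagesize - 1)) - hugepagesize;

        /* hint + 2 hugepages ends exactly one hugepage above the window,
         * which is the boundary-crossing case the test wants to keep */
        printf("hint=%#lx end=%#lx\n", hint, hint + 2 * hugepagesize);
        return 0;
    }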
Link: https://lkml.kernel.org/r/20250912013711.3002969-4-chuhu@redhat.com Fixes: cc92882ee218 ("mm: drop hugetlb_get_unmapped_area{_*} functions") Signed-off-by: Chunyu Hu Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index 896b3f73fc53..306eba825107 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -230,10 +230,10 @@ void testcases_init(void) .msg = "mmap(-1, MAP_HUGETLB) again", }, { - .addr = (void *)(addr_switch_hint - pagesize), + .addr = (void *)(addr_switch_hint - hugepagesize), .size = 2 * hugepagesize, .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(addr_switch_hint - pagesize, 2*hugepagesize, MAP_HUGETLB)", + .msg = "mmap(addr_switch_hint - hugepagesize, 2*hugepagesize, MAP_HUGETLB)", .low_addr_required = 1, .keep_mapped = 1, }, From e7a5f249e6db3b41a4618763e0f840639f3578f4 Mon Sep 17 00:00:00 2001 From: Chanwon Park Date: Mon, 8 Sep 2025 19:04:10 +0900 Subject: [PATCH 297/372] mm: re-enable kswapd when memory pressure subsides or demotion is toggled If kswapd fails to reclaim pages from a node MAX_RECLAIM_RETRIES in a row, kswapd on that node gets disabled. That is, the system won't wakeup kswapd for that node until page reclamation is observed at least once. That reclamation is mostly done by direct reclaim, which in turn enables kswapd back. However, on systems with CXL memory nodes, workloads with high anon page usage can disable kswapd indefinitely, without triggering direct reclaim. This can be reproduced with following steps: numa node 0 (32GB memory, 48 CPUs) numa node 2~5 (512GB CXL memory, 128GB each) (numa node 1 is disabled) swap space 8GB 1) Set /sys/kernel/mm/demotion_enabled to 0. 2) Set /proc/sys/kernel/numa_balancing to 0. 3) Run a process that allocates and random accesses 500GB of anon pages. 4) Let the process exit normally. During 3), free memory on node 0 gets lower than low watermark, and kswapd runs and depletes swap space. Then, kswapd fails consecutively and gets disabled. Allocation afterwards happens on CXL memory, so node 0 never gains more memory pressure to trigger direct reclaim. After 4), kswapd on node 0 remains disabled, and tasks running on that node are unable to swap. If you turn on NUMA_BALANCING_MEMORY_TIERING and demotion now, it won't work properly since kswapd is disabled. To mitigate this problem, reset kswapd_failures to 0 on following conditions: a) ZONE_BELOW_HIGH bit of a zone in hopeless node with a fallback memory node gets cleared. b) demotion_enabled is changed from false to true. Rationale for a): ZONE_BELOW_HIGH bit being cleared might be a sign that the node may be reclaimable afterwards. This won't help much if the memory-hungry process keeps running without freeing anything, but at least the node will go back to reclaimable state when the process exits. Rationale for b): When demotion_enabled is false, kswapd can only reclaim anon pages by swapping them out to swap space. If demotion_enabled is turned on, kswapd can demote anon pages to another node for reclaiming. So, the original failure count for determining reclaimability is no longer valid. Since kswapd_failures resets may be missed by ++ operation, it is changed from int to atomic_t. 
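The reproduction steps above translate to roughly the following shell sequence (paths as quoted in this description; "anon-hog" is a hypothetical stand-in for any program performing step 3):

    echo 0 > /sys/kernel/mm/demotion_enabled    # step 1
    echo 0 > /proc/sys/kernel/numa_balancing    # step 2
    ./anon-hog 500G                             # step 3: allocate and randomly access 500GB of anon pages
                                                # step 4: let it exit normally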
[akpm@linux-foundation.org: tweak whitespace] Link: https://lkml.kernel.org/r/aL6qGi69jWXfPc4D@pcw-MS-7D22 Signed-off-by: Chanwon Park Cc: Brendan Jackman Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Qi Zheng Cc: Shakeel Butt Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- mm/memory-tiers.c | 12 ++++++++++++ mm/page_alloc.c | 29 ++++++++++++++++++++++------- mm/show_mem.c | 3 ++- mm/vmscan.c | 14 +++++++------- mm/vmstat.c | 2 +- 6 files changed, 45 insertions(+), 17 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6c4eae96160d..7fb7331c5725 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1440,7 +1440,7 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_highest_zoneidx; - int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + atomic_t kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 0382b6942b8b..0ea5c13f10a2 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -942,11 +942,23 @@ static ssize_t demotion_enabled_store(struct kobject *kobj, const char *buf, size_t count) { ssize_t ret; + bool before = numa_demotion_enabled; ret = kstrtobool(buf, &numa_demotion_enabled); if (ret) return ret; + /* + * Reset kswapd_failures statistics. They may no longer be + * valid since the policy for kswapd has changed. + */ + if (before == false && numa_demotion_enabled == true) { + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + atomic_set(&pgdat->kswapd_failures, 0); + } + return count; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df6df302d0c5..6ff9f17d5f4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2860,14 +2860,29 @@ static void free_frozen_page_commit(struct zone *zone, */ return; } + high = nr_pcp_high(pcp, zone, batch, free_high); - if (pcp->count >= high) { - free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), - pcp, pindex); - if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && - zone_watermark_ok(zone, 0, high_wmark_pages(zone), - ZONE_MOVABLE, 0)) - clear_bit(ZONE_BELOW_HIGH, &zone->flags); + if (pcp->count < high) + return; + + free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), + pcp, pindex); + if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && + zone_watermark_ok(zone, 0, high_wmark_pages(zone), + ZONE_MOVABLE, 0)) { + struct pglist_data *pgdat = zone->zone_pgdat; + clear_bit(ZONE_BELOW_HIGH, &zone->flags); + + /* + * Assume that memory pressure on this node is gone and may be + * in a reclaimable state. If a memory fallback node exists, + * direct reclaim may not have been triggered, causing a + * 'hopeless node' to stay in that state for a while. Let + * kswapd work again by resetting kswapd_failures. 
+ */ + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES && + next_memory_node(pgdat->node_id) < MAX_NUMNODES) + atomic_set(&pgdat->kswapd_failures, 0); } } diff --git a/mm/show_mem.c b/mm/show_mem.c index 90a9a37116e7..3a4b5207635d 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -278,7 +278,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES), + str_yes_no(atomic_read(&pgdat->kswapd_failures) >= + MAX_RECLAIM_RETRIES), K(node_page_state(pgdat, NR_BALLOON_PAGES))); } diff --git a/mm/vmscan.c b/mm/vmscan.c index e170c12e2065..b2fc8b626d3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -518,7 +518,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; /* @@ -5101,7 +5101,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * blk_finish_plug(&plug); done: if (sc->nr_reclaimed > reclaimed) - pgdat->kswapd_failures = 0; + atomic_set(&pgdat->kswapd_failures, 0); } /****************************************************************************** @@ -6180,7 +6180,7 @@ again: * successful direct reclaim run will revive a dormant kswapd. */ if (reclaimable) - pgdat->kswapd_failures = 0; + atomic_set(&pgdat->kswapd_failures, 0); else if (sc->cache_trim_mode) sc->cache_trim_mode_failed = 1; } @@ -6492,7 +6492,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) int i; bool wmark_ok; - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { @@ -6902,7 +6902,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, wake_up_all(&pgdat->pfmemalloc_wait); /* Hopeless node, leave it to direct reclaim */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES) return true; if (pgdat_balanced(pgdat, order, highest_zoneidx)) { @@ -7170,7 +7170,7 @@ restart: } if (!sc.nr_reclaimed) - pgdat->kswapd_failures++; + atomic_inc(&pgdat->kswapd_failures); out: clear_reclaim_active(pgdat, highest_zoneidx); @@ -7429,7 +7429,7 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, return; /* Hopeless node, leave it to direct reclaim if possible */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || + if (atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES || (pgdat_balanced(pgdat, order, highest_zoneidx) && !pgdat_watermark_boosted(pgdat, highest_zoneidx))) { /* diff --git a/mm/vmstat.c b/mm/vmstat.c index e522decf6a72..bb09c032eecf 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1848,7 +1848,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n node_unreclaimable: %u" "\n start_pfn: %lu", - pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, + atomic_read(&pgdat->kswapd_failures) >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn); seq_putc(m, '\n'); } From 7cad96ae59b334a80b6e48c396f147a6c810c9ea Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Sep 2025 11:45:17 +0200 Subject: [PATCH 298/372] mm/gup: fix handling of errors from arch_make_folio_accessible() in follow_page_pte() 
In case we call arch_make_folio_accessible() and it fails, we would incorrectly return a value that is "!= 0" to the caller, indicating that we pinned all requested pages and that the caller can keep going. follow_page_pte() is not supposed to return error values, but instead "0" on failure and "1" on success -- we'll clean that up separately. In case we return "!= 0", the caller will just keep going pinning more pages. If we happen to pin a page afterwards, we're in trouble, because we essentially skipped some pages in the requested range. Staring at the arch_make_folio_accessible() implementation on s390x, I assume it should actually never really fail unless something unexpected happens (BUG?). So let's not CC stable and just fix common code to do the right thing. Clean up the code a bit now that there is no reason to store the return value of arch_make_folio_accessible(). Link: https://lkml.kernel.org/r/20250908094517.303409-1-david@redhat.com Fixes: f28d43636d6f ("mm/gup/writeback: add callbacks for inaccessible pages") Signed-off-by: David Hildenbrand Reviewed-by: Claudio Imbrenda Cc: Jason Gunthorpe Cc: John Hubbard Cc: Peter Xu Cc: Christian Borntraeger Signed-off-by: Andrew Morton --- mm/gup.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 00f30e03f736..a8ba5112e4d0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2898,12 +2898,9 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, * see Documentation/core-api/pin_user_pages.rst for * details. */ - if (flags & FOLL_PIN) { - ret = arch_make_folio_accessible(folio); - if (ret) { - gup_put_folio(folio, 1, flags); - goto pte_unmap; - } + if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) { + gup_put_folio(folio, 1, flags); + goto pte_unmap; } folio_set_referenced(folio); pages[*nr] = page; From 10b9feee2d0dc81c44f7a9e69e7a894e33f8c4a1 Mon Sep 17 00:00:00 2001 From: Francois Dugast Date: Mon, 8 Sep 2025 11:10:52 +0200 Subject: [PATCH 299/372] mm/hmm: populate PFNs from PMD swap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Once support for THP migration of zone device pages is enabled, device private swap entries will be found during the walk not only for PTEs but also for PMDs. Therefore, it is necessary to extend to PMDs the special handling which is already in place for PTEs when device private pages are owned by the caller: instead of faulting or skipping the range, the correct behavior is to use the swap entry to populate HMM PFNs. This change is a prerequisite to make use of device-private THP in drivers using drivers/gpu/drm/drm_pagemap, such as xe. Even though subsequent PFNs can be inferred when handling large order PFNs, the PFN list is still fully populated because this is currently expected by HMM users. In case this changes in the future, that is all HMM users support a sparsely populated PFN list, the for() loop can be made to skip remaining PFNs for the current order. A quick test shows the loop takes about 10 ns, roughly 20 times faster than without this optimization. 
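Should such sparse support ever materialize, the tail of the fill loop could be dropped; a hedged sketch of that hypothetical follow-up (not part of this patch):

    /*
     * Hypothetical variant: only record the head entry; cpu_flags already
     * encodes the order via hmm_pfn_flags_order(), so the remaining
     * (end - start) / PAGE_SIZE - 1 slots could stay untouched.
     */
    hmm_pfns[0] &= HMM_PFN_INOUT_FLAGS;
    hmm_pfns[0] |= pfn | cpu_flags;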
Link: https://lkml.kernel.org/r/20250908091052.612303-1-francois.dugast@intel.com Signed-off-by: Francois Dugast Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Zi Yan Cc: Alistair Popple Cc: Balbir Singh Cc: David Airlie Cc: Christian König Cc: Mika Penttilä Cc: Thomas Hellstrom Cc: Matthew Brost Signed-off-by: Andrew Morton --- mm/hmm.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 65 insertions(+), 5 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index d545e2494994..3e00f08722d5 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -326,6 +326,68 @@ fault: return hmm_vma_fault(addr, end, required_fault, walk); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION +static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, + unsigned long end, unsigned long *hmm_pfns, + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long npages = (end - start) >> PAGE_SHIFT; + unsigned long addr = start; + swp_entry_t entry = pmd_to_swp_entry(pmd); + unsigned int required_fault; + + if (is_device_private_entry(entry) && + pfn_swap_entry_folio(entry)->pgmap->owner == + range->dev_private_owner) { + unsigned long cpu_flags = HMM_PFN_VALID | + hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT); + unsigned long pfn = swp_offset_pfn(entry); + unsigned long i; + + if (is_writable_device_private_entry(entry)) + cpu_flags |= HMM_PFN_WRITE; + + /* + * Fully populate the PFN list though subsequent PFNs could be + * inferred, because drivers which are not yet aware of large + * folios probably do not support sparsely populated PFN lists. + */ + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + hmm_pfns[i] &= HMM_PFN_INOUT_FLAGS; + hmm_pfns[i] |= pfn | cpu_flags; + } + + return 0; + } + + required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns, + npages, 0); + if (required_fault) { + if (is_device_private_entry(entry)) + return hmm_vma_fault(addr, end, required_fault, walk); + else + return -EFAULT; + } + + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); +} +#else +static int hmm_vma_handle_absent_pmd(struct mm_walk *walk, unsigned long start, + unsigned long end, unsigned long *hmm_pfns, + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long npages = (end - start) >> PAGE_SHIFT; + + if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) + return -EFAULT; + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); +} +#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */ + static int hmm_vma_walk_pmd(pmd_t *pmdp, unsigned long start, unsigned long end, @@ -354,11 +416,9 @@ again: return hmm_pfns_fill(start, end, range, 0); } - if (!pmd_present(pmd)) { - if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) - return -EFAULT; - return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); - } + if (!pmd_present(pmd)) + return hmm_vma_handle_absent_pmd(walk, start, end, hmm_pfns, + pmd); if (pmd_trans_huge(pmd)) { /* From 6ce3bc990cf0cc1ea1bfcae7149e095afd898d41 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 8 Sep 2025 17:07:41 +0800 Subject: [PATCH 300/372] mm: skip mlocked THPs that are underused early in deferred_split_scan() When we stumble over a fully-mapped mlocked THP in the deferred shrinker, it does not make sense to try to detect whether it is underused, because try_to_map_unused_to_zeropage(), called while splitting the folio, will not actually replace any zeroed pages by the shared zeropage. 
Splitting the folio in that case does not make any sense, so let's not even scan to check if the folio is underused. Link: https://lkml.kernel.org/r/20250908090741.61519-1-lance.yang@linux.dev Signed-off-by: Lance Yang Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Usama Arif Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Lance Yang Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Cc: Zi Yan Cc: Kiryl Shutsemau Signed-off-by: Andrew Morton --- mm/huge_memory.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a9fc7a09167a..5acca24bbabb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -4175,6 +4175,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, bool underused = false; if (!folio_test_partially_mapped(folio)) { + /* + * See try_to_map_unused_to_zeropage(): we cannot + * optimize zero-filled pages after splitting an + * mlocked folio. + */ + if (folio_test_mlocked(folio)) + goto next; underused = thp_underused(folio); if (!underused) goto next; From 5919f1282141f29345432a4f1dadf34716f3dbec Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Mon, 8 Sep 2025 14:26:14 +0800 Subject: [PATCH 301/372] mm/shmem: remove unused entry_order after large swapin rework After commit 93c0476e7057 ("mm/shmem, swap: rework swap entry and index calculation for large swapin"), xas_get_order() will never return a non-zero value for `entry_order` in shmem_split_large_entry(). As a result, the local variable `entry_order` is effectively unused. Clean up the code by removing `entry_order` and directly using `cur_order`. This change is purely a refactor and has no functional impact. No functional change intended. Link: https://lkml.kernel.org/r/20250908062614.89880-1-liu.yun@linux.dev Signed-off-by: Jackie Liu Reviewed-by: Baolin Wang Cc: Hugh Dickins Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/shmem.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index cf0171a72e47..7db01567b645 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2173,7 +2173,7 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index, { struct address_space *mapping = inode->i_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, index, 0); - int split_order = 0, entry_order; + int split_order = 0; int i; /* Convert user data gfp flags to xarray node gfp flags */ @@ -2191,15 +2191,12 @@ static int shmem_split_large_entry(struct inode *inode, pgoff_t index, goto unlock; } - entry_order = xas_get_order(&xas); - - if (!entry_order) + cur_order = xas_get_order(&xas); + if (!cur_order) goto unlock; /* Try to split large swap entry in pagecache */ - cur_order = entry_order; - swap_index = round_down(index, 1 << entry_order); - + swap_index = round_down(index, 1 << cur_order); split_order = xas_try_split_min_order(cur_order); while (cur_order > 0) { From 123bcf284205b2513c4172c50da8d193f8f8ab3d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 15 Sep 2025 14:27:54 -0700 Subject: [PATCH 302/372] alloc_tag: use release_pages() in the cleanup path Patch series "Minor fixes for memory allocation profiling", v2. Over the last couple months I gathered a few reports of minor issues in memory allocation profiling which are addressed in this patchset. This patch (of 2): When bulk-freeing an array of pages use release_pages() instead of freeing them page-by-page. 
Link: https://lkml.kernel.org/r/20250915212756.3998938-1-surenb@google.com
Link: https://lkml.kernel.org/r/20250915212756.3998938-2-surenb@google.com
Signed-off-by: Suren Baghdasaryan
Suggested-by: Andrew Morton
Suggested-by: Usama Arif
Acked-by: Shakeel Butt
Acked-by: Usama Arif
Cc: David Wang <00107082@163.com>
Cc: Johannes Weiner
Cc: Kent Overstreet
Cc: Pasha Tatashin
Cc: Sourav Panda
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 lib/alloc_tag.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index e9b33848700a..715315f5d9ba 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -438,9 +438,10 @@ static int vm_module_tags_populate(void)
 		if (nr < more_pages ||
 		    vmap_pages_range(phys_end, phys_end + (nr << PAGE_SHIFT), PAGE_KERNEL,
 				     next_page, PAGE_SHIFT) < 0) {
+			release_pages_arg arg = { .pages = next_page };
+
 			/* Clean up and error out */
-			for (int i = 0; i < nr; i++)
-				__free_page(next_page[i]);
+			release_pages(arg, nr);
 			return -ENOMEM;
 		}
 
@@ -682,11 +683,10 @@ static int __init alloc_mod_tags_mem(void)
 
 static void __init free_mod_tags_mem(void)
 {
-	int i;
+	release_pages_arg arg = { .pages = vm_module_tags->pages };
 
 	module_tags.start_addr = 0;
-	for (i = 0; i < vm_module_tags->nr_pages; i++)
-		__free_page(vm_module_tags->pages[i]);
+	release_pages(arg, vm_module_tags->nr_pages);
 	kfree(vm_module_tags->pages);
 	free_vm_area(vm_module_tags);
 }

From 9e8a0bbb128ec9379ce271ccecdfb022c483da0b Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan
Date: Mon, 15 Sep 2025 14:27:55 -0700
Subject: [PATCH 303/372] alloc_tag: prevent enabling memory profiling if it
 was shut down

Memory profiling can be shut down due to reasons like a failure during initialization. When this happens, the user should not be able to re-enable it. The current sysctl interface does not handle this properly and will allow re-enabling memory profiling. Fix this by checking for this condition in the sysctl write operation.

Link: https://lkml.kernel.org/r/20250915212756.3998938-3-surenb@google.com
Signed-off-by: Suren Baghdasaryan
Acked-by: Shakeel Butt
Acked-by: Usama Arif
Cc: David Wang <00107082@163.com>
Cc: Johannes Weiner
Cc: Kent Overstreet
Cc: Pasha Tatashin
Cc: Sourav Panda
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 lib/alloc_tag.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 715315f5d9ba..f79217427d81 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -766,6 +766,20 @@ struct page_ext_operations page_alloc_tagging_ops = {
 EXPORT_SYMBOL(page_alloc_tagging_ops);
 
 #ifdef CONFIG_SYSCTL
+/*
+ * Not using proc_do_static_key() directly to prevent enabling profiling
+ * after it was shut down.
+ */
+static int proc_mem_profiling_handler(const struct ctl_table *table, int write,
+				      void *buffer, size_t *lenp, loff_t *ppos)
+{
+	if (!mem_profiling_support && write)
+		return -EINVAL;
+
+	return proc_do_static_key(table, write, buffer, lenp, ppos);
+}
+
+
 static struct ctl_table memory_allocation_profiling_sysctls[] = {
 	{
 		.procname = "mem_profiling",
@@ -775,7 +789,7 @@ static struct ctl_table memory_allocation_profiling_sysctls[] = {
 #else
 		.mode = 0644,
 #endif
-		.proc_handler = proc_do_static_key,
+		.proc_handler = proc_mem_profiling_handler,
 	},
 };

From 7ffc923e252ca89e58228adbdc500a9fdea43c38 Mon Sep 17 00:00:00 2001
From: Suren Baghdasaryan
Date: Mon, 15 Sep 2025 14:27:56 -0700
Subject: [PATCH 304/372] alloc_tag: avoid warnings when freeing non-compound
 "tail" pages

When freeing "tail" pages of a non-compound high-order page, we properly subtract the allocation tag counters; however, when these pages are later released, alloc_tag_sub() will issue warnings because the tags for these pages are NULL. This issue was originally anticipated by Vlastimil in his review [1] and then recently reported by David. Prevent the warnings by marking the tags empty.

Link: https://lkml.kernel.org/r/20250915212756.3998938-4-surenb@google.com
Link: https://lore.kernel.org/all/6db0f0c8-81cb-4d04-9560-ba73d63db4b8@suse.cz/ [1]
Signed-off-by: Suren Baghdasaryan
Suggested-by: David Wang <00107082@163.com>
Acked-by: Shakeel Butt
Acked-by: Usama Arif
Cc: Johannes Weiner
Cc: Kent Overstreet
Cc: Pasha Tatashin
Cc: Sourav Panda
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/page_alloc.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6ff9f17d5f4e..cf38d499e045 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5240,9 +5240,16 @@ static void ___free_pages(struct page *page, unsigned int order,
 		__free_frozen_pages(page, order, fpi_flags);
 	else if (!head) {
 		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
-		while (order-- > 0)
+		while (order-- > 0) {
+			/*
+			 * The "tail" pages of this non-compound high-order
+			 * page will have no code tags, so to avoid warnings
+			 * mark them as empty.
+			 */
+			clear_page_tag_ref(page + (1 << order));
 			__free_frozen_pages(page + (1 << order), order,
 					    fpi_flags);
+		}
 	}
 }

From 032c31127f27acb1b8152b512830ecef04ed2ebc Mon Sep 17 00:00:00 2001
From: Brian Norris
Date: Tue, 9 Sep 2025 13:13:57 -0700
Subject: [PATCH 305/372] mm: vm_event_item: explicit #include for THREAD_SIZE

This header uses THREAD_SIZE, which is provided by the thread_info.h header but is not included here. Depending on the #include ordering in other files, this can produce preprocessor errors.
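A hypothetical translation unit illustrating the failure mode (the exact diagnostics depend on architecture and config):

    /* assume no earlier include chain has defined THREAD_SIZE here */
    #include <linux/vm_event_item.h>  /* before this patch: may fail, THREAD_SIZE undeclared */
    /* including <linux/thread_info.h> first -- or applying this patch -- avoids the error */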
Link: https://lkml.kernel.org/r/20250909201419.827638-1-briannorris@chromium.org
Signed-off-by: Brian Norris
Reviewed-by: Lorenzo Stoakes
Cc: David Hildenbrand
Cc: Liam Howlett
Cc: Michal Hocko
Cc: Mike Rapoport
Cc: Suren Baghdasaryan
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 include/linux/vm_event_item.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 9e15a088ba38..92f80b4d69a6 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -2,6 +2,8 @@
 #ifndef VM_EVENT_ITEM_H_INCLUDED
 #define VM_EVENT_ITEM_H_INCLUDED
 
+#include <linux/thread_info.h>
+
 #ifdef CONFIG_ZONE_DMA
 #define DMA_ZONE(xx) xx##_DMA,
 #else

From 9b375adb398493f096d776721caac8fe26bfb2b7 Mon Sep 17 00:00:00 2001
From: Zach O'Keefe
Date: Tue, 9 Sep 2025 12:05:34 -0700
Subject: [PATCH 306/372] selftests/mm: remove PROT_EXEC req from file-collapse
 tests

As of the v6.8 commit 7fbb5e188248 ("mm: remove VM_EXEC requirement for THP eligibility"), THP collapse no longer requires file-backed mappings to be created with PROT_EXEC. Remove the overly-strict dependency from the THP collapse tests so we test the least-strict requirement for success.

Link: https://lkml.kernel.org/r/20250909190534.512801-1-zokeefe@google.com
Signed-off-by: Zach O'Keefe
Reviewed-by: Baolin Wang
Acked-by: David Hildenbrand
Reviewed-by: Dev Jain
Cc: Liam Howlett
Cc: Lorenzo Stoakes
Cc: Ryan Roberts
Cc: Shuah Khan
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/mm/khugepaged.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/mm/khugepaged.c b/tools/testing/selftests/mm/khugepaged.c
index a18c50d51141..3fe7ef04ac62 100644
--- a/tools/testing/selftests/mm/khugepaged.c
+++ b/tools/testing/selftests/mm/khugepaged.c
@@ -394,7 +394,7 @@ static void *file_setup_area(int nr_hpages)
 		perror("open()");
 		exit(EXIT_FAILURE);
 	}
-	p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
+	p = mmap(BASE_ADDR, size, PROT_READ,
 		 MAP_PRIVATE, finfo.fd, 0);
 	if (p == MAP_FAILED || p != BASE_ADDR) {
 		perror("mmap()");

From fa17bcd5f65ed702df001579cca8c885fa6bf3e7 Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski
Date: Tue, 26 Aug 2025 11:37:21 -0400
Subject: [PATCH 307/372] mm: make folio page count functions return unsigned

As raised by Andrew [1], a folio/compound page never spans a negative number of pages. Consequently, let's use "unsigned long" instead of "long" consistently for folio_nr_pages(), folio_large_nr_pages() and compound_nr().

Using "unsigned long" as the return value is fine, because even "(long)-folio_nr_pages()" will keep on working as expected. Using "unsigned int" instead would actually break these use cases.

This patch takes the first step, changing these to return unsigned long (and making drm_gem_get_pages() use the new types instead of replacing min()). In the future, we might want more callers of these functions to consistently use "unsigned long".
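The "(long)-folio_nr_pages()" remark is easy to check in isolation; a tiny userspace sketch (plain types standing in for the kernel helpers):

    #include <stdio.h>

    int main(void)
    {
        unsigned long nr = 512;   /* as if folio_nr_pages() returned unsigned long */
        unsigned int nr32 = 512;  /* as if it returned unsigned int instead */

        printf("%ld\n", (long)-nr);    /* -512 on 64-bit targets: existing callers keep working */
        printf("%ld\n", (long)-nr32);  /* 4294966784 on 64-bit: the breakage being avoided */
        return 0;
    }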
Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org/ Link: https://lkml.kernel.org/r/20250826153721.GA23292@cathedrallabs.org Link: https://lore.kernel.org/linux-mm/20250503182858.5a02729fcffd6d4723afcfc2@linux-foundation.org/ [1] Signed-off-by: Aristeu Rozanski Suggested-by: Andrew Morton Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Simona Vetter Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- drivers/gpu/drm/drm_gem.c | 4 ++-- include/linux/mm.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index 6a44351e58b7..5dbcf91ff73c 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -621,7 +621,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) struct page **pages; struct folio *folio; struct folio_batch fbatch; - long i, j, npages; + unsigned long i, j, npages; if (WARN_ON(!obj->filp)) return ERR_PTR(-EINVAL); @@ -645,7 +645,7 @@ struct page **drm_gem_get_pages(struct drm_gem_object *obj) i = 0; while (i < npages) { - long nr; + unsigned long nr; folio = shmem_read_folio_gfp(mapping, i, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) diff --git a/include/linux/mm.h b/include/linux/mm.h index da6e0abad2cb..8f5b4df9b166 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1018,12 +1018,12 @@ static inline unsigned int folio_large_order(const struct folio *folio) } #ifdef NR_PAGES_IN_LARGE_FOLIO -static inline long folio_large_nr_pages(const struct folio *folio) +static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return folio->_nr_pages; } #else -static inline long folio_large_nr_pages(const struct folio *folio) +static inline unsigned long folio_large_nr_pages(const struct folio *folio) { return 1L << folio_large_order(folio); } @@ -2062,7 +2062,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone, * * Return: A positive power of two. */ -static inline long folio_nr_pages(const struct folio *folio) +static inline unsigned long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; @@ -2097,7 +2097,7 @@ static inline long folio_nr_pages(const struct folio *folio) * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ -static inline long compound_nr(const struct page *page) +static inline unsigned long compound_nr(const struct page *page) { const struct folio *folio = (struct folio *)page; From b33939383b6439cb90cebf8df301cd2cbed0b980 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 8 Sep 2025 15:24:54 -0700 Subject: [PATCH 308/372] mm: lru_add_drain_all() do local lru_add_drain() first No numbers to back this up, but it seemed obvious to me, that if there are competing lru_add_drain_all()ers, the work will be minimized if each flushes its own local queues before locking and doing cross-CPU drains. 
Link: https://lkml.kernel.org/r/33389bf8-f79d-d4dd-b7a4-680c4aa21b23@google.com Signed-off-by: Hugh Dickins Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: Chris Li Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: Johannes Weiner Cc: John Hubbard Cc: Keir Fraser Cc: Konstantin Khlebnikov Cc: Li Zhe Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Rik van Riel Cc: Shivank Garg Cc: Vlastimil Babka Cc: Wei Xu Cc: Will Deacon Cc: yangge Cc: Yuanchu Xie Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/swap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/swap.c b/mm/swap.c index b8cea6a1b86f..2260dcd2775e 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -834,6 +834,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus) */ this_gen = smp_load_acquire(&lru_drain_gen); + /* It helps everyone if we do our own local drain immediately. */ + lru_add_drain(); + mutex_lock(&lock); /* From 3a37469e5ac004905e4125bf60b43cc5216e83dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Sep 2025 15:29:17 +0100 Subject: [PATCH 309/372] mm: constify compound_order() and page_size() Patch series "Small cleanups". These small cleanups can be applied now to reduce conflicts during the next merge window. They're all from various efforts to split struct page from other memdescs. Thanks to Vlastimil for the suggestion. This patch (of 3): These functions do not modify their arguments. Telling the compiler this may improve code generation, and allows us to pass const arguments from other functions. Link: https://lkml.kernel.org/r/20250910142923.2465470-1-willy@infradead.org Link: https://lkml.kernel.org/r/20250910142923.2465470-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f5b4df9b166..fcb1e72eea40 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1036,9 +1036,9 @@ static inline unsigned long folio_large_nr_pages(const struct folio *folio) * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ -static inline unsigned int compound_order(struct page *page) +static inline unsigned int compound_order(const struct page *page) { - struct folio *folio = (struct folio *)page; + const struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags.f)) return 0; @@ -1256,7 +1256,7 @@ int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ -static inline unsigned long page_size(struct page *page) +static inline unsigned long page_size(const struct page *page) { return PAGE_SIZE << compound_order(page); } From 6fd893a40e3c990ea4ca3a9c084d1ddc3020d936 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Sep 2025 15:29:18 +0100 Subject: [PATCH 310/372] mm: remove redundant test in validate_page_before_insert() The page_has_type() call would have included slab since commit 46df8e73a4a3 and now we don't even get that far because slab pages have a zero refcount since commit 9aec2fb0fd5e. 
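For instance, a toy caller (not from the tree) that only holds a const pointer can now use the helper without a cast:

    static unsigned long nr_bytes(const struct page *page)
    {
        /* compiles without a cast once page_size() takes a const pointer */
        return page_size(page);
    }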
Link: https://lkml.kernel.org/r/20250910142923.2465470-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 41e641823558..17cebb97beae 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2266,8 +2266,7 @@ static int validate_page_before_insert(struct vm_area_struct *vma, return -EINVAL; return 0; } - if (folio_test_anon(folio) || folio_test_slab(folio) || - page_has_type(page)) + if (folio_test_anon(folio) || page_has_type(page)) return -EINVAL; flush_dcache_folio(folio); return 0; From 9d003dec972563efb8ce14c9962af3652d0e201d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 10 Sep 2025 15:29:19 +0100 Subject: [PATCH 311/372] mm: remove page->order We already use page->private for storing the order of a page while it's in the buddy allocator system; extend that to also storing the order while it's in the pcp_llist. Link: https://lkml.kernel.org/r/20250910142923.2465470-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Alexei Starovoitov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 8 +++----- mm/page_alloc.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f048dc80646e..6920c816f6c6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -97,10 +97,7 @@ struct page { /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; - struct { - struct llist_node pcp_llist; - unsigned int order; - }; + struct llist_node pcp_llist; }; struct address_space *mapping; union { @@ -111,7 +108,8 @@ struct page { * @private: Mapping-private opaque data. * Usually used for buffer_heads if PagePrivate. * Used for swp_entry_t if swapcache flag set. - * Indicates order in the buddy system if PageBuddy. + * Indicates order in the buddy system if PageBuddy + * or on pcp_llist. */ unsigned long private; }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cf38d499e045..2bfab96c207f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1520,7 +1520,7 @@ static void add_page_to_zone_llist(struct zone *zone, struct page *page, unsigned int order) { /* Remember the order */ - page->order = order; + page->private = order; /* Add the page to the free list */ llist_add(&page->pcp_llist, &zone->trylock_free_pages); } @@ -1549,7 +1549,7 @@ static void free_one_page(struct zone *zone, struct page *page, llnode = llist_del_all(llhead); llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { - unsigned int p_order = p->order; + unsigned int p_order = p->private; split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); __count_vm_events(PGFREE, 1 << p_order); From a5883fa94295f1ef2473eadd84cc1e24dab9ae18 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 10 Sep 2025 11:30:51 +0200 Subject: [PATCH 312/372] selftests/mm: gup_tests: option to GUP all pages in a single call We recently missed detecting an issue during early testing because the default (!all) tests would not trigger it and even when running "all" tests it only would happen sometimes because of races. So let's allow for an easy way to specify "GUP all pages in a single call", extend the test matrix and extend our default (!all) tests. 
By GUP'ing all pages in a single call, with the default size of 128MiB we'll cover multiple leaf page tables / PMDs on architectures with sane THP sizes. Link: https://lkml.kernel.org/r/20250910093051.1693097-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Jason Gunthorpe Cc: John Hubbard Cc: Peter Xu Cc: Lorenzo Stoakes Cc: "Liam R. Howlett" Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_test.c | 2 ++ tools/testing/selftests/mm/run_vmtests.sh | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index bdeaac67ff9a..8900b840c17a 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -139,6 +139,8 @@ int main(int argc, char **argv) break; case 'n': nr_pages = atoi(optarg); + if (nr_pages < 0) + nr_pages = size / psize(); break; case 't': thp = 1; diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 8115fc4526ed..d9173f2312b7 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -138,7 +138,7 @@ run_gup_matrix() { # -n: How many pages to fetch together? 512 is special # because it's default thp size (or 2M on x86), 123 to # just test partial gup when hit a huge in whatever form - for num in "-n 1" "-n 512" "-n 123"; do + for num in "-n 1" "-n 512" "-n 123" "-n -1"; do CATEGORY="gup_test" run_test ./gup_test \ $huge $test_cmd $write $share $num done @@ -314,9 +314,11 @@ if $RUN_ALL; then run_gup_matrix else # get_user_pages_fast() benchmark - CATEGORY="gup_test" run_test ./gup_test -u + CATEGORY="gup_test" run_test ./gup_test -u -n 1 + CATEGORY="gup_test" run_test ./gup_test -u -n -1 # pin_user_pages_fast() benchmark - CATEGORY="gup_test" run_test ./gup_test -a + CATEGORY="gup_test" run_test ./gup_test -a -n 1 + CATEGORY="gup_test" run_test ./gup_test -a -n -1 fi # Dump pages 0, 19, and 4096, using pin_user_pages: CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 From 2e0fe9245d6bcfc689961d58b43216adaece3182 Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Wed, 10 Sep 2025 19:32:20 +0800 Subject: [PATCH 313/372] mm/damon/lru_sort: support addr_unit for DAMON_LRU_SORT Patch series "mm/damon: add addr_unit for DAMON_LRU_SORT and DAMON_RECLAIM". In DAMON_LRU_SORT and DAMON_RECLAIM, damon_ctx is independent of the core. Add addr_unit to these modules to support systems like ARM32 with LPAE. This patch (of 2): Implement a sysfs file to expose addr_unit for DAMON_LRU_SORT users. During parameter application, use the configured addr_unit parameter to perform the necessary initialization. Similar to the core layer, prevent setting addr_unit to zero. It is worth noting that when monitor_region_start and monitor_region_end are unset (i.e., 0), their values will later be set to biggest_system_ram. At that point, addr_unit may not be the default value 1. Although we could divide the biggest_system_ram value by addr_unit, changing addr_unit without setting monitor_region_start/end should be considered a user misoperation. And biggest_system_ram is only within the 0~ULONG_MAX range, system can clearly work correctly with addr_unit=1. Therefore, if monitor_region_start/end are unset, always silently reset addr_unit to 1. 
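As a concrete scaling example (the numbers are illustrative): with addr_unit set to 4096, a monitoring address of 0x100000 denotes the ops address 0x100000 * 4096 = 0x100000000, so a 32-bit unsigned long can still describe a beyond-4GiB LPAE physical range, and min_sz_region accordingly becomes max(DAMON_MIN_REGION / 4096, 1), as in the hunk below.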
Link: https://lkml.kernel.org/r/20250910113221.1065764-1-yanquanmin1@huawei.com Link: https://lkml.kernel.org/r/20250910113221.1065764-2-yanquanmin1@huawei.com Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index b5a5ed16a7a5..14d31009c09e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -111,6 +111,13 @@ module_param(monitor_region_start, ulong, 0600); static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); +/* + * Scale factor for DAMON_LRU_SORT to ops address conversion. + * + * This parameter must not be set to 0. + */ +static unsigned long addr_unit __read_mostly = 1; + /* * PID of the DAMON thread * @@ -198,6 +205,15 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; + /* + * If monitor_region_start/end are unset, always silently + * reset addr_unit to 1. + */ + if (!monitor_region_start && !monitor_region_end) + addr_unit = 1; + param_ctx->addr_unit = addr_unit; + param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1); + if (!damon_lru_sort_mon_attrs.sample_interval) { err = -EINVAL; goto out; @@ -290,6 +306,30 @@ static int damon_lru_sort_turn(bool on) return damon_call(ctx, &call_control); } +static int damon_lru_sort_addr_unit_store(const char *val, + const struct kernel_param *kp) +{ + unsigned long input_addr_unit; + int err = kstrtoul(val, 0, &input_addr_unit); + + if (err) + return err; + if (!input_addr_unit) + return -EINVAL; + + addr_unit = input_addr_unit; + return 0; +} + +static const struct kernel_param_ops addr_unit_param_ops = { + .set = damon_lru_sort_addr_unit_store, + .get = param_get_ulong, +}; + +module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600); +MODULE_PARM_DESC(addr_unit, + "Scale factor for DAMON_LRU_SORT to ops address conversion (default: 1)"); + static int damon_lru_sort_enabled_store(const char *val, const struct kernel_param *kp) { From 7db551fcfb2aa8a20740406d41b48fd4365fd17f Mon Sep 17 00:00:00 2001 From: Quanmin Yan Date: Wed, 10 Sep 2025 19:32:21 +0800 Subject: [PATCH 314/372] mm/damon/reclaim: support addr_unit for DAMON_RECLAIM Implement a sysfs file to expose addr_unit for DAMON_RECLAIM users. During parameter application, use the configured addr_unit parameter to perform the necessary initialization. Similar to the core layer, prevent setting addr_unit to zero. It is worth noting that when monitor_region_start and monitor_region_end are unset (i.e., 0), their values will later be set to biggest_system_ram. At that point, addr_unit may not be the default value 1. Although we could divide the biggest_system_ram value by addr_unit, changing addr_unit without setting monitor_region_start/end should be considered a user misoperation. And biggest_system_ram is only within the 0~ULONG_MAX range, system can clearly work correctly with addr_unit=1. Therefore, if monitor_region_start/end are unset, always silently reset addr_unit to 1. 
Link: https://lkml.kernel.org/r/20250910113221.1065764-3-yanquanmin1@huawei.com Signed-off-by: Quanmin Yan Reviewed-by: SeongJae Park Cc: Kefeng Wang Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index fb7c982a0018..590f9d6c55ef 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -128,6 +128,13 @@ module_param(monitor_region_start, ulong, 0600); static unsigned long monitor_region_end __read_mostly; module_param(monitor_region_end, ulong, 0600); +/* + * Scale factor for DAMON_RECLAIM to ops address conversion. + * + * This parameter must not be set to 0. + */ +static unsigned long addr_unit __read_mostly = 1; + /* * Skip anonymous pages reclamation. * @@ -194,6 +201,15 @@ static int damon_reclaim_apply_parameters(void) if (err) return err; + /* + * If monitor_region_start/end are unset, always silently + * reset addr_unit to 1. + */ + if (!monitor_region_start && !monitor_region_end) + addr_unit = 1; + param_ctx->addr_unit = addr_unit; + param_ctx->min_sz_region = max(DAMON_MIN_REGION / addr_unit, 1); + if (!damon_reclaim_mon_attrs.aggr_interval) { err = -EINVAL; goto out; @@ -294,6 +310,30 @@ static int damon_reclaim_turn(bool on) return damon_call(ctx, &call_control); } +static int damon_reclaim_addr_unit_store(const char *val, + const struct kernel_param *kp) +{ + unsigned long input_addr_unit; + int err = kstrtoul(val, 0, &input_addr_unit); + + if (err) + return err; + if (!input_addr_unit) + return -EINVAL; + + addr_unit = input_addr_unit; + return 0; +} + +static const struct kernel_param_ops addr_unit_param_ops = { + .set = damon_reclaim_addr_unit_store, + .get = param_get_ulong, +}; + +module_param_cb(addr_unit, &addr_unit_param_ops, &addr_unit, 0600); +MODULE_PARM_DESC(addr_unit, + "Scale factor for DAMON_RECLAIM to ops address conversion (default: 1)"); + static int damon_reclaim_enabled_store(const char *val, const struct kernel_param *kp) { From 5ea8ab7f93139c63626830b48b36aab4e62b3f8f Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 12 Sep 2025 17:51:01 +0500 Subject: [PATCH 315/372] selftests/mm: centralize the __always_unused macro This macro gets used in different tests. Add it to kselftest.h which is central location and tests use this header. Then use this new macro. Link: https://lkml.kernel.org/r/20250912125102.1309796-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Acked-by: David Hildenbrand Cc: Antonio Quartulli Cc: David S. 
Miller Cc: Eric Dumazet Cc: Jakub Kacinski Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Paolo Abeni Cc: "Sabrina Dubroca" Cc: Shuah Khan Cc: Simon Horman Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/kselftest.h | 4 ++++ tools/testing/selftests/mm/protection_keys.c | 2 +- tools/testing/selftests/net/ovpn/ovpn-cli.c | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 661d31c4b558..274480e3573a 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -92,6 +92,10 @@ #endif #define __printf(a, b) __attribute__((format(printf, a, b))) +#ifndef __always_unused +#define __always_unused __attribute__((__unused__)) +#endif + #ifndef __maybe_unused #define __maybe_unused __attribute__((__unused__)) #endif diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index 23ebec367015..c02fb204547e 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -1304,7 +1304,7 @@ static void test_mprotect_with_pkey_0(int *ptr, u16 pkey) static void test_ptrace_of_child(int *ptr, u16 pkey) { - __attribute__((__unused__)) int peek_result; + __always_unused int peek_result; pid_t child_pid; void *ignored = 0; long ret; diff --git a/tools/testing/selftests/net/ovpn/ovpn-cli.c b/tools/testing/selftests/net/ovpn/ovpn-cli.c index 9201f2905f2c..688a5fa6fdac 100644 --- a/tools/testing/selftests/net/ovpn/ovpn-cli.c +++ b/tools/testing/selftests/net/ovpn/ovpn-cli.c @@ -32,9 +32,10 @@ #include +#include "../../kselftest.h" + /* defines to make checkpatch happy */ #define strscpy strncpy -#define __always_unused __attribute__((__unused__)) /* libnl < 3.5.0 does not set the NLA_F_NESTED on its own, therefore we * have to explicitly do it to prevent the kernel from failing upon From eea5706cb04216214bcd269a27afbc479c71bceb Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Fri, 12 Sep 2025 14:30:21 +0200 Subject: [PATCH 316/372] resource: improve child resource handling in release_mem_region_adjustable() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When memory block is removed via try_remove_memory(), it eventually reaches release_mem_region_adjustable(). The current implementation assumes that when a busy memory resource is split into two, all child resources remain in the lower address range. This simplification causes problems when child resources actually belong to the upper split. 
For example: * Initial memory layout: lsmem RANGE SIZE STATE REMOVABLE BLOCK 0x0000000000000000-0x00000002ffffffff 12G online yes 0-95 * /proc/iomem 00000000-2dfefffff : System RAM 158834000-1597b3fff : Kernel code 1597b4000-159f50fff : Kernel data 15a13c000-15a218fff : Kernel bss 2dff00000-2ffefffff : Crash kernel 2fff00000-2ffffffff : System RAM * After offlining and removing range 0x150000000-0x157ffffff lsmem -o RANGE,SIZE,STATE,BLOCK,CONFIGURED (output according to upcoming lsmem changes with the configured column: s390) RANGE SIZE STATE BLOCK CONFIGURED 0x0000000000000000-0x000000014fffffff 5.3G online 0-41 yes 0x0000000150000000-0x0000000157ffffff 128M offline 42 no 0x0000000158000000-0x00000002ffffffff 6.6G online 43-95 yes The iomem resource gets split into two entries, but kernel code, kernel data, and kernel bss remain attached to the lower resource [0–5376M] instead of the correct upper resource [5504M–12288M]. As a result, WARN_ON() triggers in release_mem_region_adjustable() ("Usecase: split into two entries - we need a new resource") ------------[ cut here ]------------ WARNING: CPU: 5 PID: 858 at kernel/resource.c:1486 release_mem_region_adjustable+0x210/0x280 Modules linked in: CPU: 5 UID: 0 PID: 858 Comm: chmem Not tainted 6.17.0-rc2-11707-g2c36aaf3ba4e Hardware name: IBM 3906 M04 704 (z/VM 7.3.0) Krnl PSW : 0704d00180000000 0000024ec0dae0e4 (release_mem_region_adjustable+0x214/0x280) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3 Krnl GPRS: 0000000000000000 00000002ffffafc0 fffffffffffffff0 0000000000000000 000000014fffffff 0000024ec2257608 0000000000000000 0000024ec2301758 0000024ec22680d0 00000000902c9140 0000000150000000 00000002ffffafc0 000003ffa61d8d18 0000024ec21fb478 0000024ec0dae014 000001cec194fbb0 Krnl Code: 0000024ec0dae0d8: af000000 mc 0,0 0000024ec0dae0dc: a7f4ffc1 brc 15,0000024ec0dae05e #0000024ec0dae0e0: af000000 mc 0,0 >0000024ec0dae0e4: a5defffd llilh %r13,65533 0000024ec0dae0e8: c04000c6064c larl %r4,0000024ec266ed80 0000024ec0dae0ee: eb1d400000f8 laa %r1,%r13,0(%r4) 0000024ec0dae0f4: 07e0 bcr 14,%r0 0000024ec0dae0f6: a7f4ffc0 brc 15,0000024ec0dae076 [<0000024ec0dae0e4>] release_mem_region_adjustable+0x214/0x280 ([<0000024ec0dadf3c>] release_mem_region_adjustable+0x6c/0x280) [<0000024ec10a2130>] try_remove_memory+0x100/0x140 [<0000024ec10a4052>] __remove_memory+0x22/0x40 [<0000024ec18890f6>] config_mblock_store+0x326/0x3e0 [<0000024ec11f7056>] kernfs_fop_write_iter+0x136/0x210 [<0000024ec1121e86>] vfs_write+0x236/0x3c0 [<0000024ec11221b8>] ksys_write+0x78/0x110 [<0000024ec1b6bfbe>] __do_syscall+0x12e/0x350 [<0000024ec1b782ce>] system_call+0x6e/0x90 Last Breaking-Event-Address: [<0000024ec0dae014>] release_mem_region_adjustable+0x144/0x280 ---[ end trace 0000000000000000 ]--- Also, resource adjustment doesn't happen and stale resources still cover [0-12288M]. Later, memory re-add fails in register_memory_resource() with -EBUSY. i.e: /proc/iomem is still: 00000000-2dfefffff : System RAM 158834000-1597b3fff : Kernel code 1597b4000-159f50fff : Kernel data 15a13c000-15a218fff : Kernel bss 2dff00000-2ffefffff : Crash kernel 2fff00000-2ffffffff : System RAM Enhance release_mem_region_adjustable() to reassign child resources to the correct parent after a split. Children are now assigned based on their actual range: If they fall within the lower split, keep them in the lower parent. If they fall within the upper split, move them to the upper parent. 
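To make the rule concrete with the example layout above: releasing 0x150000000-0x157ffffff splits the busy System RAM resource 00000000-2dfefffff into a lower entry ending at 0x14fffffff and an upper entry starting at 0x158000000. Children such as the Kernel code region at 0x158834000 start above the split address, so they are unlinked from the lower entry and appended to the children of the new upper entry, while children starting at or below the split address stay where they are.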
Kernel code/data/bss regions are not offlined, so they will always reside entirely within one parent and never span across both. Output after the enhancement: * Initial state /proc/iomem (before removal of memory block): 00000000-2dfefffff : System RAM 1f94f8000-1fa477fff : Kernel code 1fa478000-1fac14fff : Kernel data 1fae00000-1faedcfff : Kernel bss 2dff00000-2ffefffff : Crash kernel 2fff00000-2ffffffff : System RAM * Offline and remove 0x1e8000000-0x1efffffff memory range * /proc/iomem 00000000-1e7ffffff : System RAM 1f0000000-2dfefffff : System RAM 1f94f8000-1fa477fff : Kernel code 1fa478000-1fac14fff : Kernel data 1fae00000-1faedcfff : Kernel bss 2dff00000-2ffefffff : Crash kernel 2fff00000-2ffffffff : System RAM Link: https://lkml.kernel.org/r/20250912123021.3219980-1-sumanthk@linux.ibm.com Signed-off-by: Sumanth Korikkar Cc: Alexander Gordeev Cc: Andriy Shevchenko Cc: Dan Williams Cc: David Hildenbrand Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Mike Rapoport Signed-off-by: Andrew Morton --- kernel/resource.c | 50 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/kernel/resource.c b/kernel/resource.c index f9bb5481501a..b9fa2a4ce089 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1388,6 +1388,47 @@ void __release_region(struct resource *parent, resource_size_t start, EXPORT_SYMBOL(__release_region); #ifdef CONFIG_MEMORY_HOTREMOVE +static void append_child_to_parent(struct resource *new_parent, struct resource *new_child) +{ + struct resource *child; + + child = new_parent->child; + if (child) { + while (child->sibling) + child = child->sibling; + child->sibling = new_child; + } else { + new_parent->child = new_child; + } + new_child->parent = new_parent; + new_child->sibling = NULL; +} + +/* + * Reparent all child resources that no longer belong to "low" after a split to + * "high". Note that "high" does not have any children, because "low" is the + * original resource and "high" is a new resource. Treat "low" as the original + * resource being split and defer its range adjustment to __adjust_resource(). + */ +static void reparent_children_after_split(struct resource *low, + struct resource *high, + resource_size_t split_addr) +{ + struct resource *child, *next, **p; + + p = &low->child; + while ((child = *p)) { + next = child->sibling; + if (child->start > split_addr) { + /* unlink child */ + *p = next; + append_child_to_parent(high, child); + } else { + p = &child->sibling; + } + } +} + /** * release_mem_region_adjustable - release a previously reserved memory region * @start: resource start address @@ -1397,15 +1438,13 @@ EXPORT_SYMBOL(__release_region); * is released from a currently busy memory resource. The requested region * must either match exactly or fit into a single busy resource entry. In * the latter case, the remaining resource is adjusted accordingly. - * Existing children of the busy memory resource must be immutable in the - * request. * * Note: * - Additional release conditions, such as overlapping region, can be * supported after they are confirmed as valid cases. - * - When a busy memory resource gets split into two entries, the code - * assumes that all children remain in the lower address entry for - * simplicity. Enhance this logic when necessary. + * - When a busy memory resource gets split into two entries, its children are + * reassigned to the correct parent based on their range. 
If a child memory + * resource overlaps with more than one parent, enhance the logic as needed. */ void release_mem_region_adjustable(resource_size_t start, resource_size_t size) { @@ -1482,6 +1521,7 @@ retry: new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; + reparent_children_after_split(res, new_res, end); if (WARN_ON_ONCE(__adjust_resource(res, res->start, start - res->start))) From 3d5022a0f82442e03f84e17a134c7ad8b14d6628 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 12 Sep 2025 17:30:21 +0500 Subject: [PATCH 317/372] selftests/mm: add -Wunreachable-code and fix warnings Patch series "selftests/mm: Add -Wunreachable-code and fix warnings". Add -Wunreachable-code to selftests and remove dead code from generated warnings. This patch (of 2): Enable -Wunreachable-code flag to catch dead code and fix them. 1. Remove the dead code and write a comment instead: hmm-tests.c:2033:3: warning: code will never be executed [-Wunreachable-code] perror("Should not reach this\n"); ^~~~~~ 2. ksft_exit_fail_msg() calls exit(). So cleanup isn't done. Replace it with ksft_print_msg(). split_huge_page_test.c:301:3: warning: code will never be executed [-Wunreachable-code] goto cleanup; ^~~~~~~~~~~~ 3. Remove duplicate inline. pkey_sighandler_tests.c:44:15: warning: duplicate 'inline' declaration specifier [-Wduplicate-decl-specifier] static inline __always_inline Link: https://lkml.kernel.org/r/20250912123025.1271051-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20250912123025.1271051-2-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Reviewed-by: Sidhartha Kumar Reviewed-by: Kevin Brodsky Acked-by: David Hildenbrand Reviewed-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: Dev Jain Cc: Jason Gunthorpe Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/hmm-tests.c | 5 ++--- tools/testing/selftests/mm/pkey_sighandler_tests.c | 2 +- tools/testing/selftests/mm/split_huge_page_test.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 5a1dee50b898..eaf9312097f7 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -34,6 +34,7 @@ endif MAKEFLAGS += --no-builtin-rules CFLAGS = -Wall -O2 -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) $(TOOLS_INCLUDES) +CFLAGS += -Wunreachable-code LDLIBS = -lrt -lpthread -lm # Some distributions (such as Ubuntu) configure GCC so that _FORTIFY_SOURCE is diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 141bf63cbe05..15aadaf24a66 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -2027,11 +2027,10 @@ TEST_F(hmm, hmm_cow_in_device) if (pid == -1) ASSERT_EQ(pid, 0); if (!pid) { - /* Child process waitd for SIGTERM from the parent. */ + /* Child process waits for SIGTERM from the parent. */ while (1) { } - perror("Should not reach this\n"); - exit(0); + /* Should not reach this */ } /* Parent process writes to COW pages(s) and gets a * new copy in system. 
In case of device private pages, diff --git a/tools/testing/selftests/mm/pkey_sighandler_tests.c b/tools/testing/selftests/mm/pkey_sighandler_tests.c index b5e076a564c9..302fef54049c 100644 --- a/tools/testing/selftests/mm/pkey_sighandler_tests.c +++ b/tools/testing/selftests/mm/pkey_sighandler_tests.c @@ -41,7 +41,7 @@ static siginfo_t siginfo = {0}; * syscall will attempt to access the PLT in order to call a library function * which is protected by MPK 0 which we don't have access to. */ -static inline __always_inline +static __always_inline long syscall_raw(long n, long a1, long a2, long a3, long a4, long a5, long a6) { unsigned long ret; diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index 7731191cc8e9..743af3c05190 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -510,7 +510,7 @@ static void split_file_backed_thp(int order) status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); if (status >= INPUT_MAX) { - ksft_exit_fail_msg("Fail to create file-backed THP split testing file\n"); + ksft_print_msg("Fail to create file-backed THP split testing file\n"); goto cleanup; } From e75f15fb69610cf90e9dfa0cbe436ae9342b9b79 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 12 Sep 2025 17:30:22 +0500 Subject: [PATCH 318/372] selftests/mm: protection_keys: fix dead code The while loop doesn't execute and following warning gets generated: protection_keys.c:561:15: warning: code will never be executed [-Wunreachable-code] int rpkey = alloc_random_pkey(); Let's enable the while loop such that it gets executed nr_iterations times. Simplify the code a bit as well. Link: https://lkml.kernel.org/r/20250912123025.1271051-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Reviewed-by: Sidhartha Kumar Reviewed-by: Zi Yan Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Dev Jain Cc: Jason Gunthorpe Cc: Kevin Brodsky Cc: Lance Yang Cc: Leon Romanovsky Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/protection_keys.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/protection_keys.c b/tools/testing/selftests/mm/protection_keys.c index c02fb204547e..2085982dba69 100644 --- a/tools/testing/selftests/mm/protection_keys.c +++ b/tools/testing/selftests/mm/protection_keys.c @@ -557,13 +557,11 @@ int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, int nr_iterations = random() % 100; int ret; - while (0) { + while (nr_iterations-- >= 0) { int rpkey = alloc_random_pkey(); ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", ptr, size, orig_prot, pkey, ret); - if (nr_iterations-- < 0) - break; dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" " shadow: 0x%016llx\n", From e18190b7e97e9db6546390e6e0ceddae606892b2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:15:49 -0700 Subject: [PATCH 319/372] mm/damon/lru_sort: use param_ctx for damon_attrs staging damon_lru_sort_apply_parameters() allocates a new DAMON context, stages user-specified DAMON parameters on it, and commits to running DAMON context at once, using damon_commit_ctx(). 
The code is, however, directly updating the monitoring attributes of the running context. And the attributes are over-written by later damon_commit_ctx() call. This means that the monitoring attributes parameters are not really working. Fix the wrong use of the parameter context. Link: https://lkml.kernel.org/r/20250916031549.115326-1-sj@kernel.org Fixes: a30969436428 ("mm/damon/lru_sort: use damon_commit_ctx()") Signed-off-by: SeongJae Park Reviewed-by: Joshua Hahn Cc: Joshua Hahn Cc: [6.11+] Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 14d31009c09e..ab6173a646bd 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -219,7 +219,7 @@ static int damon_lru_sort_apply_parameters(void) goto out; } - err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); + err = damon_set_attrs(param_ctx, &damon_lru_sort_mon_attrs); if (err) goto out; From d02ac836e4d6bdfd7d44927d01a4cd048ad4aba8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 13 Sep 2025 17:03:39 -0700 Subject: [PATCH 320/372] include/linux/pgtable.h: convert arch_enter_lazy_mmu_mode() and friends to static inlines For all the usual reasons, plus a new one. Calling (void)arch_enter_lazy_mmu_mode(); deservedly blows up. Cc: Balbir Singh Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 94249e671a7e..32e8457ad535 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -232,9 +232,9 @@ static inline int pmd_dirty(pmd_t pmd) * and the mode cannot be used in interrupt context. */ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) +static inline void arch_enter_lazy_mmu_mode(void) {} +static inline void arch_leave_lazy_mmu_mode(void) {} +static inline void arch_flush_lazy_mmu_mode(void) {} #endif #ifndef pte_batch_hint From 59d4d36158ba3cdbce141d8e9261eea154d4c441 Mon Sep 17 00:00:00 2001 From: zhongjinji Date: Tue, 16 Sep 2025 00:29:45 +0800 Subject: [PATCH 321/372] mm/oom_kill: thaw the entire OOM victim process Patch series "Improvements to Victim Process Thawing and OOM Reaper Traversal Order", v10. This patch series focuses on optimizing victim process thawing and refining the traversal order of the OOM reaper. Since __thaw_task() is used to thaw a single thread of the victim, thawing only one thread cannot guarantee the exit of the OOM victim when it is frozen. Patch 1 thaw the entire process of the OOM victim to ensure that OOM victims are able to terminate themselves. Even if the oom_reaper is delayed, patch 2 is still beneficial for reaping processes with a large address space footprint, and it also greatly improves process_mrelease. This patch (of 10): OOM killer is a mechanism that selects and kills processes when the system runs out of memory to reclaim resources and keep the system stable. But the oom victim cannot terminate on its own when it is frozen, even if the OOM victim task is thawed through __thaw_task(). This is because __thaw_task() can only thaw a single OOM victim thread, and cannot thaw the entire OOM victim process. In addition, freezing_slow_path() determines whether a task is an OOM victim by checking the task's TIF_MEMDIE flag. 
When a task is identified as an OOM victim, the freezer bypasses both PM freezing and cgroup freezing states to thaw it. Historically, TIF_MEMDIE was a "this is the oom victim & it has access to memory reserves" flag. It had thread vs. process problems, and tsk_is_oom_victim() was introduced later to get rid of them and other issues, as well as to guarantee that the oom victim's mm can be identified reliably for the oom_reaper. Therefore, thaw_process() is introduced to unfreeze all threads within the OOM victim process, ensuring that every thread is properly thawed. The freezer now uses tsk_is_oom_victim() to determine OOM victim status, allowing all victim threads to be unfrozen as necessary. With this change, the entire OOM victim process will be thawed when an OOM event occurs, ensuring that the victim can terminate on its own. Link: https://lkml.kernel.org/r/20250915162946.5515-1-zhongjinji@honor.com Link: https://lkml.kernel.org/r/20250915162946.5515-2-zhongjinji@honor.com Signed-off-by: zhongjinji Reviewed-by: Suren Baghdasaryan Acked-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Liam R. Howlett Cc: David Rientjes Cc: Len Brown Cc: Lorenzo Stoakes Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/freezer.h | 2 ++ kernel/freezer.c | 20 +++++++++++++++++++- mm/oom_kill.c | 10 +++++----- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/linux/freezer.h b/include/linux/freezer.h index b303472255be..32884c9721e5 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -47,6 +47,7 @@ extern int freeze_processes(void); extern int freeze_kernel_threads(void); extern void thaw_processes(void); extern void thaw_kernel_threads(void); +extern void thaw_process(struct task_struct *p); static inline bool try_to_freeze(void) { @@ -80,6 +81,7 @@ static inline int freeze_processes(void) { return -ENOSYS; } static inline int freeze_kernel_threads(void) { return -ENOSYS; } static inline void thaw_processes(void) {} static inline void thaw_kernel_threads(void) {} +static inline void thaw_process(struct task_struct *p) {} static inline bool try_to_freeze(void) { return false; } diff --git a/kernel/freezer.c b/kernel/freezer.c index 6a96149aede9..ddc11a8bd2ea 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -10,6 +10,7 @@ #include #include #include +#include #include /* total number of freezing conditions in effect */ @@ -40,7 +41,7 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; - if (test_tsk_thread_flag(p, TIF_MEMDIE)) + if (tsk_is_oom_victim(p)) return false; if (pm_nosig_freezing || cgroup_freezing(p)) @@ -206,6 +207,23 @@ void __thaw_task(struct task_struct *p) wake_up_state(p, TASK_FROZEN); } +/* + * thaw_process - Thaw a frozen process + * @p: the process to be thawed + * + * Iterate over all threads of @p and call __thaw_task() on each. + */ +void thaw_process(struct task_struct *p) +{ + struct task_struct *t; + + rcu_read_lock(); + for_each_thread(p, t) { + __thaw_task(t); + } + rcu_read_unlock(); +} + /** * set_freezable - make %current freezable * diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 58bd4cf71d52..22caef65f1d0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -772,12 +772,12 @@ static void mark_oom_victim(struct task_struct *tsk) mmgrab(tsk->signal->oom_mm); /* - * Make sure that the task is woken up from uninterruptible sleep - * if it is frozen because OOM killer wouldn't be able to free - * any memory and livelock.
freezing_slow_path will tell the freezer - * that TIF_MEMDIE tasks should be ignored. + * Make sure that the process is woken up from uninterruptible sleep + * if it is frozen because OOM killer wouldn't be able to free any + * memory and livelock. The freezer will thaw the tasks that are OOM + * victims regardless of the PM freezing and cgroup freezing states. */ - __thaw_task(tsk); + thaw_process(tsk); atomic_inc(&oom_victims); cred = get_task_cred(tsk); trace_mark_victim(tsk, cred->uid.val); From 5e1953dc71af01fae3d6786e073892ef3eebc3d8 Mon Sep 17 00:00:00 2001 From: zhongjinji Date: Tue, 16 Sep 2025 00:29:46 +0800 Subject: [PATCH 322/372] mm/oom_kill: the OOM reaper traverses the VMA maple tree in reverse order Although the oom_reaper is delayed and it gives the oom victim chance to clean up its address space this might take a while especially for processes with a large address space footprint. In those cases oom_reaper might start racing with the dying task and compete for shared resources - e.g. page table lock contention has been observed. Reduce those races by reaping the oom victim from the other end of the address space. It is also a significant improvement for process_mrelease(). When a process is killed, process_mrelease is used to reap the killed process and often runs concurrently with the dying task. The test data shows that after applying the patch, lock contention is greatly reduced during the procedure of reaping the killed process. The test is conducted on arm64. The following basic perf numbers show that applying this patch significantly reduces pte spin lock contention. Without the patch: |--99.57%-- oom_reaper | |--73.58%-- unmap_page_range | | |--8.67%-- [hit in function] | | |--41.59%-- __pte_offset_map_lock | | |--29.47%-- folio_remove_rmap_ptes | | |--16.11%-- tlb_flush_mmu | |--19.94%-- tlb_finish_mmu | |--3.21%-- folio_remove_rmap_ptes With the patch: |--99.53%-- oom_reaper | |--55.77%-- unmap_page_range | | |--20.49%-- [hit in function] | | |--58.30%-- folio_remove_rmap_ptes | | |--11.48%-- tlb_flush_mmu | | |--3.33%-- folio_mark_accessed | |--32.21%-- tlb_finish_mmu | |--6.93%-- folio_remove_rmap_ptes | |--0.69%-- __pte_offset_map_lock Detailed breakdowns for both scenarios are provided below. The cumulative time for oom_reaper plus exit_mmap(victim) in both cases is also summarized, making the performance improvements clear. +----------------------------------------------------------------+ | Category | Applying patch | Without patch | +-------------------------------+----------------+---------------+ | Total running time | 132.6 | 167.1 | | (exit_mmap + reaper work) | 72.4 + 60.2 | 90.7 + 76.4 | +-------------------------------+----------------+---------------+ | Time waiting for pte spinlock | 1.0 | 33.1 | | (exit_mmap + reaper work) | 0.4 + 0.6 | 10.0 + 23.1 | +-------------------------------+----------------+---------------+ | folio_remove_rmap_ptes time | 42.0 | 41.3 | | (exit_mmap + reaper work) | 18.4 + 23.6 | 22.4 + 18.9 | +----------------------------------------------------------------+ From this report, we can see that: 1. The reduction in total time comes mainly from the decrease in time spent on pte spinlock and other locks. 2. oom_reaper performs more work in some areas, but at the same time, exit_mmap also handles certain tasks more efficiently, such as folio_remove_rmap_ptes. Here is a more detailed perf report. 
[1] Link: https://lkml.kernel.org/r/20250915162946.5515-3-zhongjinji@honor.com Link: https://lore.kernel.org/all/20250915162619.5133-1-zhongjinji@honor.com/ [1] Signed-off-by: zhongjinji Reviewed-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: David Rientjes Cc: Len Brown Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- mm/oom_kill.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 22caef65f1d0..e33087c60f3b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -516,7 +516,7 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; - VMA_ITERATOR(vmi, mm, 0); + MA_STATE(mas, &mm->mm_mt, ULONG_MAX, ULONG_MAX); /* * Tell all users of get_user/copy_from_user etc... that the content @@ -526,7 +526,13 @@ static bool __oom_reap_task_mm(struct mm_struct *mm) */ mm_flags_set(MMF_UNSTABLE, mm); - for_each_vma(vmi, vma) { + /* + * It might start racing with the dying task and compete for shared + * resources - e.g. page table lock contention has been observed. + * Reduce those races by reaping the oom victim from the other end + * of the address space. + */ + mas_for_each_rev(&mas, vma, 0) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; From b9e2f58ffb84afcbba7e66f96ca14f98e0e88f26 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 15 Sep 2025 16:02:24 -0700 Subject: [PATCH 323/372] alloc_tag: mark inaccurate allocation counters in /proc/allocinfo output While rare, memory allocation profiling can contain inaccurate counters if slab object extension vector allocation fails. That allocation might succeed later, but prior to that, slab allocations that would have used that object extension vector will not be accounted for. To indicate incorrect counters, an "accurate:no" marker is appended to the call site line in the /proc/allocinfo output. Bump up the /proc/allocinfo version to reflect the change in the file format and update the documentation.
Example output with invalid counters: allocinfo - version: 2.0 0 0 arch/x86/kernel/kdebugfs.c:105 func:create_setup_data_nodes 0 0 arch/x86/kernel/alternative.c:2090 func:alternatives_smp_module_add 0 0 arch/x86/kernel/alternative.c:127 func:__its_alloc accurate:no 0 0 arch/x86/kernel/fpu/regset.c:160 func:xstateregs_set 0 0 arch/x86/kernel/fpu/xstate.c:1590 func:fpstate_realloc 0 0 arch/x86/kernel/cpu/aperfmperf.c:379 func:arch_enable_hybrid_capacity_scale 0 0 arch/x86/kernel/cpu/amd_cache_disable.c:258 func:init_amd_l3_attrs 49152 48 arch/x86/kernel/cpu/mce/core.c:2709 func:mce_device_create accurate:no 32768 1 arch/x86/kernel/cpu/mce/genpool.c:132 func:mce_gen_pool_create 0 0 arch/x86/kernel/cpu/mce/amd.c:1341 func:mce_threshold_create_device [surenb@google.com: document new "accurate:no" marker] Fixes: 39d117e04d15 ("alloc_tag: mark inaccurate allocation counters in /proc/allocinfo output") [akpm@linux-foundation.org: simplification per Usama, reflow text] [akpm@linux-foundation.org: add newline to prevent docs warning, per Randy] Link: https://lkml.kernel.org/r/20250915230224.4115531-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Usama Arif Acked-by: Johannes Weiner Cc: David Rientjes Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Roman Gushchin Cc: Sourav Panda Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 13 +++++++++++++ include/linux/alloc_tag.h | 12 ++++++++++++ include/linux/codetag.h | 5 ++++- lib/alloc_tag.c | 4 +++- mm/slub.c | 2 ++ 5 files changed, 34 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 915a3e44bc12..6ffd04c736db 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1009,6 +1009,19 @@ number, module (if originates from a loadable module) and the function calling the allocation. The number of bytes allocated and number of calls at each location are reported. The first line indicates the version of the file, the second line is the header listing fields in the file. +If file version is 2.0 or higher then each line may contain additional +key:value pairs representing extra information about the call site. +For example if the counters are not accurate, the line will be appended with an +"accurate:no" pair. + +Supported markers in v2: +accurate:no + + Absolute values of the counters in this line are not accurate + because of the failure to allocate memory to track some of the + allocations made at this location. Deltas in these counters are + accurate, therefore counters can be used to track allocation size + and count changes. Example output.
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 9ef2633e2c08..d40ac39bfbe8 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -221,6 +221,16 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) ref->ct = NULL; } +static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) +{ + tag->ct.flags |= CODETAG_FLAG_INACCURATE; +} + +static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) +{ + return !!(tag->ct.flags & CODETAG_FLAG_INACCURATE); +} + #define alloc_tag_record(p) ((p) = current->alloc_tag) #else /* CONFIG_MEM_ALLOC_PROFILING */ @@ -230,6 +240,8 @@ static inline bool mem_alloc_profiling_enabled(void) { return false; } static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) {} static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} +static inline void alloc_tag_set_inaccurate(struct alloc_tag *tag) {} +static inline bool alloc_tag_is_inaccurate(struct alloc_tag *tag) { return false; } #define alloc_tag_record(p) do {} while (0) #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/include/linux/codetag.h b/include/linux/codetag.h index 457ed8fd3214..8ea2a5f7c98a 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -16,13 +16,16 @@ struct module; #define CODETAG_SECTION_START_PREFIX "__start_" #define CODETAG_SECTION_STOP_PREFIX "__stop_" +/* codetag flags */ +#define CODETAG_FLAG_INACCURATE (1 << 0) + /* * An instance of this structure is created in a special ELF section at every * code location being tagged. At runtime, the special section is treated as * an array of these. */ struct codetag { - unsigned int flags; /* used in later patches */ + unsigned int flags; unsigned int lineno; const char *modname; const char *function; diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index f79217427d81..3ef702e6b69a 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -80,7 +80,7 @@ static void allocinfo_stop(struct seq_file *m, void *arg) static void print_allocinfo_header(struct seq_buf *buf) { /* Output format version, so we can change it. */ - seq_buf_printf(buf, "allocinfo - version: 1.0\n"); + seq_buf_printf(buf, "allocinfo - version: 2.0\n"); seq_buf_printf(buf, "# \n"); } @@ -92,6 +92,8 @@ static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls); codetag_to_text(out, ct); + if (unlikely(alloc_tag_is_inaccurate(tag))) + seq_buf_printf(out, " accurate:no"); seq_buf_putc(out, ' '); seq_buf_putc(out, '\n'); } diff --git a/mm/slub.c b/mm/slub.c index af343ca570b5..9c04f29ee8de 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2143,6 +2143,8 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) */ if (likely(obj_exts)) alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); + else + alloc_tag_set_inaccurate(current->alloc_tag); } static inline void From 2a05df14b3ad921ff2fcc6cc535cb153cbf38c87 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:23:35 -0700 Subject: [PATCH 324/372] mm/damon/core: reset age if nr_accesses changes between non-zero and zero Patch series "mm/damon: misc fixups and improvements for 6.18", v2. Misc fixes and improvements for DAMON that are not critical and therefore aims to be merged into Linux 6.18-rc1. The first patch improves DAMON's age counting for nr_accesses zero to/from non-zero changes. 
The second patch fixes an initial DAMOS apply interval delay issue that is not realistic but still could happen on an odd setup. The third and the fourth patches update DAMON community meetup description and DAMON user-space tool example command for DAMOS usage, respectively. Finally, the fifth patch updates MAINTAINERS section name for DAMON to just DAMON. This patch (of 5): DAMON resets the age of a region if its nr_accesses value has significantly changed. Specifically, the threshold is calculated as 20% of the largest nr_accesses of the current snapshot. This means that regions changing the nr_accesses from zero to a small non-zero value, or from a small non-zero value to zero, will keep the age. Since many users treat zero nr_accesses regions as special, this can be confusing. Kernel code, including DAMOS' region priority calculation and DAMON_STAT's idle time calculation, also treats zero nr_accesses regions as special. Remove the confusion by resetting the age when the nr_accesses changes between zero and a non-zero value. Link: https://lkml.kernel.org/r/20250916032339.115817-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250916032339.115817-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Joshua Hahn Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index be5942435d78..ff2c6bb30621 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2261,6 +2261,8 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, damon_for_each_region_safe(r, next, t) { if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; + else if ((r->nr_accesses == 0) != (r->last_nr_accesses == 0)) + r->age = 0; else r->age++; From ac93e87c66fd30b6cf328591ea0f09321ab98fac Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:23:36 -0700 Subject: [PATCH 325/372] mm/damon/core: set effective quota on first charge window The effective quota of a scheme is initialized to zero, which means there is no quota. It is set based on the user-specified time/quota/quota goals. But that value is set only from the second charge window. As a result, a scheme having a user-specified quota can work as not having the quota (unexpectedly fast) for the first charge window. In practical and common use cases the quota interval is not too long, and the scheme's target access pattern is restrictive. Hence the issue should be modest. That said, it is apparently an unintended misbehavior. Fix the problem by setting esz on the first charge window.
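As a worked example with hypothetical parameters: with a 1 second quota reset interval and a 100 MiB size quota, the scheme could previously apply its action without any limit for the entire first second, since esz kept its initial value of zero until the second window started; with this fix, damos_set_effective_quota() is called when the first charge window opens, so the 100 MiB cap is enforced from the beginning.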
Link: https://lkml.kernel.org/r/20250916032339.115817-3-sj@kernel.org Fixes: 1cd243030059 ("mm/damon/schemes: implement time quota") # 5.16.x Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/damon/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index ff2c6bb30621..775121ae7a9b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2142,8 +2142,10 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) return; /* First charge window */ - if (!quota->total_charged_sz && !quota->charged_from) + if (!quota->total_charged_sz && !quota->charged_from) { quota->charged_from = jiffies; + damos_set_effective_quota(quota); + } /* New charge window starts */ if (time_after_eq(jiffies, quota->charged_from + From 489c5d096e6bdae0742299861c2f6551b73a7823 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:23:37 -0700 Subject: [PATCH 326/372] Docs/mm/damon/maintainer-profile: update community meetup for reservation requirements The DAMON community used to maintain two different kinds of meetups: ones requiring reservation and ones not requiring it. The meetup that did not require reservation is now gone, but the documentation on the maintainer-profile has not been updated. Update it. Link: https://lkml.kernel.org/r/20250916032339.115817-4-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Joshua Hahn Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/mm/damon/maintainer-profile.rst | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/Documentation/mm/damon/maintainer-profile.rst b/Documentation/mm/damon/maintainer-profile.rst index 5cd07905a193..58a3fb3c5762 100644 --- a/Documentation/mm/damon/maintainer-profile.rst +++ b/Documentation/mm/damon/maintainer-profile.rst @@ -89,18 +89,13 @@ the maintainer. Community meetup ---------------- -DAMON community is maintaining two bi-weekly meetup series for community -members who prefer synchronous conversations over mails. +DAMON community has a bi-weekly meetup series for members who prefer +synchronous conversations over mails. It is for discussions on specific topics +between a group of members including the maintainer. The maintainer shares the +available time slots, and attendees should reserve one of those at least 24 +hours before the time slot, by reaching out to the maintainer. -The first one is for any discussion between every community member. No -reservation is needed. -The seconds one is for discussions on specific topics between restricted -members including the maintainer. The maintainer shares the available time -slots, and attendees should reserve one of those at least 24 hours before the -time slot, by reaching out to the maintainer. -Schedules and available reservation time slots are available at the Google `doc +Schedules and reservation status are available at the Google `doc `_.
There is also a public Google `calendar `_ From bff3026320adad29159758f4588cffc2cf5cb4b4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:23:38 -0700 Subject: [PATCH 327/372] Docs/admin-guide/mm/damon/start: add --target_pid to DAMOS example command The example command doesn't work [1] on the latest DAMON user-space tool, since the --damos_action option has been updated to receive multiple arguments, and the tool hence cannot know whether the final argument is the deducible monitoring target or an argument for the --damos_action option. Add the --target_pid option to let damo understand it is the target pid. Link: https://lkml.kernel.org/r/20250916032339.115817-5-sj@kernel.org Link: https://github.com/damonitor/damo/pull/32 [2] Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/start.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index ede14b679d02..ec8c34b2d32f 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -175,4 +175,4 @@ Below command makes every memory region of size >=4K that has not accessed for $ sudo damo start --damos_access_rate 0 0 --damos_sz_region 4K max \ --damos_age 60s max --damos_action pageout \ - + --target_pid From 9044cbe50a708b77f0135a9325f0298ff6266853 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:23:39 -0700 Subject: [PATCH 328/372] MAINTAINERS: rename DAMON section The DAMON section name is 'DATA ACCESS MONITOR', which implies it is only for data access monitoring. But DAMON has now evolved to cover not only access monitoring but also access-aware system operations (DAMOS). Rename the section to simply DAMON. It might make it difficult to understand what it does at a glance, but it at least does not spread more confusion. Readers can further refer to the documentation to better understand what DAMON really does. Link: https://lkml.kernel.org/r/20250916032339.115817-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Joshua Hahn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4c8bbf70a3c7..ca8e3d18eedd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6738,7 +6738,7 @@ S: Maintained W: https://docs.dasharo.com/ F: drivers/platform/x86/dasharo-acpi.c -DATA ACCESS MONITOR +DAMON M: SeongJae Park L: damon@lists.linux.dev L: linux-mm@kvack.org From ab152db3cae520154d572cff32e63de441672454 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:35:05 -0700 Subject: [PATCH 329/372] mm/damon/core: implement damon_initialized() function Patch series "mm/damon: define and use DAMON initialization check function". DAMON is initialized at subsystem initialization time, by damon_init(). If DAMON API functions are called before the initialization, the system could crash. Actually such issues happened and were fixed [1] in the past. For the fix, DAMON API callers have been updated to check whether DAMON is initialized, using their own hacks.
The hacks are unnecessarily duplicated in every DAMON API caller and would therefore be difficult to maintain reliably in the long term. Make it reliable and easy to maintain. For this, implement a new DAMON core layer API function that returns whether DAMON is successfully initialized. If it returns true, it means DAMON API functions are safe to be used. After the introduction of the new API, update DAMON API callers to use the new function instead of their own hacks. This patch (of 7): If DAMON is used before it has been successfully initialized, the caller could crash. The DAMON core layer does not provide a reliable way to see whether it is successfully initialized and therefore ready to be used, though. As a result, DAMON API callers are implementing their own hacks to see it. The hacks simply assume DAMON should be ready at module init time. That is not reliable, as DAMON initialization can indeed fail if KMEM_CACHE() fails, and it is difficult to maintain, as those hacks are duplicates. Implement a core layer API function for better reliability and maintainability, to replace the hacks with followup commits. Link: https://lkml.kernel.org/r/20250916033511.116366-2-sj@kernel.org Link: https://lore.kernel.org/20250909022238.2989-1-sj@kernel.org [1] Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 1 + mm/damon/core.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index aa7381be388c..cae8c613c5fc 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -938,6 +938,7 @@ static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs } +bool damon_initialized(void); int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); bool damon_is_running(struct damon_ctx *ctx); diff --git a/mm/damon/core.c b/mm/damon/core.c index 775121ae7a9b..93848b4c6944 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2863,6 +2863,16 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, r->nr_accesses++; } +/** + * damon_initialized() - Return if DAMON is ready to be used. + * + * Return: true if DAMON is ready to be used, false otherwise. + */ +bool damon_initialized(void) +{ + return damon_region_cache != NULL; +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); From b663f17b738f4a9a9f599dff1bed8e4db519e6f8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:35:06 -0700 Subject: [PATCH 330/372] mm/damon/stat: use damon_initialized() DAMON_STAT is assuming DAMON is ready to use in module_init time, and uses its own hack to see if it is the time. Use damon_initialized(), which is a way for seeing if DAMON is ready to be used that is more reliable and better to maintain instead of the hack.
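A minimal usage sketch of the new check for a module-init-time DAMON API caller (the client module below is hypothetical; the real conversions follow in the next commits):

	static int __init example_damon_client_init(void)
	{
		/* DAMON's own initialization may have failed, e.g., a KMEM_CACHE() failure */
		if (!damon_initialized())
			return -ENOMEM;
		/* from here on, DAMON API calls such as damon_new_ctx() are safe */
		return 0;
	}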
Link: https://lkml.kernel.org/r/20250916033511.116366-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/stat.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 87bcd8866d4b..c4fbd8cfa5eb 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -214,8 +214,6 @@ static void damon_stat_stop(void) damon_destroy_ctx(damon_stat_context); } -static bool damon_stat_init_called; - static int damon_stat_enabled_store( const char *val, const struct kernel_param *kp) { @@ -229,7 +227,7 @@ static int damon_stat_enabled_store( if (is_enabled == enabled) return 0; - if (!damon_stat_init_called) + if (!damon_initialized()) /* * probably called from command line parsing (parse_args()). * Cannot call damon_new_ctx(). Let damon_stat_init() handle. @@ -250,12 +248,16 @@ static int __init damon_stat_init(void) { int err = 0; - damon_stat_init_called = true; + if (!damon_initialized()) { + err = -ENOMEM; + goto out; + } /* probably set via command line */ if (enabled) err = damon_stat_start(); +out: if (err && enabled) enabled = false; return err; From 3f7a914ab9a5e46cf8aac7de270f02aa3f63de04 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:35:07 -0700 Subject: [PATCH 331/372] mm/damon/reclaim: use damon_initialized() DAMON_RECLAIM is assuming DAMON is ready to use in module_init time, and uses its own hack to see if it is the time. Use damon_initialized(), which is a way for seeing if DAMON is ready to be used that is more reliable and better to maintain instead of the hack. Link: https://lkml.kernel.org/r/20250916033511.116366-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 590f9d6c55ef..7ba3d0f9a19a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -349,7 +349,7 @@ static int damon_reclaim_enabled_store(const char *val, return 0; /* Called before init function. The function will handle this. */ - if (!ctx) + if (!damon_initialized()) goto set_param_out; err = damon_reclaim_turn(enable); @@ -372,8 +372,13 @@ MODULE_PARM_DESC(enabled, static int __init damon_reclaim_init(void) { - int err = damon_modules_new_paddr_ctx_target(&ctx, &target); + int err; + if (!damon_initialized()) { + err = -ENOMEM; + goto out; + } + err = damon_modules_new_paddr_ctx_target(&ctx, &target); if (err) goto out; From c4a8e662c839ac0003e4781aa324cb2d68ed9cb1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:35:08 -0700 Subject: [PATCH 332/372] mm/damon/lru_sort: use damon_initialized() DAMON_LRU_SORT is assuming DAMON is ready to use in module_init time, and uses its own hack to see if it is the time. Use damon_initialized(), which is a way for seeing if DAMON is ready to be used that is more reliable and better to maintain instead of the hack. Link: https://lkml.kernel.org/r/20250916033511.116366-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index ab6173a646bd..42b9a656f9de 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -345,7 +345,7 @@ static int damon_lru_sort_enabled_store(const char *val, return 0; /* Called before init function. The function will handle this. 
*/ - if (!ctx) + if (!damon_initialized()) goto set_param_out; err = damon_lru_sort_turn(enable); @@ -368,8 +368,13 @@ MODULE_PARM_DESC(enabled, static int __init damon_lru_sort_init(void) { - int err = damon_modules_new_paddr_ctx_target(&ctx, &target); + int err; + if (!damon_initialized()) { + err = -ENOMEM; + goto out; + } + err = damon_modules_new_paddr_ctx_target(&ctx, &target); if (err) goto out; From 1f70367f7b6720ca0d3280b202317aa9d0167066 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:35:09 -0700 Subject: [PATCH 333/372] samples/damon/wsse: use damon_initialized() damon_sample_wsse is assuming DAMON is ready to use in module_init time, and uses its own hack to see if it is the time. Use damon_initialized(), which is a way for seeing if DAMON is ready to be used that is more reliable and better to maintain instead of the hack. Link: https://lkml.kernel.org/r/20250916033511.116366-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- samples/damon/wsse.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/samples/damon/wsse.c b/samples/damon/wsse.c index 21eaf15f987d..799ad4443943 100644 --- a/samples/damon/wsse.c +++ b/samples/damon/wsse.c @@ -102,8 +102,6 @@ static void damon_sample_wsse_stop(void) } } -static bool init_called; - static int damon_sample_wsse_enable_store( const char *val, const struct kernel_param *kp) { @@ -117,10 +115,10 @@ static int damon_sample_wsse_enable_store( if (enabled == is_enabled) return 0; - if (enabled) { - if (!init_called) - return 0; + if (!damon_initialized()) + return 0; + if (enabled) { err = damon_sample_wsse_start(); if (err) enabled = false; @@ -134,7 +132,12 @@ static int __init damon_sample_wsse_init(void) { int err = 0; - init_called = true; + if (!damon_initialized()) { + err = -ENOMEM; + if (enabled) + enabled = false; + } + if (enabled) { err = damon_sample_wsse_start(); if (err) From 20c0ed5035fa81872a97c46d2d5beee5aab09800 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:35:10 -0700 Subject: [PATCH 334/372] samples/damon/prcl: use damon_initialized() damon_sample_prcl is assuming DAMON is ready to use in module_init time, and uses its own hack to see if it is the time. Use damon_initialized(), which is a way for seeing if DAMON is ready to be used that is more reliable and better to maintain instead of the hack. 
Link: https://lkml.kernel.org/r/20250916033511.116366-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- samples/damon/prcl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/samples/damon/prcl.c b/samples/damon/prcl.c index 0226652f94d5..b7c50f2656ce 100644 --- a/samples/damon/prcl.c +++ b/samples/damon/prcl.c @@ -122,8 +122,6 @@ static void damon_sample_prcl_stop(void) } } -static bool init_called; - static int damon_sample_prcl_enable_store( const char *val, const struct kernel_param *kp) { @@ -137,7 +135,7 @@ static int damon_sample_prcl_enable_store( if (enabled == is_enabled) return 0; - if (!init_called) + if (!damon_initialized()) return 0; if (enabled) { @@ -154,7 +152,12 @@ static int __init damon_sample_prcl_init(void) { int err = 0; - init_called = true; + if (!damon_initialized()) { + if (enabled) + enabled = false; + return -ENOMEM; + } + if (enabled) { err = damon_sample_prcl_start(); if (err) From d93871f473c57ed234d4549a602163019ea5b43f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 15 Sep 2025 20:35:11 -0700 Subject: [PATCH 335/372] samples/damon/mtier: use damon_initialized() damon_sample_mtier is assuming DAMON is ready to use in module_init time, and uses its own hack to see if it is the time. Use damon_initialized(), which is a way for seeing if DAMON is ready to be used that is more reliable and better to maintain instead of the hack. Link: https://lkml.kernel.org/r/20250916033511.116366-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- samples/damon/mtier.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/samples/damon/mtier.c b/samples/damon/mtier.c index beaf36657dea..775838a23d93 100644 --- a/samples/damon/mtier.c +++ b/samples/damon/mtier.c @@ -193,8 +193,6 @@ static void damon_sample_mtier_stop(void) damon_destroy_ctx(ctxs[1]); } -static bool init_called; - static int damon_sample_mtier_enable_store( const char *val, const struct kernel_param *kp) { @@ -208,7 +206,7 @@ static int damon_sample_mtier_enable_store( if (enabled == is_enabled) return 0; - if (!init_called) + if (!damon_initialized()) return 0; if (enabled) { @@ -225,7 +223,12 @@ static int __init damon_sample_mtier_init(void) { int err = 0; - init_called = true; + if (!damon_initialized()) { + if (enabled) + enabled = false; + return -ENOMEM; + } + if (enabled) { err = damon_sample_mtier_start(); if (err) From cc7ceb1d14b0a6f6eb83dfdfa91970f69c4c23b4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Sep 2025 11:31:26 -0700 Subject: [PATCH 336/372] mm/damon/stat: expose the current tuned aggregation interval Patch series "mm/damon/stat: expose auto-tuned intervals and non-idle ages". DAMON_STAT intentionally provides limited information, for easy consumption of the information. From production fleet-level usage, however, the limitations below have been found. The aggregation interval of DAMON_STAT represents the granularity of the memory_idle_ms_percentiles. But the interval is auto-tuned and not exposed to users, so users cannot know the granularity. All memory regions of non-zero (positive) nr_accesses are treated as having zero idle time. A significant portion of production systems has such zero idle time. Hence a breakdown of warm and hot data is nearly impossible. Make the following changes to overcome the limitations. Expose the auto-tuned aggregation interval with a new parameter named aggr_interval_us.
Expose the age of non-zero nr_accesses regions (that is, how long a region has retained a >0 access frequency) as a negative idle time. This patch (of 2): DAMON_STAT calculates the idle time for a region as the region's age multiplied by the aggregation interval. That is, the aggregation interval is the granularity of the idle time. Since the aggregation interval is auto-tuned and not exposed to users, however, users cannot easily know at what granularity the stat is made. Expose the tuned aggregation interval in microseconds via a new parameter, aggr_interval_us. Link: https://lkml.kernel.org/r/20250916183127.65708-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250916183127.65708-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/stat.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/damon/stat.c b/mm/damon/stat.c index c4fbd8cfa5eb..1a8465abef4a 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -39,6 +39,11 @@ module_param_array(memory_idle_ms_percentiles, ulong, NULL, 0400); MODULE_PARM_DESC(memory_idle_ms_percentiles, "Memory idle time percentiles in milliseconds"); +static unsigned long aggr_interval_us; +module_param(aggr_interval_us, ulong, 0400); +MODULE_PARM_DESC(aggr_interval_us, + "Current tuned aggregation interval in microseconds"); + static struct damon_ctx *damon_stat_context; static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) @@ -133,6 +138,7 @@ static int damon_stat_damon_call_fn(void *data) return 0; last_refresh_jiffies = jiffies; + aggr_interval_us = c->attrs.aggr_interval; damon_stat_set_estimated_memory_bandwidth(c); damon_stat_set_idletime_percentiles(c); return 0; From a983a26d52989996f7626d10833610572112e753 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 16 Sep 2025 11:31:27 -0700 Subject: [PATCH 337/372] mm/damon/stat: expose negative idle time DAMON_STAT calculates the idle time of a region using the region's age if the region's nr_accesses is zero. If the nr_accesses value is non-zero (positive), the idle time of the region becomes zero. This means the users cannot know how warm and hot data is distributed, using DAMON_STAT's memory_idle_ms_percentiles output. The other stat, namely estimated_memory_bandwidth, can help in understanding the overall access temperature of the system, but it is still very rough information. On production systems, actually, a significant portion of the system memory is observed with zero idle time, and we cannot break it down based on its internal hotness distribution. Define the idle time of the region using its age, similar to those having zero nr_accesses, but multiply it by '-1' to distinguish the two cases. And expose that using the same parameter interface, memory_idle_ms_percentiles.
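For illustration, the two interfaces can be consumed together as below; a minimal userspace sketch assuming the usual /sys/module/damon_stat/parameters/ location of the module parameters, with made-up values in the comments:

	#include <stdio.h>

	int main(void)
	{
		char buf[4096];
		FILE *f;

		/* granularity of the idle times below, in microseconds */
		f = fopen("/sys/module/damon_stat/parameters/aggr_interval_us", "r");
		if (f && fgets(buf, sizeof(buf), f))
			printf("granularity: %s", buf);	/* e.g. 100000 */
		if (f)
			fclose(f);

		/* signed percentiles: negative entries are hot, not idle */
		f = fopen("/sys/module/damon_stat/parameters/memory_idle_ms_percentiles", "r");
		if (f && fgets(buf, sizeof(buf), f))
			printf("percentiles: %s", buf);	/* e.g. -700,-100,0,400,... */
		if (f)
			fclose(f);
		return 0;
	}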
Link: https://lkml.kernel.org/r/20250916183127.65708-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/stat.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/damon/stat.c b/mm/damon/stat.c index 1a8465abef4a..d8010968bbed 100644 --- a/mm/damon/stat.c +++ b/mm/damon/stat.c @@ -34,8 +34,8 @@ module_param(estimated_memory_bandwidth, ulong, 0400); MODULE_PARM_DESC(estimated_memory_bandwidth, "Estimated memory bandwidth usage in bytes per second"); -static unsigned long memory_idle_ms_percentiles[101] __read_mostly = {0,}; -module_param_array(memory_idle_ms_percentiles, ulong, NULL, 0400); +static long memory_idle_ms_percentiles[101] __read_mostly = {0,}; +module_param_array(memory_idle_ms_percentiles, long, NULL, 0400); MODULE_PARM_DESC(memory_idle_ms_percentiles, "Memory idle time percentiles in milliseconds"); @@ -61,10 +61,10 @@ static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c) MSEC_PER_SEC / c->attrs.aggr_interval; } -static unsigned int damon_stat_idletime(const struct damon_region *r) +static int damon_stat_idletime(const struct damon_region *r) { if (r->nr_accesses) - return 0; + return -1 * (r->age + 1); return r->age + 1; } @@ -122,7 +122,7 @@ static void damon_stat_set_idletime_percentiles(struct damon_ctx *c) while (next_percentile <= accounted_bytes * 100 / total_sz) memory_idle_ms_percentiles[next_percentile++] = damon_stat_idletime(region) * - c->attrs.aggr_interval / USEC_PER_MSEC; + (long)c->attrs.aggr_interval / USEC_PER_MSEC; } kfree(sorted_regions); } From 62b98015d98815ad65ab5ca4d4077bc7c952e917 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Mon, 8 Sep 2025 13:20:27 +0530 Subject: [PATCH 338/372] mm: enable khugepaged anonymous collapse on non-writable regions Patch series "Expand scope of khugepaged anonymous collapse", v2. Currently khugepaged does not collapse an anonymous region which does not have a single writable pte. This is wasteful since a region mapped with non-writable ptes, for example, non-writable VMAs mapped by the application, won't benefit from THP collapse. An additional consequence of this constraint is that MADV_COLLAPSE does not perform a collapse on a non-writable VMA, and this restriction is nowhere to be found on the manpage - the restriction itself sounds wrong to me since the user knows the protection of the memory it has mapped, so collapsing read-only memory via madvise() should be a choice of the user which shouldn't be overridden by the kernel. Therefore, remove this constraint. On an arm64 bare metal machine, comparing with vanilla 6.17-rc2, an average of 5% improvement is seen on some mmtests benchmarks, particularly hackbench, with a maximum improvement of 12%. In the following table, (I) denotes statistically significant improvement, (R) denotes statistically significant regression. 
+-------------------------+--------------------------------+---------------+ | mmtests/hackbench | process-pipes-1 (seconds) | -0.06% | | | process-pipes-4 (seconds) | -0.27% | | | process-pipes-7 (seconds) | (I) -12.13% | | | process-pipes-12 (seconds) | (I) -5.32% | | | process-pipes-21 (seconds) | (I) -2.87% | | | process-pipes-30 (seconds) | (I) -3.39% | | | process-pipes-48 (seconds) | (I) -5.65% | | | process-pipes-79 (seconds) | (I) -6.74% | | | process-pipes-110 (seconds) | (I) -6.26% | | | process-pipes-141 (seconds) | (I) -4.99% | | | process-pipes-172 (seconds) | (I) -4.45% | | | process-pipes-203 (seconds) | (I) -3.65% | | | process-pipes-234 (seconds) | (I) -3.45% | | | process-pipes-256 (seconds) | (I) -3.47% | | | process-sockets-1 (seconds) | 2.13% | | | process-sockets-4 (seconds) | 1.02% | | | process-sockets-7 (seconds) | -0.26% | | | process-sockets-12 (seconds) | -1.24% | | | process-sockets-21 (seconds) | 0.01% | | | process-sockets-30 (seconds) | -0.15% | | | process-sockets-48 (seconds) | 0.15% | | | process-sockets-79 (seconds) | 1.45% | | | process-sockets-110 (seconds) | -1.64% | | | process-sockets-141 (seconds) | (I) -4.27% | | | process-sockets-172 (seconds) | 0.30% | | | process-sockets-203 (seconds) | -1.71% | | | process-sockets-234 (seconds) | -1.94% | | | process-sockets-256 (seconds) | -0.71% | | | thread-pipes-1 (seconds) | 0.66% | | | thread-pipes-4 (seconds) | 1.66% | | | thread-pipes-7 (seconds) | -0.17% | | | thread-pipes-12 (seconds) | (I) -4.12% | | | thread-pipes-21 (seconds) | (I) -2.13% | | | thread-pipes-30 (seconds) | (I) -3.78% | | | thread-pipes-48 (seconds) | (I) -5.77% | | | thread-pipes-79 (seconds) | (I) -5.31% | | | thread-pipes-110 (seconds) | (I) -6.12% | | | thread-pipes-141 (seconds) | (I) -4.00% | | | thread-pipes-172 (seconds) | (I) -3.01% | | | thread-pipes-203 (seconds) | (I) -2.62% | | | thread-pipes-234 (seconds) | (I) -2.00% | | | thread-pipes-256 (seconds) | (I) -2.30% | | | thread-sockets-1 (seconds) | (R) 2.39% | +-------------------------+--------------------------------+---------------+ +-------------------------+------------------------------------------------+ | mmtests/sysbench-mutex | sysbenchmutex-1 (usec) | -0.02% | | | sysbenchmutex-4 (usec) | -0.02% | | | sysbenchmutex-7 (usec) | 0.00% | | | sysbenchmutex-12 (usec) | 0.12% | | | sysbenchmutex-21 (usec) | -0.40% | | | sysbenchmutex-30 (usec) | 0.08% | | | sysbenchmutex-48 (usec) | 2.59% | | | sysbenchmutex-79 (usec) | -0.80% | | | sysbenchmutex-110 (usec) | -3.87% | | | sysbenchmutex-128 (usec) | (I) -4.46% | +-------------------------+--------------------------------+---------------+ This patch (of 2): Currently khugepaged does not collapse an anonymous region which does not have a single writable pte. This is wasteful since a region mapped with non-writable ptes, for example, non-writable VMAs mapped by the application, won't benefit from THP collapse. An additional consequence of this constraint is that MADV_COLLAPSE does not perform a collapse on a non-writable VMA, and this restriction is nowhere to be found on the manpage - the restriction itself sounds wrong to me since the user knows the protection of the memory it has mapped, so collapsing read-only memory via madvise() should be a choice of the user which shouldn't be overridden by the kernel. Therefore, remove this restriction by not honouring SCAN_PAGE_RO. 
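To make the behavior change concrete, the following is a minimal sketch (not part of the patch; it assumes MADV_COLLAPSE-capable headers and a PMD-aligned mapping, and elides error handling) of a collapse on memory that holds no writable pte:

	#include <string.h>
	#include <sys/mman.h>

	#define SZ (2UL << 20)

	int main(void)
	{
		char *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		memset(p, 1, SZ);		/* populate with private pages */
		mprotect(p, SZ, PROT_READ);	/* no writable pte remains */

		/* previously refused via SCAN_PAGE_RO; now honoured */
		return madvise(p, SZ, MADV_COLLAPSE);
	}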
Link: https://lkml.kernel.org/r/20250908075028.38431-1-dev.jain@arm.com Link: https://lkml.kernel.org/r/20250908075028.38431-2-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand Acked-by: Zi Yan Reviewed-by: Wei Yang Reviewed-by: Kiryl Shutsemau Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Reviewed-by: Zach O'Keefe Reviewed-by: Anshuman Khandual Cc: Barry Song Cc: Hugh Dickins Cc: Liam Howlett Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/khugepaged.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4ec324a4c1fe..a0f1df2a7ae6 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -676,9 +676,7 @@ next: writable = true; } - if (unlikely(!writable)) { - result = SCAN_PAGE_RO; - } else if (unlikely(cc->is_khugepaged && !referenced)) { + if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; @@ -1421,9 +1419,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, mmu_notifier_test_young(vma->vm_mm, _address))) referenced++; } - if (!writable) { - result = SCAN_PAGE_RO; - } else if (cc->is_khugepaged && + if (cc->is_khugepaged && (!referenced || (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; @@ -2830,7 +2826,6 @@ handle_result: case SCAN_PMD_NULL: case SCAN_PTE_NON_PRESENT: case SCAN_PTE_UFFD_WP: - case SCAN_PAGE_RO: case SCAN_LACK_REFERENCED_PAGE: case SCAN_PAGE_NULL: case SCAN_PAGE_COUNT: From 473b73222f3d8cc66bcd840bf9c3260619620789 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Mon, 8 Sep 2025 13:20:28 +0530 Subject: [PATCH 339/372] mm: drop all references of writable and SCAN_PAGE_RO Now that all actionable outcomes from checking pte_write() are gone, drop the related references. 
Link: https://lkml.kernel.org/r/20250908075028.38431-3-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand Acked-by: Zi Yan Reviewed-by: Kiryl Shutsemau Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Reviewed-by: Zach O'Keefe Reviewed-by: Anshuman Khandual Cc: Baolin Wang Cc: Barry Song Cc: Hugh Dickins Cc: Liam Howlett Cc: Mariano Pache Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Wei Yang Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 19 ++++++------------- mm/khugepaged.c | 14 +++----------- 2 files changed, 9 insertions(+), 24 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 2305df6cb485..dd94d14a2427 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -19,7 +19,6 @@ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ EM( SCAN_PTE_MAPPED_HUGEPAGE, "pte_mapped_hugepage") \ - EM( SCAN_PAGE_RO, "no_writable_page") \ EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ EM( SCAN_PAGE_NULL, "page_null") \ EM( SCAN_SCAN_ABORT, "scan_aborted") \ @@ -55,15 +54,14 @@ SCAN_STATUS TRACE_EVENT(mm_khugepaged_scan_pmd, - TP_PROTO(struct mm_struct *mm, struct folio *folio, bool writable, + TP_PROTO(struct mm_struct *mm, struct folio *folio, int referenced, int none_or_zero, int status, int unmapped), - TP_ARGS(mm, folio, writable, referenced, none_or_zero, status, unmapped), + TP_ARGS(mm, folio, referenced, none_or_zero, status, unmapped), TP_STRUCT__entry( __field(struct mm_struct *, mm) __field(unsigned long, pfn) - __field(bool, writable) __field(int, referenced) __field(int, none_or_zero) __field(int, status) @@ -73,17 +71,15 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, TP_fast_assign( __entry->mm = mm; __entry->pfn = folio ? folio_pfn(folio) : -1; - __entry->writable = writable; __entry->referenced = referenced; __entry->none_or_zero = none_or_zero; __entry->status = status; __entry->unmapped = unmapped; ), - TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d", + TP_printk("mm=%p, scan_pfn=0x%lx, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d", __entry->mm, __entry->pfn, - __entry->writable, __entry->referenced, __entry->none_or_zero, __print_symbolic(__entry->status, SCAN_STATUS), @@ -117,15 +113,14 @@ TRACE_EVENT(mm_collapse_huge_page, TRACE_EVENT(mm_collapse_huge_page_isolate, TP_PROTO(struct folio *folio, int none_or_zero, - int referenced, bool writable, int status), + int referenced, int status), - TP_ARGS(folio, none_or_zero, referenced, writable, status), + TP_ARGS(folio, none_or_zero, referenced, status), TP_STRUCT__entry( __field(unsigned long, pfn) __field(int, none_or_zero) __field(int, referenced) - __field(bool, writable) __field(int, status) ), @@ -133,15 +128,13 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, __entry->pfn = folio ? 
folio_pfn(folio) : -1; __entry->none_or_zero = none_or_zero; __entry->referenced = referenced; - __entry->writable = writable; __entry->status = status; ), - TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, writable=%d, status=%s", + TP_printk("scan_pfn=0x%lx, none_or_zero=%d, referenced=%d, status=%s", __entry->pfn, __entry->none_or_zero, __entry->referenced, - __entry->writable, __print_symbolic(__entry->status, SCAN_STATUS)) ); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a0f1df2a7ae6..af5f5c80fe4e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -39,7 +39,6 @@ enum scan_result { SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, SCAN_PTE_MAPPED_HUGEPAGE, - SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, SCAN_SCAN_ABORT, @@ -557,7 +556,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, struct folio *folio = NULL; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; - bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { @@ -671,9 +669,6 @@ next: folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; - - if (pte_write(pteval)) - writable = true; } if (unlikely(cc->is_khugepaged && !referenced)) { @@ -681,13 +676,13 @@ next: } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(folio, none_or_zero, - referenced, writable, result); + referenced, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(folio, none_or_zero, - referenced, writable, result); + referenced, result); return result; } @@ -1280,7 +1275,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; - bool writable = false; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1344,8 +1338,6 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, result = SCAN_PTE_UFFD_WP; goto out_unmap; } - if (pte_write(pteval)) - writable = true; page = vm_normal_page(vma, _address, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { @@ -1435,7 +1427,7 @@ out_unmap: *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, folio, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, folio, referenced, none_or_zero, result, unmapped); return result; } From af6703838ecb1513efdd2502a8f7bb6472c5ce96 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 3 Sep 2025 18:48:41 +0100 Subject: [PATCH 340/372] mm: specify separate file and vm_file params in vm_area_desc Patch series "mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare()", v2. As part of the efforts to eliminate the problematic f_op->mmap callback, a new callback - f_op->mmap_prepare - was provided. While we are converting these callbacks, we must deal with 'stacked' filesystems and drivers - those which in their own f_op->mmap callback invoke an inner f_op->mmap callback. To accommodate this, a compatibility layer is provided that, via vfs_mmap(), detects if f_op->mmap_prepare is provided and, if so, generates a vm_area_desc containing the VMA's metadata and invokes the callback. So far, we have provided desc->file equal to vma->vm_file. However this is not necessarily valid, especially in the case of stacked drivers which wish to assign a new file after the inner hook is invoked. To account for this, we adjust vm_area_desc to have both file and vm_file fields.
The .vm_file field is strictly set to vma->vm_file (or in the case of a new mapping, what will become vma->vm_file). However, .file is set to whichever file vfs_mmap() is invoked with when using the compatibility layer. Therefore, if the VMA's file needs to be updated in .mmap_prepare, desc->vm_file should be assigned, whilst desc->file should be read. No current f_op->mmap_prepare users assign desc->file so this is safe to do. This makes the .mmap_prepare callback in the context of a stacked filesystem or driver completely consistent with the existing .mmap implementations. While we're here, we do a few small cleanups, and ensure that we const-ify things correctly in the vm_area_desc struct to avoid hooks accidentally trying to assign fields they should not. This patch (of 2): Stacked filesystems and drivers may invoke mmap hooks with a struct file pointer that differs from the overlying file. We will make this functionality possible in a subsequent patch. In order to prepare for this, let's update vm_area_desc to separately provide desc->file and desc->vm_file parameters. The desc->file parameter is the file that the hook is expected to operate upon, and is not assignable (though the hook may wish to e.g. update the file's accessed time). The desc->vm_file parameter defaults to what will become vma->vm_file and is what the hook must reassign should it wish to change the VMA's vma->vm_file. For now we keep desc->file and desc->vm_file the same to remain consistent. No f_op->mmap_prepare() callback sets a new vma->vm_file currently, so this is safe to change. While we're here, make the mm_struct that desc->mm points at immutable, as well as the desc->mm field itself. As part of this change, also update the single hook which this would otherwise break - mlock_future_ok(), invoked by secretmem_mmap_prepare(). We additionally update set_vma_from_desc() to compare fields in a more logical fashion, checking the (possibly) user-modified fields as the first operand against the existing value as the second one. Additionally, update VMA tests to accommodate changes. Link: https://lkml.kernel.org/r/cover.1756920635.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/3fa15a861bb7419f033d22970598aa61850ea267.1756920635.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: Jan Kara Cc: Jann Horn Cc: Liam Howlett Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Pedro Falcato Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 5 +++-- mm/internal.h | 4 ++-- mm/mmap.c | 2 +- mm/util.c | 14 ++++++++++++-- mm/vma.c | 5 +++-- mm/vma.h | 28 ++++------------------------ tools/testing/vma/vma_internal.h | 28 ++++++++++++++++++---------- 7 files changed, 43 insertions(+), 43 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6920c816f6c6..965dedb3ccfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -777,13 +777,14 @@ struct pfnmap_track_ctx { */ struct vm_area_desc { /* Immutable state. */ - struct mm_struct *mm; + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state.
*/ pgoff_t pgoff; - struct file *file; + struct file *vm_file; vm_flags_t vm_flags; pgprot_t page_prot; diff --git a/mm/internal.h b/mm/internal.h index c4657ffd342e..63e3ec8d63be 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -962,8 +962,8 @@ extern long populate_vma_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *locked); extern long faultin_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, bool write, int *locked); -extern bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, - unsigned long bytes); +bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, + unsigned long bytes); /* * NOTE: This function can't tell whether the folio is "fully mapped" in the diff --git a/mm/mmap.c b/mm/mmap.c index 7a057e0e8da9..5fd3b80fda1d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -225,7 +225,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, +bool mlock_future_ok(const struct mm_struct *mm, vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; diff --git a/mm/util.c b/mm/util.c index 732a2dfcaec7..215ecd0214b7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1161,10 +1161,20 @@ EXPORT_SYMBOL(flush_dcache_folio); */ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc; + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = vma->vm_file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; int err; - err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + err = file->f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); diff --git a/mm/vma.c b/mm/vma.c index 3b12c7579831..abe0da33c844 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2572,11 +2572,12 @@ static int call_mmap_prepare(struct mmap_state *map) int err; struct vm_area_desc desc = { .mm = map->mm, + .file = map->file, .start = map->addr, .end = map->end, .pgoff = map->pgoff, - .file = map->file, + .vm_file = map->file, .vm_flags = map->vm_flags, .page_prot = map->page_prot, }; @@ -2588,7 +2589,7 @@ static int call_mmap_prepare(struct mmap_state *map) /* Update fields permitted to be changed. */ map->pgoff = desc.pgoff; - map->file = desc.file; + map->file = desc.vm_file; map->vm_flags = desc.vm_flags; map->page_prot = desc.page_prot; /* User-defined fields. */ diff --git a/mm/vma.h b/mm/vma.h index bcdc261c5b15..9183fe549009 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -222,31 +222,11 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } - /* - * Temporary helper functions for file systems which wrap an invocation of + * Temporary helper function for stacked mmap handlers which specify * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). 
*/ - -static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, - struct vm_area_desc *desc) -{ - desc->mm = vma->vm_mm; - desc->start = vma->vm_start; - desc->end = vma->vm_end; - - desc->pgoff = vma->vm_pgoff; - desc->file = vma->vm_file; - desc->vm_flags = vma->vm_flags; - desc->page_prot = vma->vm_page_prot; - - desc->vm_ops = NULL; - desc->private_data = NULL; - - return desc; -} - static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { @@ -258,9 +238,9 @@ static inline void set_vma_from_desc(struct vm_area_struct *vma, /* Mutable fields. Populated with initial state. */ vma->vm_pgoff = desc->pgoff; - if (vma->vm_file != desc->file) - vma_set_file(vma, desc->file); - if (vma->vm_flags != desc->vm_flags) + if (desc->vm_file != vma->vm_file) + vma_set_file(vma, desc->vm_file); + if (desc->vm_flags != vma->vm_flags) vm_flags_set(vma, desc->vm_flags); vma->vm_page_prot = desc->page_prot; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 437d2a1013be..6dcbeaa9f9a0 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -283,13 +283,14 @@ struct vm_area_struct; */ struct vm_area_desc { /* Immutable state. */ - struct mm_struct *mm; + const struct mm_struct *const mm; + struct file *const file; /* May vary from vm_file in stacked callers. */ unsigned long start; unsigned long end; /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; - struct file *file; + struct file *vm_file; vm_flags_t vm_flags; pgprot_t page_prot; @@ -1299,8 +1300,8 @@ static inline bool capable(int cap) return true; } -static inline bool mlock_future_ok(struct mm_struct *mm, vm_flags_t vm_flags, - unsigned long bytes) +static inline bool mlock_future_ok(const struct mm_struct *mm, + vm_flags_t vm_flags, unsigned long bytes) { unsigned long locked_pages, limit_pages; @@ -1465,16 +1466,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma, - struct vm_area_desc *desc); - -static int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc; + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = vma->vm_file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; int err; - err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc)); + err = file->f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); From f7a741c53b712542aedd9382f215fbe969f8a580 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 3 Sep 2025 18:48:42 +0100 Subject: [PATCH 341/372] mm: do not assume file == vma->vm_file in compat_vma_mmap_prepare() In commit bb666b7c2707 ("mm: add mmap_prepare() compatibility layer for nested file systems") we introduced the ability for stacked drivers and file systems to correctly invoke the f_op->mmap_prepare() handler from an f_op->mmap() handler via a compatibility layer implemented in compat_vma_mmap_prepare(). This populates vm_area_desc fields according to those found in the (not yet fully initialised) VMA passed to f_op->mmap(). However this function implicitly assumes that the struct file which we are operating upon is equal to vma->vm_file. 
This is not a safe assumption in all cases. The only really sane situation in which this matters would be something like e.g. i915_gem_dmabuf_mmap() which invokes vfs_mmap() against obj->base.filp: ret = vfs_mmap(obj->base.filp, vma); if (ret) return ret; And then sets the VMA's file to this, should the mmap operation succeed: vma_set_file(vma, obj->base.filp); That is - it is the file that is intended to back the VMA mapping. This is not an issue currently, as so far we have only implemented f_op->mmap_prepare() handlers for some file systems and internal mm uses, and the only stacked f_op->mmap() operations that can be performed upon these are those in backing_file_mmap() and coda_file_mmap(), both of which use vma->vm_file. However, moving forward, as we convert drivers to using f_op->mmap_prepare(), this will become a problem. Resolve this issue by explicitly setting desc->file to the provided file parameter and update callers accordingly. Callers are expected to read desc->file and update desc->vm_file - the former will be the file provided by the caller (if stacked, this may differ from vma->vm_file). If the caller needs to differentiate between the two they therefore now can. While we are here, also provide a variant of compat_vma_mmap_prepare() that operates against a pointer to any file_operations struct and does not assume that the file_operations struct we are interested in is file->f_op. This function is __compat_vma_mmap_prepare() and we invoke it from compat_vma_mmap_prepare() so that we share code between the two functions. This is important, because some drivers provide hooks in a separate struct, for instance struct drm_device provides an fops field for this purpose. Also update the VMA selftests accordingly. Link: https://lkml.kernel.org/r/dd0c72df8a33e8ffaa243eeb9b01010b670610e9.1756920635.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Christian Brauner Reviewed-by: Pedro Falcato Reviewed-by: Liam R. Howlett Cc: Al Viro Cc: David Hildenbrand Cc: Jan Kara Cc: Jann Horn Cc: Michal Hocko Cc: Mike Rapoport Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 ++ mm/util.c | 62 ++++++++++++++++++++------------ tools/testing/vma/vma_internal.h | 12 +++++-- 3 files changed, 50 insertions(+), 26 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 0783c5d05d3f..594bd4d0521e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2279,6 +2279,8 @@ static inline bool can_mmap_file(struct file *file) return true; } +int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma); int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma); static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/mm/util.c b/mm/util.c index 215ecd0214b7..6c1d64ed0221 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1133,17 +1133,51 @@ void flush_dcache_folio(struct folio *folio) EXPORT_SYMBOL(flush_dcache_folio); #endif +/** + * __compat_vma_mmap_prepare() - See description for compat_vma_mmap_prepare() + * for details. This is the same operation, only with a specific file operations + * struct which may or may not be the same as vma->vm_file->f_op. + * @f_op: The file operations whose .mmap_prepare() hook is specified. + * @file: The file which backs or will back the mapping. + * @vma: The VMA to apply the .mmap_prepare() hook to. + * Returns: 0 on success or error. 
+ */ +int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) +{ + struct vm_area_desc desc = { + .mm = vma->vm_mm, + .file = file, + .start = vma->vm_start, + .end = vma->vm_end, + + .pgoff = vma->vm_pgoff, + .vm_file = vma->vm_file, + .vm_flags = vma->vm_flags, + .page_prot = vma->vm_page_prot, + }; + int err; + + err = f_op->mmap_prepare(&desc); + if (err) + return err; + set_vma_from_desc(vma, &desc); + + return 0; +} +EXPORT_SYMBOL(__compat_vma_mmap_prepare); + /** * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an - * existing VMA - * @file: The file which possesss an f_op->mmap_prepare() hook + * existing VMA. + * @file: The file which possesss an f_op->mmap_prepare() hook. * @vma: The VMA to apply the .mmap_prepare() hook to. * * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain - * 'wrapper' file systems invoke a nested mmap hook of an underlying file. + * stacked filesystems invoke a nested mmap hook of an underlying file. * * Until all filesystems are converted to use .mmap_prepare(), we must be - * conservative and continue to invoke these 'wrapper' filesystems using the + * conservative and continue to invoke these stacked filesystems using the * deprecated .mmap() hook. * * However we have a problem if the underlying file system possesses an @@ -1161,25 +1195,7 @@ EXPORT_SYMBOL(flush_dcache_folio); */ int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma) { - struct vm_area_desc desc = { - .mm = vma->vm_mm, - .file = vma->vm_file, - .start = vma->vm_start, - .end = vma->vm_end, - - .pgoff = vma->vm_pgoff, - .vm_file = vma->vm_file, - .vm_flags = vma->vm_flags, - .page_prot = vma->vm_page_prot, - }; - int err; - - err = file->f_op->mmap_prepare(&desc); - if (err) - return err; - set_vma_from_desc(vma, &desc); - - return 0; + return __compat_vma_mmap_prepare(file->f_op, file, vma); } EXPORT_SYMBOL(compat_vma_mmap_prepare); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 6dcbeaa9f9a0..07167446dcf4 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -1466,8 +1466,8 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int compat_vma_mmap_prepare(struct file *file, - struct vm_area_struct *vma) +static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, + struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, @@ -1482,7 +1482,7 @@ static inline int compat_vma_mmap_prepare(struct file *file, }; int err; - err = file->f_op->mmap_prepare(&desc); + err = f_op->mmap_prepare(&desc); if (err) return err; set_vma_from_desc(vma, &desc); @@ -1490,6 +1490,12 @@ static inline int compat_vma_mmap_prepare(struct file *file, return 0; } +static inline int compat_vma_mmap_prepare(struct file *file, + struct vm_area_struct *vma) +{ + return __compat_vma_mmap_prepare(file->f_op, file, vma); +} + /* Did the driver provide valid mmap hook configuration? 
*/ static inline bool can_mmap_file(struct file *file) { From f8a01513f5749180228d6460662188dd1e101a53 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Mon, 15 Sep 2025 14:52:53 +0100 Subject: [PATCH 342/372] mm/khugepaged: do not fail collapse_pte_mapped_thp() on SCAN_PMD_NULL MADV_COLLAPSE on a file mapping behaves inconsistently depending on whether a PMD page table is installed or not. Consider the following example: p = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); err = madvise(p, 2UL << 20, MADV_COLLAPSE); fd is a populated tmpfs file. The result depends on the address that the kernel returns on mmap(). If it is located in an existing PMD table, the madvise() will succeed. However, if the table does not exist, it will fail with -EINVAL. This occurs because find_pmd_or_thp_or_none() returns SCAN_PMD_NULL when a page table is missing, which causes collapse_pte_mapped_thp() to fail. SCAN_PMD_NULL and SCAN_PMD_NONE should be treated the same in collapse_pte_mapped_thp(): install the PMD leaf entry and allocate page tables as needed. Link: https://lkml.kernel.org/r/v5ivpub6z2n2uyemlnxgbilzs52ep4lrary7lm7o6axxoneb75@yfacfl5rkzeh Signed-off-by: Kiryl Shutsemau Acked-by: David Hildenbrand Reviewed-by: Dev Jain Reviewed-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Lorenzo Stoakes Reviewed-by: Zach O'Keefe Cc: Barry Song Cc: "Kirill A. Shutemov" Cc: Liam Howlett Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/khugepaged.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index af5f5c80fe4e..9ed1af2b5c38 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1460,15 +1460,32 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio, struct page *page) { + struct mm_struct *mm = vma->vm_mm; struct vm_fault vmf = { .vma = vma, .address = addr, .flags = 0, - .pmd = pmdp, }; + pgd_t *pgdp; + p4d_t *p4dp; + pud_t *pudp; mmap_assert_locked(vma->vm_mm); + if (!pmdp) { + pgdp = pgd_offset(mm, addr); + p4dp = p4d_alloc(mm, pgdp, addr); + if (!p4dp) + return SCAN_FAIL; + pudp = pud_alloc(mm, p4dp, addr); + if (!pudp) + return SCAN_FAIL; + pmdp = pmd_alloc(mm, pudp, addr); + if (!pmdp) + return SCAN_FAIL; + } + + vmf.pmd = pmdp; if (do_set_pmd(&vmf, folio, page)) return SCAN_FAIL; @@ -1544,6 +1561,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, switch (result) { case SCAN_SUCCEED: break; + case SCAN_PMD_NULL: case SCAN_PMD_NONE: /* * All pte entries have been removed and pmd cleared. From fde591dad10900b9b4af07a532b5f91c53b20e25 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 17 Sep 2025 06:16:37 +0100 Subject: [PATCH 343/372] mm/oom_kill.c: fix inverted check Fix an incorrect logic conversion in process_mrelease(): the conversion to the mm_flags_*() accessors dropped the negation of the MMF_OOM_SKIP test.
Link: https://lkml.kernel.org/r/3b7f0faf-4dbc-4d67-8a71-752fbcdf0906@lucifer.local Fixes: 12e423ba4eae ("mm: convert core mm to mm_flags_*() accessors") Signed-off-by: Lorenzo Stoakes Reported-by: Chris Mason Closes: https://lkml.kernel.org/r/c2e28e27-d84b-4671-8784-de5fe0d14f41@lucifer.local Signed-off-by: Andrew Morton --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e33087c60f3b..c145b0feecc1 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1257,7 +1257,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure * possible change in exit_mmap is seen */ - if (mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) + if (!mm_flags_test(MMF_OOM_SKIP, mm) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); From 19c5fb83f2a4dde7b53b3aeb1fa87bfa3559286b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 19 Sep 2025 12:21:34 -0400 Subject: [PATCH 344/372] mm: page_alloc: avoid kswapd thrashing due to NUMA restrictions On NUMA systems without bindings, allocations check all nodes for free space, then wake up the kswapds on all nodes and retry. This ensures all available space is evenly used before reclaim begins. However, when one process or certain allocations have node restrictions, they can cause kswapds on only a subset of nodes to be woken up. Since kswapd hysteresis targets watermarks that are *higher* than needed for allocation, even *unrestricted* allocations can now get suckered onto such nodes that are already pressured. This ends up concentrating all allocations on them, even when there are idle nodes available for the unrestricted requests. This was observed with two numa nodes, where node0 is normal and node1 is ZONE_MOVABLE to facilitate hotplugging: a kernel allocation wakes kswapd on node0 only (since node1 is not eligible); once kswapd0 is active, the watermarks hover between low and high, and then even the movable allocations end up on node0, only to be kicked out again; meanwhile node1 is empty and idle. Similar behavior is possible when a process with NUMA bindings is causing selective kswapd wakeups. To fix this, on NUMA systems augment the (misleading) watermark test with a check for whether kswapd is already active during the first iteration through the zonelist. If this fails to place the request, kswapd must be running everywhere already, and the watermark test is good enough to decide placement. With this patch, unrestricted requests successfully make use of node1, even while kswapd is reclaiming node0 for restricted allocations. 
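In outline, the zonelist walk becomes a two-pass affair. The following is a simplified sketch of the control flow, where kswapd_is_active() stands in for the !waitqueue_active() check on the node's kswapd wait queue and for_each_zone_in_zonelist() abbreviates the real iteration:

	bool skip_kswapd_nodes = nr_online_nodes > 1;
	bool skipped_kswapd_nodes = false;
retry:
	for_each_zone_in_zonelist(zone) {
		/* first pass: prefer nodes whose kswapd is idle */
		if (skip_kswapd_nodes && kswapd_is_active(zone)) {
			skipped_kswapd_nodes = true;
			continue;
		}
		/* usual watermark checks and allocation attempt ... */
	}
	/* all kswapds were busy: retry and let watermarks decide */
	if (skip_kswapd_nodes && skipped_kswapd_nodes) {
		skip_kswapd_nodes = false;
		goto retry;
	}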
[gourry@gourry.net: don't retry if no kswapds were active] Link: https://lkml.kernel.org/r/20250919162134.1098208-1-hannes@cmpxchg.org Signed-off-by: Gregory Price Tested-by: Joshua Hahn Signed-off-by: Johannes Weiner Acked-by: Zi Yan Cc: Brendan Jackman Cc: Joshua Hahn Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bfab96c207f..304e12bf2e4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3735,6 +3735,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct pglist_data *last_pgdat = NULL; bool last_pgdat_dirty_ok = false; bool no_fallback; + bool skip_kswapd_nodes = nr_online_nodes > 1; + bool skipped_kswapd_nodes = false; retry: /* @@ -3797,6 +3799,19 @@ retry: } } + /* + * If kswapd is already active on a node, keep looking + * for other nodes that might be idle. This can happen + * if another process has NUMA bindings and is causing + * kswapd wakeups on only some nodes. Avoid accidental + * "node_reclaim_mode"-like behavior in this case. + */ + if (skip_kswapd_nodes && + !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) { + skipped_kswapd_nodes = true; + continue; + } + cond_accept_memory(zone, order, alloc_flags); /* @@ -3888,6 +3903,15 @@ try_this_zone: } } + /* + * If we skipped over nodes with active kswapds and found no + * idle nodes, retry and place anywhere the watermarks permit. + */ + if (skip_kswapd_nodes && skipped_kswapd_nodes) { + skip_kswapd_nodes = false; + goto retry; + } + /* * It's possible on a UMA machine to get through all zones that are * fragmented. If avoiding fragmentation, reset and try again. From 1b00ab48892fe6115618e2c81f9c1891ad0c0a5a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Sep 2025 19:11:59 +0100 Subject: [PATCH 345/372] ksm: use a folio inside cmp_and_merge_page() This removes the last call to page_stable_node(), so delete the wrapper. It also removes a call to trylock_page() and saves a call to compound_head(), as well as removing a reference to folio->page. Link: https://lkml.kernel.org/r/20250916181219.2400258-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Chengming Zhou Acked-by: David Hildenbrand Cc: Longlong Xia Cc: xu xin Signed-off-by: Andrew Morton --- mm/ksm.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 2ef29802a49b..2dbe92e3dd52 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1061,11 +1061,6 @@ struct ksm_stable_node *folio_stable_node(const struct folio *folio) return folio_test_ksm(folio) ? 
folio_raw_mapping(folio) : NULL; } -static inline struct ksm_stable_node *page_stable_node(struct page *page) -{ - return folio_stable_node(page_folio(page)); -} - static inline void folio_set_stable_node(struct folio *folio, struct ksm_stable_node *stable_node) { @@ -2225,6 +2220,7 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { + struct folio *folio = page_folio(page); struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; @@ -2233,7 +2229,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_ite int err; bool max_page_sharing_bypass = false; - stable_node = page_stable_node(page); + stable_node = folio_stable_node(folio); if (stable_node) { if (stable_node->head != &migrate_nodes && get_kpfn_nid(READ_ONCE(stable_node->kpfn)) != @@ -2272,7 +2268,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_ite /* Start by searching for the folio in the stable tree */ kfolio = stable_tree_search(page); - if (&kfolio->page == page && rmap_item->head == stable_node) { + if (kfolio == folio && rmap_item->head == stable_node) { folio_put(kfolio); return; } @@ -2353,10 +2349,11 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_ite * the page is locked, it is better to skip it and * perhaps try again later. */ - if (!trylock_page(page)) + if (!folio_trylock(folio)) return; split_huge_page(page); - unlock_page(page); + folio = page_folio(page); + folio_unlock(folio); } } } From 7ef5268a907534c4e6373b0d3fe45e0b3d95bfe2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 17 Sep 2025 20:59:06 +0200 Subject: [PATCH 346/372] mm/vmalloc: move resched point into alloc_vmap_area() Currently vm_area_alloc_pages() contains two cond_resched() points. However, the page allocator already has its own in the slow path, so an extra resched is not optimal because it delays the loops. The place where CPU time can be consumed is the VA-space search in alloc_vmap_area(), especially if the space is heavily fragmented (as in synthetic stress tests), after the fast path falls back to the slow one. Move a single cond_resched() there, after dropping free_vmap_area_lock in the slow path. This keeps fairness where it matters while removing redundant yields from the page-allocation path. [akpm@linux-foundation.org: tweak comment grammar] Link: https://lkml.kernel.org/r/20250917185906.1595454-1-urezki@gmail.com Signed-off-by: Uladzislau Rezki (Sony) Cc: Baoquan He Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/vmalloc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4249e1e01947..798b2ed21e46 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2057,6 +2057,12 @@ retry: addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list, size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); + + /* + * This is not a fast path. Check if yielding is needed. This + * is the only reschedule point in the vmalloc() path.
+ */ + cond_resched(); } trace_alloc_vmap_area(addr, size, align, vstart, vend, IS_ERR_VALUE(addr)); @@ -3622,7 +3628,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, pages + nr_allocated); nr_allocated += nr; - cond_resched(); /* * If zero or pages were obtained partly, @@ -3664,7 +3669,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid, for (i = 0; i < (1U << order); i++) pages[nr_allocated + i] = page + i; - cond_resched(); nr_allocated += 1U << order; } From 8d009da32f13759ee7a6ec002ba62dc8faeb6423 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 17 Sep 2025 08:31:54 -0700 Subject: [PATCH 347/372] mm/damon/sysfs: set damon_ctx->min_sz_region only for paddr use case damon_ctx->addr_unit is respected only for the physical address space monitoring use case. Meanwhile, damon_ctx->min_sz_region is used by the core layer for aligning regions, regardless of whether it is set for physical address space monitoring or virtual address spaces monitoring. And it is set as 'DAMON_MIN_REGION / damon_ctx->addr_unit'. Hence, if a user sets ->addr_unit in virtual address spaces monitoring mode, regions can unexpectedly be aligned to a min_sz_region value that makes sense only for physical address space monitoring. The issue was found from results of Chris' experiments, which were thankfully shared with me off-list. Link: https://lkml.kernel.org/r/20250917160041.53187-1-sj@kernel.org Fixes: d8f867fa0825 ("mm/damon: add damon_ctx->min_sz_region") Signed-off-by: SeongJae Park Cc: Chris Mason Cc: Kefeng Wang Cc: ze zuo Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index fe4e73d0ebbb..883b0d886d68 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1435,7 +1435,10 @@ static int damon_sysfs_apply_inputs(struct damon_ctx *ctx, if (err) return err; ctx->addr_unit = sys_ctx->addr_unit; - ctx->min_sz_region = max(DAMON_MIN_REGION / sys_ctx->addr_unit, 1); + /* addr_unit is respected by only DAMON_OPS_PADDR */ + if (sys_ctx->ops_id == DAMON_OPS_PADDR) + ctx->min_sz_region = max( + DAMON_MIN_REGION / sys_ctx->addr_unit, 1); err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs); if (err) return err; From 0389c305ef56cbadca4cbef44affc0ec3213ed30 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Wed, 17 Sep 2025 21:31:37 +0800 Subject: [PATCH 348/372] selftests/mm: skip soft-dirty tests when CONFIG_MEM_SOFT_DIRTY is disabled The madv_populate and soft-dirty kselftests currently fail on systems where CONFIG_MEM_SOFT_DIRTY is disabled. Introduce a new helper softdirty_supported() into vm_util.c/h to ensure tests are properly skipped when the feature is not enabled.
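With the helper in place, a soft-dirty dependent test can guard itself with the usual kselftest skip pattern, roughly like (sketch):

	if (!softdirty_supported())
		ksft_exit_skip("soft-dirty not supported\n");
	ksft_set_plan(15);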
Link: https://lkml.kernel.org/r/20250917133137.62802-1-lance.yang@linux.dev Fixes: 9f3265db6ae8 ("selftests: vm: add test for Soft-Dirty PTE bit") Signed-off-by: Lance Yang Acked-by: David Hildenbrand Suggested-by: David Hildenbrand Cc: Lorenzo Stoakes Cc: Shuah Khan Cc: Gabriel Krisman Bertazi Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/madv_populate.c | 21 ++------------------- tools/testing/selftests/mm/soft-dirty.c | 5 ++++- tools/testing/selftests/mm/vm_util.c | 17 +++++++++++++++++ tools/testing/selftests/mm/vm_util.h | 1 + 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c index b6fabd5c27ed..d8d11bc67ddc 100644 --- a/tools/testing/selftests/mm/madv_populate.c +++ b/tools/testing/selftests/mm/madv_populate.c @@ -264,23 +264,6 @@ static void test_softdirty(void) munmap(addr, SIZE); } -static int system_has_softdirty(void) -{ - /* - * There is no way to check if the kernel supports soft-dirty, other - * than by writing to a page and seeing if the bit was set. But the - * tests are intended to check that the bit gets set when it should, so - * doing that check would turn a potentially legitimate fail into a - * skip. Fortunately, we know for sure that arm64 does not support - * soft-dirty. So for now, let's just use the arch as a corse guide. - */ -#if defined(__aarch64__) - return 0; -#else - return 1; -#endif -} - int main(int argc, char **argv) { int nr_tests = 16; @@ -288,7 +271,7 @@ int main(int argc, char **argv) pagesize = getpagesize(); - if (system_has_softdirty()) + if (softdirty_supported()) nr_tests += 5; ksft_print_header(); @@ -300,7 +283,7 @@ int main(int argc, char **argv) test_holes(); test_populate_read(); test_populate_write(); - if (system_has_softdirty()) + if (softdirty_supported()) test_softdirty(); err = ksft_get_fail_cnt(); diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 8a3f2b4b2186..4ee4db3750c1 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -200,8 +200,11 @@ int main(int argc, char **argv) int pagesize; ksft_print_header(); - ksft_set_plan(15); + if (!softdirty_supported()) + ksft_exit_skip("soft-dirty is not support\n"); + + ksft_set_plan(15); pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); if (pagemap_fd < 0) ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 56e9bd541edd..e33cda301dad 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -449,6 +449,23 @@ bool check_vmflag_pfnmap(void *addr) return check_vmflag(addr, "pf"); } +bool softdirty_supported(void) +{ + char *addr; + bool supported = false; + const size_t pagesize = getpagesize(); + + /* New mappings are expected to be marked with VM_SOFTDIRTY (sd). */ + addr = mmap(0, pagesize, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); + if (!addr) + ksft_exit_fail_msg("mmap failed\n"); + + supported = check_vmflag(addr, "sd"); + munmap(addr, pagesize); + return supported; +} + /* * Open an fd at /proc/$pid/maps and configure procmap_out ready for * PROCMAP_QUERY query. Returns 0 on success, or an error code otherwise. 
diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 07c4acfd84b6..26c30fdc0241 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -104,6 +104,7 @@ bool find_vma_procmap(struct procmap_fd *procmap, void *address); int close_procmap(struct procmap_fd *procmap); int write_sysfs(const char *file_path, unsigned long val); int read_sysfs(const char *file_path, unsigned long *val); +bool softdirty_supported(void); static inline int open_self_procmap(struct procmap_fd *procmap_out) { From 20571b187051e5b78b48b99c9bdd425c94b29e18 Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Wed, 17 Sep 2025 14:56:53 +0200 Subject: [PATCH 349/372] kho: move sanity checks to kho_restore_page() While KHO exposes folio as the primitive externally, internally its restoration machinery operates on pages. This can be seen with kho_restore_folio() for example. It performs some sanity checks and hands it over to kho_restore_page() to do the heavy lifting of page restoration. After the work done by kho_restore_page(), kho_restore_folio() only converts the head page to folio and returns it. Similarly, deserialize_bitmap() operates on the head page directly to store the order. Move the sanity checks for valid phys and order from the public-facing kho_restore_folio() to the private-facing kho_restore_page(). This makes the boundary between page and folio clearer from KHO's perspective. While at it, drop the comment above kho_restore_page(). The comment is misleading now. The function stopped looking like free_reserved_page() since 12b9a2c05d1b4 ("kho: initialize tail pages for higher order folios properly"), and now looks even more different. Link: https://lkml.kernel.org/r/20250917125725.665-1-pratyush@kernel.org Signed-off-by: Pratyush Yadav Reviewed-by: Mike Rapoport (Microsoft) Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Chris Li Cc: Jason Gunthorpe Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 8079fc4b9189..c006a7544664 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -183,10 +183,18 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, return 0; } -/* almost as free_reserved_page(), just don't free the page */ -static void kho_restore_page(struct page *page, unsigned int order) +static struct page *kho_restore_page(phys_addr_t phys) { - unsigned int nr_pages = (1 << order); + struct page *page = pfn_to_online_page(PHYS_PFN(phys)); + unsigned int nr_pages, order; + + if (!page) + return NULL; + + order = page->private; + if (order > MAX_PAGE_ORDER) + return NULL; + nr_pages = (1 << order); /* Head page gets refcount of 1. */ set_page_count(page, 1); @@ -199,6 +207,7 @@ static void kho_restore_page(struct page *page, unsigned int order) prep_compound_page(page, order); adjust_managed_page_count(page, nr_pages); + return page; } /** @@ -209,18 +218,9 @@ static void kho_restore_page(struct page *page, unsigned int order) */ struct folio *kho_restore_folio(phys_addr_t phys) { - struct page *page = pfn_to_online_page(PHYS_PFN(phys)); - unsigned long order; + struct page *page = kho_restore_page(phys); - if (!page) - return NULL; - - order = page->private; - if (order > MAX_PAGE_ORDER) - return NULL; - - kho_restore_page(page, order); - return page_folio(page); + return page ? 
page_folio(page) : NULL; } EXPORT_SYMBOL_GPL(kho_restore_folio); From 89a3ecca49ee889cc1ab4def6caa0452df196efb Mon Sep 17 00:00:00 2001 From: Pratyush Yadav Date: Wed, 17 Sep 2025 14:56:54 +0200 Subject: [PATCH 350/372] kho: make sure page being restored is actually from KHO When restoring a page, no sanity checks are done to make sure the page actually came from a kexec handover. The caller is trusted to pass in the right address. If the caller has a bug and passes in a wrong address, an in-use page might be "restored" and returned, causing all sorts of memory corruption. Harden the page restore logic by stashing in a magic number in page->private along with the order. If the magic number does not match, the page won't be touched. page->private is an unsigned long. The union kho_page_info splits it into two parts, with one holding the order and the other holding the magic number. Link: https://lkml.kernel.org/r/20250917125725.665-2-pratyush@kernel.org Signed-off-by: Pratyush Yadav Cc: Alexander Graf Cc: Baoquan He Cc: Changyuan Lyu Cc: Chris Li Cc: Jason Gunthorpe Cc: Mike Rapoport (Microsoft) Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- kernel/kexec_handover.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index c006a7544664..555488eb1a18 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -32,6 +32,22 @@ #define PROP_PRESERVED_MEMORY_MAP "preserved-memory-map" #define PROP_SUB_FDT "fdt" +#define KHO_PAGE_MAGIC 0x4b484f50U /* ASCII for 'KHOP' */ + +/* + * KHO uses page->private, which is an unsigned long, to store page metadata. + * Use it to store both the magic and the order. + */ +union kho_page_info { + unsigned long page_private; + struct { + unsigned int order; + unsigned int magic; + }; +}; + +static_assert(sizeof(union kho_page_info) == sizeof(((struct page *)0)->private)); + static bool kho_enable __ro_after_init; bool kho_is_enabled(void) @@ -186,16 +202,24 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, static struct page *kho_restore_page(phys_addr_t phys) { struct page *page = pfn_to_online_page(PHYS_PFN(phys)); - unsigned int nr_pages, order; + union kho_page_info info; + unsigned int nr_pages; if (!page) return NULL; - order = page->private; - if (order > MAX_PAGE_ORDER) + info.page_private = page->private; + /* + * deserialize_bitmap() only sets the magic on the head page. This magic + * check also implicitly makes sure phys is order-aligned since for + * non-order-aligned phys addresses, magic will never be set. + */ + if (WARN_ON_ONCE(info.magic != KHO_PAGE_MAGIC || info.order > MAX_PAGE_ORDER)) return NULL; - nr_pages = (1 << order); + nr_pages = (1 << info.order); + /* Clear private to make sure later restores on this page error out. */ + page->private = 0; /* Head page gets refcount of 1. 
*/ set_page_count(page, 1); @@ -203,8 +227,8 @@ static struct page *kho_restore_page(phys_addr_t phys) for (unsigned int i = 1; i < nr_pages; i++) set_page_count(page + i, 0); - if (order > 0) - prep_compound_page(page, order); + if (info.order > 0) + prep_compound_page(page, info.order); adjust_managed_page_count(page, nr_pages); return page; @@ -341,10 +365,13 @@ static void __init deserialize_bitmap(unsigned int order, phys_addr_t phys = elm->phys_start + (bit << (order + PAGE_SHIFT)); struct page *page = phys_to_page(phys); + union kho_page_info info; memblock_reserve(phys, sz); memblock_reserved_mark_noinit(phys, sz); - page->private = order; + info.magic = KHO_PAGE_MAGIC; + info.order = order; + page->private = info.page_private; } }

From 89e688edcffee7858aa394fd107df98bfd7647a4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 10 Sep 2025 09:22:40 +0000 Subject: [PATCH 351/372] mm/compaction: fix low_pfn advance on isolating hugetlb

Commit 56ae0bb349b4 ("mm: compaction: convert to use a folio in isolate_migratepages_block()") converts the API from page to folio. But the low_pfn advance for a hugetlb page is wrong when low_pfn doesn't point to the head page.

Originally, if the page is a hugetlb tail page, compound_nr() returns 1, which means low_pfn only advances by one in the next iteration. After the change, low_pfn can advance past the hugetlb range, since folio_nr_pages() always returns the total number of pages in the large folio. This results in some ranges being skipped for isolation and migration.

The worst case for alloc_contig is that it does all the isolation and migration, only to find that some range is still not isolated, and then has to undo all the work and try a new range.

Advance low_pfn to the end of the hugetlb range instead.

Link: https://lkml.kernel.org/r/20250910092240.3981-1-richard.weiyang@gmail.com Fixes: 56ae0bb349b4 ("mm: compaction: convert to use a folio in isolate_migratepages_block()") Signed-off-by: Wei Yang Acked-by: Zi Yan Cc: "Vishal Moola (Oracle)" Cc: Kefeng Wang Cc: Oscar Salvador Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index bf021b31c7ec..1e8f8eca318c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -989,7 +989,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * Hugepage was successfully isolated and placed * on the cc->migratepages list. */ - low_pfn += folio_nr_pages(folio) - 1; + low_pfn += folio_nr_pages(folio) - folio_page_idx(folio, page) - 1; goto isolate_success_no_list; }

From 2db579838296239545554443234fafb8f485cca0 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Tue, 23 Sep 2025 12:07:06 +0100 Subject: [PATCH 352/372] mm/page_vma_mapped: track if the page is mapped across page table boundary

Patch series "mm: Improve mlock tracking for large folios", v3.

The patchset includes several fixes and improvements related to mlock tracking of large folios. The main objective is to reduce the undercount of Mlocked memory in /proc/meminfo and improve the accuracy of the statistics.

Patches 1-2: These patches address a minor race condition in folio_referenced_one() related to mlock_vma_folio(). Currently, mlock_vma_folio() is called on a large folio without the page table lock, which can result in a race condition with unmap (i.e. MADV_DONTNEED). This can lead to partially mapped folios on the unevictable LRU list.
While not a significant issue, I do not believe backporting is necessary.

Patch 3: This patch adds mlocking logic similar to folio_referenced_one() to try_to_unmap_one(), allowing for mlocking of large folios where possible.

Patches 4-5: These patches modify finish_fault() and faultaround to map in the entire folio when possible, enabling efficient mlocking upon addition to the rmap.

Patch 6: This patch makes rmap mlock large folios if they are fully mapped, addressing the primary source of mlock undercount for large folios.

This patch (of 6):

Add a PVMW_PGTABLE_CROSSSED flag that page_vma_mapped_walk() will set if the page is mapped across a page table boundary. Unlike other PVMW_* flags, this one is a result of page_vma_mapped_walk() and is not set by the caller. folio_referenced_one() will use it to detect if it is safe to mlock the folio.

[akpm@linux-foundation.org: s/CROSSSED/CROSSED/] Link: https://lkml.kernel.org/r/20250923110711.690639-1-kirill@shutemov.name Link: https://lkml.kernel.org/r/20250923110711.690639-2-kirill@shutemov.name Signed-off-by: Kiryl Shutsemau Reviewed-by: Shakeel Butt Cc: Baolin Wang Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/rmap.h | 5 +++++ mm/page_vma_mapped.c | 1 + 2 files changed, 6 insertions(+) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index e8aff6d2deda..daa92a58585d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -922,6 +922,11 @@ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, /* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) +/* Result flags */ + +/* The page is mapped across a page table boundary */ +#define PVMW_PGTABLE_CROSSED (1 << 16) + struct page_vma_mapped_walk { unsigned long pfn; unsigned long nr_pages; diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index e981a1a292d2..c498a91b6706 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -309,6 +309,7 @@ next_pte: } pte_unmap(pvmw->pte); pvmw->pte = NULL; + pvmw->flags |= PVMW_PGTABLE_CROSSED; goto restart; } pvmw->pte++;

From a2880202767daded2898f62265f6cdf4cfb53bc4 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Tue, 23 Sep 2025 12:07:07 +0100 Subject: [PATCH 353/372] mm/rmap: fix a mlock race condition in folio_referenced_one()

The mlock_vma_folio() function requires the page table lock to be held in order to safely mlock the folio. However, folio_referenced_one() mlocks large folios outside of the page_vma_mapped_walk() loop, where the page table lock has already been dropped.

Rework the mlock logic to use the same code path inside the loop for both large and small folios.

Use PVMW_PGTABLE_CROSSED to detect when the folio is mapped across a page table boundary.
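The split between caller flags and result flags can be illustrated with a minimal userspace sketch. This is not kernel code: the PVMW_* values mirror the patch, but the walk itself is simulated; only the bit-layout convention is real.

#include <stdio.h>

/* Caller-supplied flags live in the low bits... */
#define PVMW_SYNC            (1 << 0)
#define PVMW_MIGRATION       (1 << 1)
/* ...while result flags, set by the walk itself, start at bit 16. */
#define PVMW_PGTABLE_CROSSED (1 << 16)

struct walk_state {
	unsigned long addr;      /* first mapped address */
	unsigned long nr_pages;  /* pages of the folio */
	unsigned int flags;      /* in: caller flags; out: result flags added */
};

/* Simulated walk: report when the mapping spans a 2 MiB page-table unit. */
static void walk(struct walk_state *w)
{
	unsigned long end = w->addr + w->nr_pages * 4096 - 1;

	if ((w->addr >> 21) != (end >> 21))
		w->flags |= PVMW_PGTABLE_CROSSED;
}

int main(void)
{
	struct walk_state w = { .addr = 0x1ff000, .nr_pages = 4, .flags = PVMW_SYNC };

	walk(&w);
	printf("crossed: %s\n", (w.flags & PVMW_PGTABLE_CROSSED) ? "yes" : "no");
	return 0;
}

Keeping result bits at 16 and above leaves the low bits free for future caller flags.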
[akpm@linux-foundation.org: s/CROSSSED/CROSSED/] Link: https://lkml.kernel.org/r/20250923110711.690639-3-kirill@shutemov.name Signed-off-by: Kiryl Shutsemau Reviewed-by: Shakeel Butt Cc: Baolin Wang Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/rmap.c | 59 ++++++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 38 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 34333ae3bd80..d174168b8f93 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -850,34 +850,34 @@ static bool folio_referenced_one(struct folio *folio, { struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); - int referenced = 0; - unsigned long start = address, ptes = 0; + int ptes = 0, referenced = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; if (vma->vm_flags & VM_LOCKED) { - if (!folio_test_large(folio) || !pvmw.pte) { - /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma); - page_vma_mapped_walk_done(&pvmw); - pra->vm_flags |= VM_LOCKED; - return false; /* To break the loop */ - } - /* - * For large folio fully mapped to VMA, will - * be handled after the pvmw loop. - * - * For large folio cross VMA boundaries, it's - * expected to be picked by page reclaim. But - * should skip reference of pages which are in - * the range of VM_LOCKED vma. As page reclaim - * should just count the reference of pages out - * the range of VM_LOCKED vma. - */ ptes++; pra->mapcount--; - continue; + + /* Only mlock fully mapped pages */ + if (pvmw.pte && ptes != pvmw.nr_pages) + continue; + + /* + * All PTEs must be protected by the page table lock in + * order to mlock the page. + * + * If a page table boundary has been crossed, the current + * ptl only protects part of the ptes. + */ + if (pvmw.flags & PVMW_PGTABLE_CROSSED) + continue; + + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma); + page_vma_mapped_walk_done(&pvmw); + pra->vm_flags |= VM_LOCKED; + return false; /* To break the loop */ } /* @@ -913,23 +913,6 @@ static bool folio_referenced_one(struct folio *folio, pra->mapcount--; } - if ((vma->vm_flags & VM_LOCKED) && - folio_test_large(folio) && - folio_within_vma(folio, vma)) { - unsigned long s_align, e_align; - - s_align = ALIGN_DOWN(start, PMD_SIZE); - e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE); - - /* folio doesn't cross page table boundary and fully mapped */ - if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) { - /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma); - pra->vm_flags |= VM_LOCKED; - return false; /* To break the loop */ - } - } - if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio))

From 8c49fbafedf15149069cdb9e0d543c4a68a1c683 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Tue, 23 Sep 2025 12:07:08 +0100 Subject: [PATCH 354/372] mm/rmap: mlock large folios in try_to_unmap_one()

Currently, try_to_unmap_one() only tries to mlock small folios.

Use logic similar to folio_referenced_one() to mlock large folios: only do this for fully mapped folios and under a page table lock that protects all page table entries.
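A toy model of that counting logic, assuming a 4-page folio; the array stands in for the PTE walk and the boundary flag for PVMW_PGTABLE_CROSSED (illustrative userspace C, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define NR_PAGES 4

/* One entry per page of the folio: is the page mapped in this VMA? */
static const bool mapped[NR_PAGES] = { true, true, true, true };

/*
 * Walk the mappings and count them; the mlock decision is deferred until
 * the count proves the folio is fully mapped under a single walk.
 */
static bool should_mlock(bool crossed_pgtable_boundary)
{
	int ptes = 0;

	for (int i = 0; i < NR_PAGES; i++) {
		if (!mapped[i])
			return false;		/* a hole: never fully mapped */
		ptes++;
		if (ptes != NR_PAGES)
			continue;		/* keep walking, decide later */
		/* Fully mapped, but every PTE must sit under one lock. */
		return !crossed_pgtable_boundary;
	}
	return false;
}

int main(void)
{
	printf("same page table:  %d\n", should_mlock(false));
	printf("boundary crossed: %d\n", should_mlock(true));
	return 0;
}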
[akpm@linux-foundation.org: s/CROSSSED/CROSSED/] Link: https://lkml.kernel.org/r/20250923110711.690639-4-kirill@shutemov.name Signed-off-by: Kiryl Shutsemau Reviewed-by: Shakeel Butt Cc: Baolin Wang Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/rmap.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index d174168b8f93..92eeb3866494 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1870,6 +1870,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, unsigned long nr_pages = 1, end_addr; unsigned long pfn; unsigned long hsz = 0; + int ptes = 0; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1910,10 +1911,34 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + ptes++; + + /* + * Set 'ret' to indicate the page cannot be unmapped. + * + * Do not jump to walk_abort immediately, as an additional + * iteration might be required to detect a fully mapped + * folio and mlock it. + */ + ret = false; + + /* Only mlock fully mapped pages */ + if (pvmw.pte && ptes != pvmw.nr_pages) + continue; + + /* + * All PTEs must be protected by the page table lock in + * order to mlock the page. + * + * If a page table boundary has been crossed, the current + * ptl only protects part of the ptes. + */ + if (pvmw.flags & PVMW_PGTABLE_CROSSED) + goto walk_done; + /* Restore the mlock which got missed */ - if (!folio_test_large(folio)) - mlock_vma_folio(folio, vma); - goto walk_abort; + mlock_vma_folio(folio, vma); + goto walk_done; } if (!pvmw.pte) {

From 19773df031bcc67d5caa06bf0ddbbff40174be7a Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Tue, 23 Sep 2025 12:07:09 +0100 Subject: [PATCH 355/372] mm/fault: try to map the entire file folio in finish_fault()

finish_fault() uses per-page faults for file folios. This only occurs for file folios smaller than PMD_SIZE.

The comment suggests that this approach prevents RSS inflation. However, it only prevents RSS accounting. The folio is still mapped to the process, and the fact that it is mapped by a single PTE does not affect memory pressure. Additionally, the kernel's ability to map large folios as PMD if they are large enough undermines this argument.

When possible, map large folios in one shot. This reduces the number of minor page faults and allows for TLB coalescing.

Mapping large folios at once will allow the rmap code to mlock them on add, as it will recognize that they are fully mapped and mlocking is safe.

Link: https://lkml.kernel.org/r/20250923110711.690639-5-kirill@shutemov.name Signed-off-by: Kiryl Shutsemau Reviewed-by: Shakeel Butt Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/memory.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 17cebb97beae..74b45e258323 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5516,13 +5516,8 @@ fallback: nr_pages = folio_nr_pages(folio); - /* - * Using per-page fault to maintain the uffd semantics, and same - * approach also applies to non shmem/tmpfs faults to avoid - * inflating the RSS of the process.
- */ - if (!vma_is_shmem(vma) || unlikely(userfaultfd_armed(vma)) || - unlikely(needs_fallback)) { + /* Using per-page fault to maintain the uffd semantics */ + if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) { nr_pages = 1; } else if (nr_pages > 1) { pgoff_t idx = folio_page_idx(folio, page); From 357b92761d942432c90aeeb965f9eb0c94466921 Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Tue, 23 Sep 2025 12:07:10 +0100 Subject: [PATCH 356/372] mm/filemap: map entire large folio faultaround Currently, kernel only maps part of large folio that fits into start_pgoff/end_pgoff range. Map entire folio where possible. It will match finish_fault() behaviour that user hits on cold page cache. Mapping large folios at once will allow the rmap code to mlock it on add, as it will recognize that it is fully mapped and mlocking is safe. Link: https://lkml.kernel.org/r/20250923110711.690639-6-kirill@shutemov.name Signed-off-by: Kiryl Shutsemau Cc: Baolin Wang Cc: David Hildenbrand Cc: Johannes Weiner Cc: Lorenzo Stoakes Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/filemap.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index 2a05b1fdd445..a52dd38d2b4a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3670,6 +3670,21 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct page *page = folio_page(folio, start); unsigned int count = 0; pte_t *old_ptep = vmf->pte; + unsigned long addr0; + + /* + * Map the large folio fully where possible. + * + * The folio must not cross VMA or page table boundary. + */ + addr0 = addr - start * PAGE_SIZE; + if (folio_within_vma(folio, vmf->vma) && + (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) { + vmf->pte -= start; + page -= start; + addr = addr0; + nr_pages = folio_nr_pages(folio); + } do { if (PageHWPoison(page + count)) From ab521b4142aa41fdf74efc20e2b1806f35dbc64b Mon Sep 17 00:00:00 2001 From: Kiryl Shutsemau Date: Tue, 23 Sep 2025 12:07:11 +0100 Subject: [PATCH 357/372] mm/rmap: improve mlock tracking for large folios The kernel currently does not mlock large folios when adding them to rmap, stating that it is difficult to confirm that the folio is fully mapped and safe to mlock it. This leads to a significant undercount of Mlocked in /proc/meminfo, causing problems in production where the stat was used to estimate system utilization and determine if load shedding is required. However, nowadays the caller passes a number of pages of the folio that are getting mapped, making it easy to check if the entire folio is mapped to the VMA. mlock the folio on rmap if it is fully mapped to the VMA. Mlocked in /proc/meminfo can still undercount, but the value is closer the truth and is useful for userspace. Link: https://lkml.kernel.org/r/20250923110711.690639-7-kirill@shutemov.name Signed-off-by: Kiryl Shutsemau Acked-by: David Hildenbrand Acked-by: Johannes Weiner Acked-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Baolin Wang Signed-off-by: Andrew Morton --- mm/rmap.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 92eeb3866494..ac4f783d6ec2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1463,12 +1463,12 @@ static __always_inline void __folio_add_anon_rmap(struct folio *folio, } /* - * For large folio, only mlock it if it's fully mapped to VMA. It's - * not easy to check whether the large folio is fully mapped to VMA - * here. 
Only mlock normal 4K folio and leave page reclaim to handle - * large folio. + * Only mlock it if the folio is fully mapped to the VMA. + * + * Partially mapped folios can be split on reclaim and part outside + * of mlocked VMA can be evicted or freed. */ - if (!folio_test_large(folio)) + if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } @@ -1601,8 +1601,13 @@ static __always_inline void __folio_add_file_rmap(struct folio *folio, __folio_add_rmap(folio, page, nr_pages, vma, level); - /* See comments in folio_add_anon_rmap_*() */ - if (!folio_test_large(folio)) + /* + * Only mlock it if the folio is fully mapped to the VMA. + * + * Partially mapped folios can be split on reclaim and part outside + * of mlocked VMA can be evicted or freed. + */ + if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } From 51032f26cff7a5ab458d549d88b905ca5bf7a7f5 Mon Sep 17 00:00:00 2001 From: Manish Kumar Date: Thu, 18 Sep 2025 23:15:28 +0530 Subject: [PATCH 358/372] mm/memory_hotplug: fix typo 'esecially' -> 'especially' Link: https://lkml.kernel.org/r/20250918174528.90879-1-manish1588@gmail.com Signed-off-by: Manish Kumar Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 883b8e4d51ba..e9f14de4a9c9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -955,7 +955,7 @@ static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn * effectively unused by the kernel, yet they account to "present pages". * Fortunately, these allocations are comparatively small in relevant setups * (e.g., fraction of system memory). - * b) Some hotplugged memory blocks in virtualized environments, esecially + * b) Some hotplugged memory blocks in virtualized environments, especially * hotplugged by virtio-mem, look like they are completely present, however, * only parts of the memory block are actually currently usable. * "present pages" is an upper limit that can get reached at runtime. As From 4afb85f5e359e521ef20c0260af17a4490fc83f7 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Thu, 18 Sep 2025 15:04:53 +0530 Subject: [PATCH 359/372] mm: remove PMD alignment constraint in execmem_vmalloc() When using vmalloc with VM_ALLOW_HUGE_VMAP flag, it will set the alignment to PMD_SIZE internally, if it deems huge mappings to be eligible. Therefore, setting the alignment in execmem_vmalloc is redundant. Apart from this, it also reduces the probability of allocation in case vmalloc fails to allocate hugepages - in the fallback case, vmalloc tries to use the original alignment and allocate basepages, which unfortunately will again be PMD_SIZE passed over from execmem_vmalloc, thus constraining the search for a free space in vmalloc region. Therefore, remove this constraint. 
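A back-of-the-envelope illustration (plain userspace C, not kernel code) of why passing the stricter alignment down to the basepage fallback shrinks the search space: counting candidate placements for one 2 MiB allocation in a 1 GiB window at the two alignments.

#include <stdio.h>

/* Candidate placements for a block of `size` in [start, end) at alignment `align`. */
static unsigned long slots(unsigned long start, unsigned long end,
			   unsigned long size, unsigned long align)
{
	unsigned long first = (start + align - 1) & ~(align - 1);

	if (first + size > end)
		return 0;
	return (end - size - first) / align + 1;
}

int main(void)
{
	unsigned long start = 0, end = 1UL << 30;	/* a 1 GiB window */
	unsigned long size = 1UL << 21;			/* one 2 MiB allocation */

	printf("4 KiB-aligned candidates: %lu\n", slots(start, end, size, 1UL << 12));
	printf("2 MiB-aligned candidates: %lu\n", slots(start, end, size, 1UL << 21));
	return 0;
}

With page alignment there are hundreds of thousands of candidate positions; with PMD alignment only a few hundred, which is why dropping the redundant constraint helps the fallback succeed.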
Link: https://lkml.kernel.org/r/20250918093453.75676-1-dev.jain@arm.com Signed-off-by: Dev Jain Reviewed-by: Mike Rapoport (Microsoft) Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/execmem.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/execmem.c b/mm/execmem.c index 0822305413ec..810a4ba9c924 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -38,9 +38,6 @@ static void *execmem_vmalloc(struct execmem_range *range, size_t size, if (kasan) vm_flags |= VM_DEFER_KMEMLEAK; - if (vm_flags & VM_ALLOW_HUGE_VMAP) - align = PMD_SIZE; - p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot, vm_flags, NUMA_NO_NODE, __builtin_return_address(0)); From 0efdedfa537eb534c251a5b4794caaf72cc55869 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Thu, 18 Sep 2025 11:11:44 +0530 Subject: [PATCH 360/372] drivers/base/node: fix double free in register_one_node() When device_register() fails in register_node(), it calls put_device(&node->dev). This triggers node_device_release(), which calls kfree(to_node(dev)), thereby freeing the entire node structure. As a result, when register_node() returns an error, the node memory has already been freed. Calling kfree(node) again in register_one_node() leads to a double free. This patch removes the redundant kfree(node) from register_one_node() to prevent the double free. Link: https://lkml.kernel.org/r/20250918054144.58980-1-donettom@linux.ibm.com Fixes: 786eb990cfb7 ("drivers/base/node: handle error properly in register_one_node()") Signed-off-by: Donet Tom Acked-by: David Hildenbrand Acked-by: Oscar Salvador Cc: Alison Schofield Cc: Chris Mason Cc: Danilo Krummrich Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Hiroyouki Kamezawa Cc: Joanthan Cameron Cc: "Ritesh Harjani (IBM)" Cc: Yury Norov (NVIDIA) Cc: Zi Yan Signed-off-by: Andrew Morton --- drivers/base/node.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 45d512939c40..67b01d579737 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -887,7 +887,6 @@ int register_one_node(int nid) error = register_node(node_devices[nid], nid); if (error) { node_devices[nid] = NULL; - kfree(node); return error; } From 4d6fc29f36341d7795db1d1819b4c15fe9be7b23 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Wed, 24 Sep 2025 00:16:59 +0530 Subject: [PATCH 361/372] mm/ksm: fix incorrect KSM counter handling in mm_struct during fork Patch series "mm/ksm: Fix incorrect accounting of KSM counters during fork", v3. The first patch in this series fixes the incorrect accounting of KSM counters such as ksm_merging_pages, ksm_rmap_items, and the global ksm_zero_pages during fork. The following patch add a selftest to verify the ksm_merging_pages counter was updated correctly during fork. Test Results ============ Without the first patch ----------------------- # [RUN] test_fork_ksm_merging_page_count not ok 10 ksm_merging_page in child: 32 With the first patch -------------------- # [RUN] test_fork_ksm_merging_page_count ok 10 ksm_merging_pages is not inherited after fork This patch (of 2): Currently, the KSM-related counters in `mm_struct`, such as `ksm_merging_pages`, `ksm_rmap_items`, and `ksm_zero_pages`, are inherited by the child process during fork. This results in inconsistent accounting. When a process uses KSM, identical pages are merged and an rmap item is created for each merged page. The `ksm_merging_pages` and `ksm_rmap_items` counters are updated accordingly. 
However, after a fork, these counters are copied to the child while the corresponding rmap items are not. As a result, when the child later triggers an unmerge, there are no rmap items present in the child, so the counters remain stale, leading to incorrect accounting. A similar issue exists with `ksm_zero_pages`, which maintains both a global counter and a per-process counter. During fork, the per-process counter is inherited by the child, but the global counter is not incremented. Since the child also references zero pages, the global counter should be updated as well. Otherwise, during zero-page unmerge, both the global and per-process counters are decremented, causing the global counter to become inconsistent. To fix this, ksm_merging_pages and ksm_rmap_items are reset to 0 during fork, and the global ksm_zero_pages counter is updated with the per-process ksm_zero_pages value inherited by the child. This ensures that KSM statistics remain accurate and reflect the activity of each process correctly. Link: https://lkml.kernel.org/r/cover.1758648700.git.donettom@linux.ibm.com Link: https://lkml.kernel.org/r/7b9870eb67ccc0d79593940d9dbd4a0b39b5d396.1758648700.git.donettom@linux.ibm.com Fixes: 7609385337a4 ("ksm: count ksm merging pages for each process") Fixes: cb4df4cae4f2 ("ksm: count allocated ksm rmap_items for each process") Fixes: e2942062e01d ("ksm: count all zero pages placed by KSM") Signed-off-by: Donet Tom Reviewed-by: Chengming Zhou Acked-by: David Hildenbrand Cc: Aboorva Devarajan Cc: David Hildenbrand Cc: Donet Tom Cc: "Ritesh Harjani (IBM)" Cc: Wei Yang Cc: xu xin Cc: [6.6+] Signed-off-by: Andrew Morton --- include/linux/ksm.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 22e67ca7cba3..067538fc4d58 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -56,8 +56,14 @@ static inline long mm_ksm_zero_pages(struct mm_struct *mm) static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ - if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) + if (mm_flags_test(MMF_VM_MERGEABLE, oldmm)) { + long nr_ksm_zero_pages = atomic_long_read(&mm->ksm_zero_pages); + + mm->ksm_merging_pages = 0; + mm->ksm_rmap_items = 0; + atomic_long_add(nr_ksm_zero_pages, &ksm_zero_pages); __ksm_enter(mm); + } } static inline int ksm_execve(struct mm_struct *mm) From 08ff89b5659d89966fc845ee5040051d318b8d01 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Wed, 24 Sep 2025 00:17:00 +0530 Subject: [PATCH 362/372] selftests/mm: add fork inheritance test for ksm_merging_pages counter Add a new selftest to verify whether the `ksm_merging_pages` counter in `mm_struct` is not inherited by a child process after fork. This helps ensure correctness of KSM accounting across process creation. 
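A standalone userspace sketch of the same check, assuming a kernel with KSM enabled (/sys/kernel/mm/ksm/run set to 1) and the /proc/<pid>/ksm_stat interface; error handling is trimmed for brevity:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

/* Read ksm_merging_pages for the calling process. */
static long ksm_merging_pages(void)
{
	char line[256];
	long val = -1;
	FILE *f = fopen("/proc/self/ksm_stat", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "ksm_merging_pages %ld", &val) == 1)
			break;
	fclose(f);
	return val;
}

int main(void)
{
	const size_t size = 2 * 1024 * 1024;
	char *map = mmap(NULL, size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int status;
	pid_t pid;

	memset(map, 0xcf, size);		/* identical pages to merge */
	madvise(map, size, MADV_MERGEABLE);
	sleep(5);				/* give ksmd a chance to merge */

	pid = fork();
	if (pid == 0)				/* child: counter must be 0 now */
		exit(ksm_merging_pages() > 0);
	waitpid(pid, &status, 0);
	printf("parent: %ld merged, child inherited: %s\n", ksm_merging_pages(),
	       WEXITSTATUS(status) ? "yes (stale)" : "no (correct)");
	return 0;
}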
Link: https://lkml.kernel.org/r/e7bb17d374133bd31a3e423aa9e46e1122e74971.1758648700.git.donettom@linux.ibm.com Signed-off-by: Donet Tom Acked-by: David Hildenbrand Cc: Aboorva Devarajan Cc: Chengming Zhou Cc: "Ritesh Harjani (IBM)" Cc: Wei Yang Cc: xu xin Signed-off-by: Andrew Morton --- .../selftests/mm/ksm_functional_tests.c | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 712f43c87736..ac136f04b8d6 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -602,6 +602,46 @@ unmap: munmap(map, size); } +static void test_fork_ksm_merging_page_count(void) +{ + const unsigned int size = 2 * MiB; + char *map; + pid_t child_pid; + int status; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, KSM_MERGE_MADVISE); + if (map == MAP_FAILED) + return; + + child_pid = fork(); + if (!child_pid) { + init_global_file_handles(); + exit(ksm_get_self_merging_pages()); + } else if (child_pid < 0) { + ksft_test_result_fail("fork() failed\n"); + goto unmap; + } + + if (waitpid(child_pid, &status, 0) < 0) { + ksft_test_result_fail("waitpid() failed\n"); + goto unmap; + } + + status = WEXITSTATUS(status); + if (status) { + ksft_test_result_fail("ksm_merging_page in child: %d\n", status); + goto unmap; + } + + ksft_test_result_pass("ksm_merging_pages is not inherited after fork\n"); + +unmap: + ksm_stop(); + munmap(map, size); +} + static void init_global_file_handles(void) { mem_fd = open("/proc/self/mem", O_RDWR); @@ -620,7 +660,7 @@ static void init_global_file_handles(void) int main(int argc, char **argv) { - unsigned int tests = 8; + unsigned int tests = 9; int err; if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) { @@ -652,6 +692,7 @@ int main(int argc, char **argv) test_prctl_fork(); test_prctl_fork_exec(); test_prctl_unmerge(); + test_fork_ksm_merging_page_count(); err = ksft_get_fail_cnt(); if (err) From 3dfd02c900379d209ac9dcac24b4a61d8478842a Mon Sep 17 00:00:00 2001 From: Li Zhe Date: Fri, 19 Sep 2025 17:23:53 +0800 Subject: [PATCH 363/372] hugetlb: increase number of reserving hugepages via cmdline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 79359d6d24df ("hugetlb: perform vmemmap optimization on a list of pages") batches the submission of HugeTLB vmemmap optimization (HVO) during hugepage reservation. With HVO enabled, hugepages obtained from the buddy allocator are not submitted for optimization and their struct-page memory is therefore not released—until the entire reservation request has been satisfied. As a result, any struct-page memory freed in the course of the allocation cannot be reused for the ongoing reservation, artificially limiting the number of huge pages that can ultimately be provided. As commit b1222550fbf7 ("mm/hugetlb: do pre-HVO for bootmem allocated pages") already applies early HVO to bootmem-allocated huge pages, this patch extends the same benefit to non-bootmem pages by incrementally submitting them for HVO as they are allocated, thereby returning struct-page memory to the buddy allocator in real time. The change raises the maximum 2 MiB hugepage reservation from just under 376 GB to more than 381 GB on a 384 GB x86 VM. 
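The incremental-HVO idea can be modeled with a toy allocator in userspace C; the numbers are arbitrary and only illustrate how flushing the pending batch returns struct-page overhead that lets the reservation continue:

#include <stdio.h>

/*
 * Toy model: allocating a hugepage costs 10 units (the page plus its
 * struct-page overhead); submitting a pending page for HVO gives 9 back.
 */
static long available = 100;	/* arbitrary units of free memory */
static long pending;		/* allocated but not yet optimized */

static int alloc_hugepage(void)
{
	if (available < 10)
		return -1;
	available -= 10;
	pending++;
	return 0;
}

static void flush_pending_to_hvo(void)
{
	available += pending * 9;	/* struct-page memory returned */
	pending = 0;
}

int main(void)
{
	long reserved = 0;

	for (;;) {
		if (alloc_hugepage() == 0) {
			reserved++;
			continue;
		}
		if (!pending)
			break;			/* genuinely out of memory */
		flush_pending_to_hvo();		/* incremental HVO frees room */
	}
	printf("hugepages reserved: %ld\n", reserved);
	return 0;
}

Without the flush the loop stops at 10 reservations; with it, the freed overhead is recycled into further allocations, mirroring the 376 GB to 381 GB improvement above.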
Link: https://lkml.kernel.org/r/20250919092353.41671-1-lizhe.67@bytedance.com Signed-off-by: Li Zhe Cc: David Hildenbrand Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d2471a0b6002..1f65609cb724 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3538,7 +3538,14 @@ static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned l nodes_clear(node_alloc_noretry); for (i = 0; i < num; ++i) { - struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + struct folio *folio; + + if (hugetlb_vmemmap_optimizable_size(h) && + (si_mem_available() == 0) && !list_empty(&folio_list)) { + prep_and_add_allocated_folios(h, &folio_list); + INIT_LIST_HEAD(&folio_list); + } + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], &node_alloc_noretry, &next_node); if (!folio) break;

From 08498be43ee676d8a5eefb22278266322578a3e0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 19 Sep 2025 07:12:43 +0000 Subject: [PATCH 364/372] mm/ksm: get mm_slot by mm_slot_entry() when slot is !NULL

Patch series "mm_slot: fix the usage of mm_slot_entry", v2.

When using mm_slot in ksm, there is code like: slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ..) { }

In general, mm_slot_entry() won't return a valid value if slot is NULL. It currently works only because slot is the first element of struct ksm_mm_slot.

To reduce the ambiguity and make this robust, only call mm_slot_entry() when slot is !NULL.

Link: https://lkml.kernel.org/r/20250919071244.17020-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20250919071244.17020-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Hildenbrand Reviewed-by: Dev Jain Reviewed-by: Lance Yang Cc: Kiryl Shutsemau Cc: xu xin Signed-off-by: Andrew Morton --- mm/ksm.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 2dbe92e3dd52..04019a15b25d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2936,15 +2936,17 @@ void __ksm_exit(struct mm_struct *mm) spin_lock(&ksm_mmlist_lock); slot = mm_slot_lookup(mm_slots_hash, mm); - mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); - if (mm_slot && ksm_scan.mm_slot != mm_slot) { - if (!mm_slot->rmap_list) { - hash_del(&slot->hash); - list_del(&slot->mm_node); - easy_to_free = 1; - } else { - list_move(&slot->mm_node, - &ksm_scan.mm_slot->slot.mm_node); + if (slot) { + mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); + if (ksm_scan.mm_slot != mm_slot) { + if (!mm_slot->rmap_list) { + hash_del(&slot->hash); + list_del(&slot->mm_node); + easy_to_free = 1; + } else { + list_move(&slot->mm_node, + &ksm_scan.mm_slot->slot.mm_node); + } } } spin_unlock(&ksm_mmlist_lock);

From b4c9ffb54b3204dc8b4013c6410c897cff8cdd70 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 19 Sep 2025 07:12:44 +0000 Subject: [PATCH 365/372] mm/khugepaged: remove definition of struct khugepaged_mm_slot

The current code is not correct: it obtains a struct khugepaged_mm_slot from mm_slot_entry() without checking that the slot is !NULL. No problem has been reported, since slot is the first element of struct khugepaged_mm_slot.

Since struct khugepaged_mm_slot is just a wrapper around struct mm_slot, there is no need to define it at all. Remove the definition of struct khugepaged_mm_slot, so there is no chance to misuse mm_slot_entry().
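A minimal userspace sketch of the pattern, with a hypothetical wrapper type; it shows the corrected usage of checking the slot before converting it to the containing structure:

#include <stdio.h>
#include <stddef.h>

struct mm_slot { int hash; };

/* Hypothetical wrapper that adds nothing beyond the embedded slot. */
struct wrapper_slot { struct mm_slot slot; };

/* container_of-style conversion: only valid for a non-NULL pointer. */
#define slot_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct mm_slot *slot = NULL;	/* a failed lookup */

	/*
	 * Converting first and NULL-checking afterwards only "works" while
	 * the member sits at offset 0; check the slot before converting.
	 */
	if (slot) {
		struct wrapper_slot *w = slot_entry(slot, struct wrapper_slot, slot);

		printf("wrapper at %p\n", (void *)w);
	} else {
		printf("no slot: slot_entry() never called\n");
	}
	return 0;
}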
[richard.weiyang@gmail.com: fix use-after-free crash] Link: https://lkml.kernel.org/r/20250922002834.vz6ntj36e75ehkyp@master Link: https://lkml.kernel.org/r/20250919071244.17020-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Lance Yang Cc: David Hildenbrand Cc: Dev Jain Cc: Kiryl Shutsemau Cc: xu xin Signed-off-by: Andrew Morton --- mm/khugepaged.c | 58 ++++++++++++++++++------------------------------- 1 file changed, 21 insertions(+), 37 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9ed1af2b5c38..52786ffef80a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -103,14 +103,6 @@ struct collapse_control { nodemask_t alloc_nmask; }; -/** - * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned - * @slot: hash lookup from mm to mm_slot - */ -struct khugepaged_mm_slot { - struct mm_slot slot; -}; - /** * struct khugepaged_scan - cursor for scanning * @mm_head: the head of the mm list to scan @@ -121,7 +113,7 @@ struct khugepaged_mm_slot { */ struct khugepaged_scan { struct list_head mm_head; - struct khugepaged_mm_slot *mm_slot; + struct mm_slot *mm_slot; unsigned long address; }; @@ -384,7 +376,10 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { - mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0); + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), + 0, NULL); if (!mm_slot_cache) return -ENOMEM; @@ -438,7 +433,6 @@ static bool hugepage_pmd_enabled(void) void __khugepaged_enter(struct mm_struct *mm) { - struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; int wakeup; @@ -447,12 +441,10 @@ void __khugepaged_enter(struct mm_struct *mm) if (unlikely(mm_flags_test_and_set(MMF_VM_HUGEPAGE, mm))) return; - mm_slot = mm_slot_alloc(mm_slot_cache); - if (!mm_slot) + slot = mm_slot_alloc(mm_slot_cache); + if (!slot) return; - slot = &mm_slot->slot; - spin_lock(&khugepaged_mm_lock); mm_slot_insert(mm_slots_hash, mm, slot); /* @@ -480,14 +472,12 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, void __khugepaged_exit(struct mm_struct *mm) { - struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); - mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); - if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + if (slot && khugepaged_scan.mm_slot != slot) { hash_del(&slot->hash); list_del(&slot->mm_node); free = 1; @@ -496,9 +486,9 @@ void __khugepaged_exit(struct mm_struct *mm) if (free) { mm_flags_clear(MMF_VM_HUGEPAGE, mm); - mm_slot_free(mm_slot_cache, mm_slot); + mm_slot_free(mm_slot_cache, slot); mmdrop(mm); - } else if (mm_slot) { + } else if (slot) { /* * This is required to serialize against * hpage_collapse_test_exit() (which is guaranteed to run @@ -1432,9 +1422,8 @@ out: return result; } -static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) +static void collect_mm_slot(struct mm_slot *slot) { - struct mm_slot *slot = &mm_slot->slot; struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); @@ -1451,7 +1440,7 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) */ /* khugepaged_mm_lock actually not necessary for the below */ - mm_slot_free(mm_slot_cache, mm_slot); + mm_slot_free(mm_slot_cache, slot); mmdrop(mm); } } @@ -2394,7 +2383,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; - struct 
khugepaged_mm_slot *mm_slot; struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; @@ -2405,14 +2393,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) { - mm_slot = khugepaged_scan.mm_slot; - slot = &mm_slot->slot; + slot = khugepaged_scan.mm_slot; } else { slot = list_first_entry(&khugepaged_scan.mm_head, struct mm_slot, mm_node); - mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; - khugepaged_scan.mm_slot = mm_slot; + khugepaged_scan.mm_slot = slot; } spin_unlock(&khugepaged_mm_lock); @@ -2510,7 +2496,7 @@ breakouterloop: breakouterloop_mmap_lock: spin_lock(&khugepaged_mm_lock); - VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); + VM_BUG_ON(khugepaged_scan.mm_slot != slot); /* * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. @@ -2522,16 +2508,14 @@ breakouterloop_mmap_lock: * mm_slot not pointing to the exiting mm. */ if (!list_is_last(&slot->mm_node, &khugepaged_scan.mm_head)) { - slot = list_next_entry(slot, mm_node); - khugepaged_scan.mm_slot = - mm_slot_entry(slot, struct khugepaged_mm_slot, slot); + khugepaged_scan.mm_slot = list_next_entry(slot, mm_node); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; khugepaged_full_scans++; } - collect_mm_slot(mm_slot); + collect_mm_slot(slot); } return progress; @@ -2618,7 +2602,7 @@ static void khugepaged_wait_work(void) static int khugepaged(void *none) { - struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; set_freezable(); set_user_nice(current, MAX_NICE); @@ -2629,10 +2613,10 @@ static int khugepaged(void *none) } spin_lock(&khugepaged_mm_lock); - mm_slot = khugepaged_scan.mm_slot; + slot = khugepaged_scan.mm_slot; khugepaged_scan.mm_slot = NULL; - if (mm_slot) - collect_mm_slot(mm_slot); + if (slot) + collect_mm_slot(slot); spin_unlock(&khugepaged_mm_lock); return 0; } From cde31ecdd1aa1cc495bdf6d5cba84adc276d8861 Mon Sep 17 00:00:00 2001 From: Xie Yuanbin Date: Mon, 22 Sep 2025 22:36:18 +0800 Subject: [PATCH 366/372] mm/memory-failure: don't select MEMORY_ISOLATION We added that "select MEMORY_ISOLATION" in commit ee6f509c3274 ("mm: factor out memory isolate functions"). However, in commit add05cecef80 ("mm: soft-offline: don't free target page in successful page migration") we remove the need for it, where we removed the calls to set_migratetype_isolate() etc. What CONFIG_MEMORY_FAILURE soft-offline support wants is migrate_pages() support. But that comes with CONFIG_MIGRATION. And isolate_folio_to_list() has nothing to do with CONFIG_MEMORY_ISOLATION. Therefore, we can remove "select MEMORY_ISOLATION" of MEMORY_FAILURE. 
Link: https://lkml.kernel.org/r/20250922143618.48640-1-xieyuanbin1@huawei.com Signed-off-by: Xie Yuanbin Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Acked-by: Miaohe Lin Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Mike Rapoport Cc: Naoya Horiguchi Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index d1ed839ca710..bde9f842a4a8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -748,7 +748,6 @@ config MEMORY_FAILURE depends on MMU depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" - select MEMORY_ISOLATION select RAS help Enables code to recover from some memory failures on systems

From 989c2f55ca4839121cbf23b5802f8513dbd54e1e Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Fri, 26 Sep 2025 17:24:26 +0800 Subject: [PATCH 367/372] mm: silence data-race in update_hiwater_rss

KCSAN reports a data race on mm->hiwater_rss, which can be accessed concurrently from various paths, such as page migration and memory unmapping, without synchronization.

Since hiwater_rss is a statistical field for accounting purposes, this data race is benign. Annotate both the read and write accesses with data_race() to make KCSAN happy.

Link: https://lkml.kernel.org/r/20250926092426.43312-1-lance.yang@linux.dev Signed-off-by: Lance Yang Reported-by: syzbot+60192c8877d0bc92a92b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-mm/68d6364e.050a0220.3390a8.000d.GAE@google.com Acked-by: Vlastimil Babka Cc: David Hildenbrand Cc: Jann Horn Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Marco Elver Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index fcb1e72eea40..06978b4dbeb8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2742,7 +2742,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm) unsigned long _rss = get_mm_rss(mm); if (data_race(mm->hiwater_rss) < _rss) - (mm)->hiwater_rss = _rss; + data_race(mm->hiwater_rss = _rss); } static inline void update_hiwater_vm(struct mm_struct *mm)

From fb552b2425cf8f16c9c72229a972d1744b24d855 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Fri, 26 Sep 2025 08:06:59 +0000 Subject: [PATCH 368/372] alloc_tag: fix boot failure due to NULL pointer dereference

There is a boot failure when both CONFIG_DEBUG_KMEMLEAK and CONFIG_MEM_ALLOC_PROFILING are enabled.

BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: 0010:__alloc_tagging_slab_alloc_hook+0x181/0x2f0 Call Trace: kmem_cache_alloc_noprof+0x1c8/0x5c0 __alloc_object+0x2f/0x290 __create_object+0x22/0x80 kmemleak_init+0x122/0x190 mm_core_init+0xb6/0x160 start_kernel+0x39f/0x920 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0x104/0x120 common_startup_64+0x12c/0x138

In kmemleak, mem_pool_alloc() directly calls kmem_cache_alloc_noprof(); as a result, current->alloc_tag is NULL, leading to a NULL pointer dereference.

Move the checks for SLAB_NO_OBJ_EXT, SLAB_NOLEAKTRACE, and __GFP_NO_OBJ_EXT to the parent function __alloc_tagging_slab_alloc_hook() to fix this. This also distinguishes the SLAB_NOLEAKTRACE case from actual memory allocation failures, making CODETAG_FLAG_INACCURATE more accurate.
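The structure of the fix can be sketched in plain C: all of the benign skip conditions move into the outermost hook, so the inner helper only runs when real preparation work is wanted (names here are illustrative, not the kernel's):

#include <stdio.h>

#define SLAB_NOLEAKTRACE (1 << 0)
#define GFP_NO_OBJ_EXT   (1 << 1)

/* Inner helper: may assume all skip conditions were filtered out already. */
static int prepare_obj_ext(void *object)
{
	printf("preparing extension for %p\n", object);
	return 0;	/* 0 on success; stands in for the real work */
}

/*
 * Outer hook: every "silently skip" condition is checked up front, before
 * any state is touched, so a deliberate skip is never confused with an
 * allocation failure that must be flagged.
 */
static void alloc_hook(unsigned int slab_flags, unsigned int gfp, void *object)
{
	if (!object)
		return;				/* allocation failed upstream */
	if (slab_flags & SLAB_NOLEAKTRACE)
		return;				/* deliberate skip, not a failure */
	if (gfp & GFP_NO_OBJ_EXT)
		return;
	if (prepare_obj_ext(object))
		printf("marking counters inaccurate\n");
}

int main(void)
{
	int obj;

	alloc_hook(SLAB_NOLEAKTRACE, 0, &obj);	/* skipped silently */
	alloc_hook(0, 0, &obj);			/* processed */
	return 0;
}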
Link: https://lkml.kernel.org/r/20250926080659.741991-1-ranxiaokai627@163.com Fixes: b9e2f58ffb84 ("alloc_tag: mark inaccurate allocation counters in /proc/allocinfo output") Signed-off-by: Ran Xiaokai Reviewed-by: Harry Yoo Acked-by: Vlastimil Babka Reviewed-by: Suren Baghdasaryan Cc: Christoph Lameter (Ampere) Cc: David Rientjes Cc: Johannes Weiner Cc: Roman Gushchin Cc: Shakeel Butt Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/slub.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 9c04f29ee8de..9d73cca9f1de 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2109,15 +2109,6 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) { struct slab *slab; - if (!p) - return NULL; - - if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) - return NULL; - - if (flags & __GFP_NO_OBJ_EXT) - return NULL; - slab = virt_to_slab(p); if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { @@ -2135,6 +2126,15 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { struct slabobj_ext *obj_exts; + if (!object) + return; + + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return; + + if (flags & __GFP_NO_OBJ_EXT) + return; + obj_exts = prepare_slab_obj_exts_hook(s, flags, object); /* * Currently obj_exts is used only for allocation profiling. From dd83609b88986f4add37c0871c3434310652ebd5 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Fri, 26 Sep 2025 09:02:54 +0530 Subject: [PATCH 369/372] hugetlbfs: skip VMAs without shareable locks in hugetlb_vmdelete_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hugetlb_vmdelete_list() uses trylock to acquire VMA locks during truncate operations. As per the original design in commit 40549ba8f8e0 ("hugetlb: use new vma_lock for pmd sharing synchronization"), if the trylock fails or the VMA has no lock, it should skip that VMA. Any remaining mapped pages are handled by remove_inode_hugepages() which is called after hugetlb_vmdelete_list() and uses proper lock ordering to guarantee unmapping success. Currently, when hugetlb_vma_trylock_write() returns success (1) for VMAs without shareable locks, the code proceeds to call unmap_hugepage_range(). This causes assertion failures in huge_pmd_unshare() → hugetlb_vma_assert_locked() because no lock is actually held: WARNING: CPU: 1 PID: 6594 Comm: syz.0.28 Not tainted Call Trace: hugetlb_vma_assert_locked+0x1dd/0x250 huge_pmd_unshare+0x2c8/0x540 __unmap_hugepage_range+0x6e3/0x1aa0 unmap_hugepage_range+0x32e/0x410 hugetlb_vmdelete_list+0x189/0x1f0 Fix by using goto to ensure locks acquired by trylock are always released, even when skipping VMAs without shareable locks. 
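The lock-pairing pattern of the fix, sketched with pthreads in userspace C (illustrative only): every successful trylock reaches exactly one unlock, even on the skip path.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct vma_like {
	pthread_mutex_t lock;
	bool shareable;
};

/* Every trylock that succeeds is matched by exactly one unlock, even when
 * the VMA is skipped for reasons discovered after the lock is taken. */
static void process(struct vma_like *v)
{
	if (pthread_mutex_trylock(&v->lock) != 0)
		return;			/* lock not taken: plain skip */

	if (!v->shareable)
		goto skip;		/* lock taken: must reach the unlock */

	printf("unmapping under lock\n");
skip:
	pthread_mutex_unlock(&v->lock);
}

int main(void)
{
	struct vma_like v = { PTHREAD_MUTEX_INITIALIZER, false };

	process(&v);			/* skipped, but still unlocked */
	v.shareable = true;
	process(&v);			/* processed */
	return 0;
}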
Link: https://lkml.kernel.org/r/20250926033255.10930-1-kartikey406@gmail.com Fixes: 40549ba8f8e0 ("hugetlb: use new vma_lock for pmd sharing synchronization") Signed-off-by: Deepanshu Kartikey Reported-by: syzbot+f26d7c75c26ec19790e7@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f26d7c75c26ec19790e7 Suggested-by: Andrew Morton Cc: David Hildenbrand Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3cfdf4091001..94b4d854429e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -478,6 +478,14 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, if (!hugetlb_vma_trylock_write(vma)) continue; + /* + * Skip VMAs without shareable locks. Per the design in commit + * 40549ba8f8e0, these will be handled by remove_inode_hugepages() + * called after this function with proper locking. + */ + if (!__vma_shareable_lock(vma)) + goto skip; + v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); @@ -488,6 +496,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * vmas. Therefore, lock is not held when calling * unmap_hugepage_range for private vmas. */ +skip: hugetlb_vma_unlock_write(vma); } }

From 1acc369373008b9eeb930fbb47847c0693055553 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 22 Sep 2025 14:09:38 +0000 Subject: [PATCH 370/372] mm/khugepaged: use start_addr/addr for improved readability

When collapsing a pmd, there are two addresses in use: * address points to the start of the pmd * address points to each individual page

Current naming makes it difficult to distinguish these two and is hence error prone.

Considering the plan to collapse mTHP, name the first one `start_addr' and the second `addr' for better readability and consistency.
Link: https://lkml.kernel.org/r/20250922140938.27343-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Suggested-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Nico Pache Acked-by: David Hildenbrand Reviewed-by: Dev Jain Cc: Baolin Wang Cc: Barry Song Cc: Lance Yang Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/khugepaged.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 52786ffef80a..7ab2d1a42df3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -537,18 +537,19 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } static int __collapse_huge_page_isolate(struct vm_area_struct *vma, - unsigned long address, + unsigned long start_addr, pte_t *pte, struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; struct folio *folio = NULL; + unsigned long addr = start_addr; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, address += PAGE_SIZE) { + _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { @@ -571,7 +572,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PTE_UFFD_WP; goto out; } - page = vm_normal_page(vma, address, pteval); + page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out; @@ -656,8 +657,8 @@ next: */ if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || - folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, - address))) + folio_test_referenced(folio) || + mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } @@ -986,21 +987,21 @@ static int check_pmd_still_valid(struct mm_struct *mm, */ static int __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, + unsigned long start_addr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; - unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); + unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE); int result; pte_t *pte = NULL; spinlock_t *ptl; - for (address = haddr; address < end; address += PAGE_SIZE) { + for (addr = start_addr; addr < end; addr += PAGE_SIZE) { struct vm_fault vmf = { .vma = vma, - .address = address, - .pgoff = linear_page_index(vma, address), + .address = addr, + .pgoff = linear_page_index(vma, addr), .flags = FAULT_FLAG_ALLOW_RETRY, .pmd = pmd, }; @@ -1010,7 +1011,7 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, * Here the ptl is only used to check pte_same() in * do_swap_page(), so readonly version is enough. 
*/ - pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl); + pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); if (!pte) { mmap_read_unlock(mm); result = SCAN_PMD_NULL; @@ -1253,7 +1254,7 @@ out_nolock: static int hpage_collapse_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, bool *mmap_locked, + unsigned long start_addr, bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; @@ -1262,26 +1263,26 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, int none_or_zero = 0, shared = 0; struct page *page = NULL; struct folio *folio = NULL; - unsigned long _address; + unsigned long addr; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; - VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK); - result = find_pmd_or_thp_or_none(mm, address, &pmd); + result = find_pmd_or_thp_or_none(mm, start_addr, &pmd); if (result != SCAN_SUCCEED) goto out; memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - pte = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = pte_offset_map_lock(mm, pmd, start_addr, &ptl); if (!pte) { result = SCAN_PMD_NULL; goto out; } - for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; - _pte++, _address += PAGE_SIZE) { + for (addr = start_addr, _pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, addr += PAGE_SIZE) { pte_t pteval = ptep_get(_pte); if (is_swap_pte(pteval)) { ++unmapped; @@ -1329,7 +1330,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, goto out_unmap; } - page = vm_normal_page(vma, _address, pteval); + page = vm_normal_page(vma, addr, pteval); if (unlikely(!page) || unlikely(is_zone_device_page(page))) { result = SCAN_PAGE_NULL; goto out_unmap; @@ -1398,7 +1399,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, if (cc->is_khugepaged && (pte_young(pteval) || folio_test_young(folio) || folio_test_referenced(folio) || - mmu_notifier_test_young(vma->vm_mm, _address))) + mmu_notifier_test_young(vma->vm_mm, addr))) referenced++; } if (cc->is_khugepaged && @@ -1411,7 +1412,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, out_unmap: pte_unmap_unlock(pte, ptl); if (result == SCAN_SUCCEED) { - result = collapse_huge_page(mm, address, referenced, + result = collapse_huge_page(mm, start_addr, referenced, unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ *mmap_locked = false; From 81e78b7ec61e89e8bab9736551839f79b063614c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 23 Sep 2025 16:00:58 +0200 Subject: [PATCH 371/372] mm: convert folio_page() back to a macro In commit 73b3294b1152 ("mm: simplify folio_page() and folio_page_idx()") we converted folio_page() into a static inline function. However briefly afterwards in commit a847b17009ec ("mm: constify highmem related functions for improved const-correctness") we had to add some nasty const-away casting to make the compiler happy when checking const correctness. So let's just convert it back to a simple macro so the compiler can check const correctness properly. There is the alternative of using a _Generic() similar to page_folio(), but there is not a lot of benefit compared to just using a simple macro. Link: https://lkml.kernel.org/r/20250923140058.2020023-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Kiryl Shutsemau Reviewed-by: SeongJae Park Reviewed-by: Vishal Moola (Oracle) Reviewed-by: Dev Jain Reviewed-by: Suren Baghdasaryan Reviewed-by: Lance Yang Reviewed-by: Wei Yang Cc: Lorenzo Stoakes Cc: "Liam R. 
Howlett" Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 568011930e35..48e27768e7ba 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -316,10 +316,7 @@ static __always_inline unsigned long _compound_head(const struct page *page) * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ -static inline struct page *folio_page(const struct folio *folio, unsigned long n) -{ - return (struct page *)(&folio->page + n); -} +#define folio_page(folio, n) (&(folio)->page + (n)) static __always_inline int PageTail(const struct page *page) { From 1367da7eb875d01102d2ed18654b24d261ff5393 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Wed, 24 Sep 2025 23:41:38 +0530 Subject: [PATCH 372/372] mm: swap: check for stable address space before operating on the VMA It is possible to hit a zero entry while traversing the vmas in unuse_mm() called from swapoff path and accessing it causes the OOPS: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000446--> Loading the memory from offset 0x40 on the XA_ZERO_ENTRY as address. Mem abort info: ESR = 0x0000000096000005 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault The issue is manifested from the below race between the fork() on a process and swapoff: fork(dup_mmap()) swapoff(unuse_mm) --------------- ----------------- 1) Identical mtree is built using __mt_dup(). 2) copy_pte_range()--> copy_nonpresent_pte(): The dst mm is added into the mmlist to be visible to the swapoff operation. 3) Fatal signal is sent to the parent process(which is the current during the fork) thus skip the duplication of the vmas and mark the vma range with XA_ZERO_ENTRY as a marker for this process that helps during exit_mmap(). 4) swapoff is tried on the 'mm' added to the 'mmlist' as part of the 2. 5) unuse_mm(), that iterates through the vma's of this 'mm' will hit the non-NULL zero entry and operating on this zero entry as a vma is resulting into the oops. The proper fix would be around not exposing this partially-valid tree to others when droping the mmap lock, which is being solved with [1]. A simpler solution would be checking for MMF_UNSTABLE, as it is set if mm_struct is not fully initialized in dup_mmap(). Thanks to Liam/Lorenzo/David for all the suggestions in fixing this issue. 
Link: https://lkml.kernel.org/r/20250924181138.1762750-1-charan.kalla@oss.qualcomm.com Link: https://lore.kernel.org/all/20250815191031.3769540-1-Liam.Howlett@oracle.com/ [1] Fixes: d24062914837 ("fork: use __mt_dup() to duplicate maple tree in dup_mmap()") Signed-off-by: Charan Teja Kalla Suggested-by: David Hildenbrand Cc: Baoquan He Cc: Barry Song Cc: Chris Li Cc: Kairui Song Cc: Kemeng Shi Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Nhat Pham Cc: Peng Zhang Cc: Signed-off-by: Andrew Morton --- mm/swapfile.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 890b410d77b6..10760240a3a2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2389,6 +2389,8 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); + if (check_stable_address_space(mm)) + goto unlock; for_each_vma(vmi, vma) { if (vma->anon_vma && !is_vm_hugetlb_page(vma)) { ret = unuse_vma(vma, type); @@ -2398,6 +2400,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) cond_resched(); } +unlock: mmap_read_unlock(mm); return ret; }