From 15ac613f124e51a6623975efad9657b1f3ee47e7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 19 May 2025 15:56:57 +0100 Subject: [PATCH 01/13] KVM: s390: rename PROT_NONE to PROT_TYPE_DUMMY The enum type prot_type declared in arch/s390/kvm/gaccess.c declares an unfortunate identifier within it - PROT_NONE. This clashes with the protection bit define from the uapi for mmap() declared in include/uapi/asm-generic/mman-common.h, which is indeed what those casually reading this code would assume this to refer to. This means that any changes which subsequently alter headers in any way which results in the uapi header being imported here will cause build errors. Resolve the issue by renaming PROT_NONE to PROT_TYPE_DUMMY. Link: https://lkml.kernel.org/r/20250519145657.178365-1-lorenzo.stoakes@oracle.com Fixes: b3cefd6bf16e ("KVM: s390: Pass initialized arg even if unused") Signed-off-by: Lorenzo Stoakes Suggested-by: Ignacio Moreno Gonzalez Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505140943.IgHDa9s7-lkp@intel.com/ Acked-by: Christian Borntraeger Acked-by: Ignacio Moreno Gonzalez Acked-by: Yang Shi Reviewed-by: David Hildenbrand Acked-by: Liam R. Howlett Reviewed-by: Oscar Salvador Reviewed-by: Claudio Imbrenda Cc: Cc: Alexander Gordeev Cc: Heiko Carstens Cc: James Houghton Cc: Janosch Frank Cc: Matthew Wilcox (Oracle) Cc: Paolo Bonzini Cc: Sven Schnelle Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/kvm/gaccess.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index e23670e1949c..21c2e61fece4 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -319,7 +319,7 @@ enum prot_type { PROT_TYPE_DAT = 3, PROT_TYPE_IEP = 4, /* Dummy value for passing an initialized value when code != PGM_PROTECTION */ - PROT_NONE, + PROT_TYPE_DUMMY, }; static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, u8 ar, @@ -335,7 +335,7 @@ static int trans_exc_ending(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (code) { case PGM_PROTECTION: switch (prot) { - case PROT_NONE: + case PROT_TYPE_DUMMY: /* We should never get here, acts like termination */ WARN_ON_ONCE(1); break; @@ -805,7 +805,7 @@ static int guest_range_to_gpas(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, gpa = kvm_s390_real_to_abs(vcpu, ga); if (!kvm_is_gpa_in_memslot(vcpu->kvm, gpa)) { rc = PGM_ADDRESSING; - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; } } if (rc) @@ -963,7 +963,7 @@ int access_guest_with_key(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, if (rc == PGM_PROTECTION) prot = PROT_TYPE_KEYC; else - prot = PROT_NONE; + prot = PROT_TYPE_DUMMY; rc = trans_exc_ending(vcpu, rc, ga, ar, mode, prot, terminate); } out_unlock: From 684088173b1852e5f5891a7b2a124b7974cb827d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 29 May 2025 13:38:32 +0300 Subject: [PATCH 02/13] mm: fix vmstat after removing NR_BOUNCE Hongyu noticed that the nr_unaccepted counter kept growing even in the absence of unaccepted memory on the machine. This happens due to a commit that removed NR_BOUNCE: it removed the counter from the enum zone_stat_item, but left it in the vmstat_text array. As a result, all counters below nr_bounce in /proc/vmstat are shifted by one line, causing the numa_hit counter to be labeled as nr_unaccepted. To fix this issue, remove nr_bounce from the vmstat_text array. 
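For readers unfamiliar with how /proc/vmstat is generated: vmstat_text[] is
indexed by the counter enums, so a stale string shifts every later label by
one. A self-contained sketch of the mechanism (illustrative only, not the
kernel definitions in include/linux/mmzone.h and mm/vmstat.c):

  #include <stdio.h>

  enum stat_item { NR_MLOCK, NR_ZSPAGES, NR_FREE_CMA, NR_STAT_ITEMS };

  static const char * const stat_text[] = {
          "nr_mlock",
          "nr_bounce",    /* stale entry left behind after the enum removal */
          "nr_zspages",
          "nr_free_cma",
  };

  int main(void)
  {
          long val[NR_STAT_ITEMS] = { 1, 2, 3 };

          /* prints the NR_ZSPAGES value under the label "nr_bounce", etc. */
          for (int i = 0; i < NR_STAT_ITEMS; i++)
                  printf("%s %ld\n", stat_text[i], val[i]);
          return 0;
  }
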
Link: https://lkml.kernel.org/r/20250529103832.2937460-1-kirill.shutemov@linux.intel.com Fixes: 194df9f66db8 ("mm: remove NR_BOUNCE zone stat") Signed-off-by: Kirill A. Shutemov Reported-by: Hongyu Ning Reviewed-by: Jens Axboe Reviewed-by: Vlastimil Babka Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Cc: Jens Axboe Signed-off-by: Andrew Morton --- mm/vmstat.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6f740f070b3d..429ae5339bfe 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1201,7 +1201,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", #endif From 9c49e5d09f076001e05537734d7df002162eb2b5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 2 Jun 2025 10:49:26 -0700 Subject: [PATCH 03/13] mm/madvise: handle madvise_lock() failure during race unwinding When unwinding race on -ERESTARTNOINTR handling of process_madvise(), madvise_lock() failure is ignored. Check the failure and abort remaining works in the case. Link: https://lkml.kernel.org/r/20250602174926.1074-1-sj@kernel.org Fixes: 4000e3d0a367 ("mm/madvise: remove redundant mmap_lock operations from process_madvise()") Signed-off-by: SeongJae Park Reported-by: Barry Song <21cnbao@gmail.com> Closes: https://lore.kernel.org/CAGsJ_4xJXXO0G+4BizhohSZ4yDteziPw43_uF8nPXPWxUVChzw@mail.gmail.com Reviewed-by: Jann Horn Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Reviewed-by: Shakeel Butt Reviewed-by: Barry Song Cc: Liam Howlett Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/madvise.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 8433ac9b27e0..5f7a66a1617e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1881,7 +1881,9 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, /* Drop and reacquire lock to unwind race. */ madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); - madvise_lock(mm, behavior); + ret = madvise_lock(mm, behavior); + if (ret) + goto out; madvise_init_tlb(&madv_behavior, mm); continue; } @@ -1892,6 +1894,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, madvise_finish_tlb(&madv_behavior); madvise_unlock(mm, behavior); +out: ret = (total_len - iov_iter_count(iter)) ? : ret; return ret; From 044d2aee6c575231ed4a24fb3d119bad0937488b Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 21 May 2025 09:06:02 -0700 Subject: [PATCH 04/13] alloc_tag: handle module codetag load errors as module load failures Failures inside codetag_load_module() are currently ignored. As a result an error there would not cause a module load failure and freeing of the associated resources. Correct this behavior by propagating the error code to the caller and handling possible errors. With this change, error to allocate percpu counters, which happens at this stage, will not be ignored and will cause a module load failure and freeing of resources. With this change we also do not need to disable memory allocation profiling when this error happens, instead we fail to load the module. 
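To make the new behaviour easy to follow, the error now propagates like this
(sketch of the call chain touched by the diffs below):

  load_module()                                   kernel/module/main.c
    -> codetag_load_module(mod)                   lib/codetag.c
         -> codetag_module_init(cttype, mod)
              -> cttype->desc.module_load()       e.g. alloc_tag's load_module()
                 returns -ENOMEM if a percpu counter allocation fails,
                 after freeing the counters it already allocated
              <- on error the idr entry is removed and the error returned
         <- the loop over codetag types stops at the first failure
    -> a non-zero return takes the sysfs_cleanup path and the module load fails
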
Link: https://lkml.kernel.org/r/20250521160602.1940771-1-surenb@google.com Fixes: 10075262888b ("alloc_tag: allocate percpu counters for module tags dynamically") Signed-off-by: Suren Baghdasaryan Reported-by: Casey Chen Closes: https://lore.kernel.org/all/20250520231620.15259-1-cachen@purestorage.com/ Cc: Daniel Gomez Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Luis Chamberalin Cc: Petr Pavlu Cc: Sami Tolvanen Cc: Signed-off-by: Andrew Morton --- include/linux/codetag.h | 8 ++++---- kernel/module/main.c | 5 +++-- lib/alloc_tag.c | 12 +++++++----- lib/codetag.c | 34 +++++++++++++++++++++++++--------- 4 files changed, 39 insertions(+), 20 deletions(-) diff --git a/include/linux/codetag.h b/include/linux/codetag.h index 0ee4c21c6dbc..5f2b9a1f722c 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -36,8 +36,8 @@ union codetag_ref { struct codetag_type_desc { const char *section; size_t tag_size; - void (*module_load)(struct module *mod, - struct codetag *start, struct codetag *end); + int (*module_load)(struct module *mod, + struct codetag *start, struct codetag *end); void (*module_unload)(struct module *mod, struct codetag *start, struct codetag *end); #ifdef CONFIG_MODULES @@ -89,7 +89,7 @@ void *codetag_alloc_module_section(struct module *mod, const char *name, unsigned long align); void codetag_free_module_sections(struct module *mod); void codetag_module_replaced(struct module *mod, struct module *new_mod); -void codetag_load_module(struct module *mod); +int codetag_load_module(struct module *mod); void codetag_unload_module(struct module *mod); #else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ @@ -103,7 +103,7 @@ codetag_alloc_module_section(struct module *mod, const char *name, unsigned long align) { return NULL; } static inline void codetag_free_module_sections(struct module *mod) {} static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {} -static inline void codetag_load_module(struct module *mod) {} +static inline int codetag_load_module(struct module *mod) { return 0; } static inline void codetag_unload_module(struct module *mod) {} #endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 3d64e69cc03e..08b59c37735e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -3386,11 +3386,12 @@ static int load_module(struct load_info *info, const char __user *uargs, goto sysfs_cleanup; } + if (codetag_load_module(mod)) + goto sysfs_cleanup; + /* Get rid of temporary copy. */ free_copy(info, flags); - codetag_load_module(mod); - /* Done! 
*/ trace_module_load(mod); diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 45dae7da70e1..d48b80f3f007 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -607,15 +607,16 @@ out: mas_unlock(&mas); } -static void load_module(struct module *mod, struct codetag *start, struct codetag *stop) +static int load_module(struct module *mod, struct codetag *start, struct codetag *stop) { /* Allocate module alloc_tag percpu counters */ struct alloc_tag *start_tag; struct alloc_tag *stop_tag; struct alloc_tag *tag; + /* percpu counters for core allocations are already statically allocated */ if (!mod) - return; + return 0; start_tag = ct_to_alloc_tag(start); stop_tag = ct_to_alloc_tag(stop); @@ -627,12 +628,13 @@ static void load_module(struct module *mod, struct codetag *start, struct codeta free_percpu(tag->counters); tag->counters = NULL; } - shutdown_mem_profiling(true); - pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s. Memory allocation profiling is disabled!\n", + pr_err("Failed to allocate memory for allocation tag percpu counters in the module %s\n", mod->name); - break; + return -ENOMEM; } } + + return 0; } static void replace_module(struct module *mod, struct module *new_mod) diff --git a/lib/codetag.c b/lib/codetag.c index de332e98d6f5..650d54d7e14d 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -167,6 +167,7 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) { struct codetag_range range; struct codetag_module *cmod; + int mod_id; int err; range = get_section_range(mod, cttype->desc.section); @@ -190,11 +191,20 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) cmod->range = range; down_write(&cttype->mod_lock); - err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL); - if (err >= 0) { - cttype->count += range_size(cttype, &range); - if (cttype->desc.module_load) - cttype->desc.module_load(mod, range.start, range.stop); + mod_id = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL); + if (mod_id >= 0) { + if (cttype->desc.module_load) { + err = cttype->desc.module_load(mod, range.start, range.stop); + if (!err) + cttype->count += range_size(cttype, &range); + else + idr_remove(&cttype->mod_idr, mod_id); + } else { + cttype->count += range_size(cttype, &range); + err = 0; + } + } else { + err = mod_id; } up_write(&cttype->mod_lock); @@ -295,17 +305,23 @@ void codetag_module_replaced(struct module *mod, struct module *new_mod) mutex_unlock(&codetag_lock); } -void codetag_load_module(struct module *mod) +int codetag_load_module(struct module *mod) { struct codetag_type *cttype; + int ret = 0; if (!mod) - return; + return 0; mutex_lock(&codetag_lock); - list_for_each_entry(cttype, &codetag_types, link) - codetag_module_init(cttype, mod); + list_for_each_entry(cttype, &codetag_types, link) { + ret = codetag_module_init(cttype, mod); + if (ret) + break; + } mutex_unlock(&codetag_lock); + + return ret; } void codetag_unload_module(struct module *mod) From 41ffaa0ea76206e4fb7589a5a499e6752c33188a Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Mon, 2 Jun 2025 09:23:39 -0700 Subject: [PATCH 05/13] mm/mempolicy: fix incorrect freeing of wi_kobj We should not free wi_group->wi_kobj here. In the error path of add_weighted_interleave_group() where this snippet is called from, kobj_{del, put} is immediately called right after this section. Thus, it is not only unnecessary but also incorrect to free it here. 
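For context, the lifetime rule being relied on is the usual embedded-kobject
pattern: the containing object is freed from the kobject release callback once
the last reference is dropped. A minimal sketch (names invented for
illustration, not the mempolicy code):

  #include <linux/kobject.h>
  #include <linux/slab.h>

  struct wi_group_like {
          struct kobject wi_kobj;         /* embedded, not a separate allocation */
  };

  static void wi_group_like_release(struct kobject *kobj)
  {
          /* the containing object is freed here, and only here */
          kfree(container_of(kobj, struct wi_group_like, wi_kobj));
  }

  /* registered when the kobject is set up with kobject_init_and_add() */
  static const struct kobj_type wi_group_like_ktype = {
          .release = wi_group_like_release,
  };

  static void error_path(struct wi_group_like *grp)
  {
          kobject_del(&grp->wi_kobj);
          kobject_put(&grp->wi_kobj);     /* last ref -> release() -> kfree(grp) */
          /* a further kfree(&grp->wi_kobj) would free grp a second time */
  }
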
Link: https://lkml.kernel.org/r/20250602162345.2595696-1-joshua.hahnjy@gmail.com Fixes: e341f9c3c841 ("mm/mempolicy: Weighted Interleave Auto-tuning") Signed-off-by: Joshua Hahn Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202506011545.Fduxqxqj-lkp@intel.com/ Cc: Alistair Popple Cc: Byungchul Park Cc: David Hildenbrand Cc: Gregory Price Cc: "Huang, Ying" Cc: Mathew Brost Cc: Rakie Kim Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 72fd72e156b1..3b1dfd08338b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -3708,15 +3708,13 @@ static void wi_state_free(void) lockdep_is_held(&wi_state_lock)); if (!old_wi_state) { mutex_unlock(&wi_state_lock); - goto out; + return; } rcu_assign_pointer(wi_state, NULL); mutex_unlock(&wi_state_lock); synchronize_rcu(); kfree(old_wi_state); -out: - kfree(&wi_group->wi_kobj); } static struct kobj_attribute wi_auto_attr = From 334d7c4fb60cf21e0abac134d92fe49e9b04377e Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Mon, 28 Apr 2025 15:28:48 +0530 Subject: [PATCH 06/13] iov_iter: use iov_offset for length calculation in iov_iter_aligned_bvec If iov_offset is non-zero, then we need to consider iov_offset in length calculation, otherwise we might pass smaller IOs such as 512 bytes, in below scenario [1]. This issue is reproducible using lib-uring test/fixed-seg.c application with fixed buffer on a 512 LBA formatted device. [1] At present we pass the alignment check, for 512 LBA formatted devices, len_mask = 511 when IO is smaller, i->count = 512 has an offset, i->io_offset = 3584 with bvec values, bvec->bv_offset = 256, bvec->bv_len = 3840. In short, the first 256 bytes are in the current page, next 256 bytes are in the another page. Ideally we expect to fail the IO. I can think of 2 userspace scenarios where we experience this. a: From userspace, we observe a different behaviour when device LBA size is 512 vs 4096 bytes. For 4096 LBA formatted device, I see the same liburing test [2] failing, whereas 512 the test passes without this. This is reproducible everytime. [2] https://github.com/axboe/liburing/ b: Although I was not able to reproduce the below condition, but I suspect below case should be possible from user space for devices with 512 LBA formatted device. Lets say from userspace while allocating a virtually single chunk of memory, if we get 2 physical chunk of memory, and IO happens to be at the boundary of first physical chunk with length crossing first chunk, then we allow IOs to proceed and hence we might map wrong physical address length and proceed with IO rather than failing. : --- a/test/fixed-seg.c : +++ b/test/fixed-seg.c : @@ -64,7 +64,7 @@ static int test(struct io_uring *ring, int fd, int : vec_off) : return T_EXIT_FAIL; : } : : - ret = read_it(ring, fd, 4096, vec_off); : + ret = read_it(ring, fd, 4096, 7*512 + 256); : if (ret) { : fprintf(stderr, "4096 0 failed\n"); : return T_EXIT_FAIL; Effectively this is a write crossing the page boundary. 
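Spelled out with the numbers from [1], the one-line change below alters the
per-segment length check in iov_iter_aligned_bvec() as follows (length check
only; skip is i->iov_offset for the first bvec):

  i->count = 512, len_mask = 511 (512-byte LBA)
  skip = 3584, bvec->bv_offset = 256, bvec->bv_len = 3840

  before: len = min(bv_len,        count) = min(3840, 512) = 512
          512 & 511 == 0   -> length check passes
  after:  len = min(bv_len - skip, count) = min( 256, 512) = 256
          256 & 511 != 0   -> the IO is rejected, as expected

Only the 256 bytes actually remaining in the current bvec are compared against
the LBA mask, so the straddling IO no longer slips through.
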
Link: https://lkml.kernel.org/r/20250428095849.11709-1-nj.shetty@samsung.com Fixes: 2263639f96f2 ("iov_iter: streamline iovec/bvec alignment iteration") Reviewed-by: Jens Axboe Reviewed-by: Anuj Gupta Signed-off-by: Nitesh Shetty Cc: Al Viro Cc: Christian Brauner Cc: Keith Busch Signed-off-by: Andrew Morton --- lib/iov_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 969d4ad510df..f9193f952f49 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -817,7 +817,7 @@ static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, size_t size = i->count; do { - size_t len = bvec->bv_len; + size_t len = bvec->bv_len - skip; if (len > size) len = size; From a2946fb271b1e01f49df35042e338cca1bd28095 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 30 May 2025 11:49:17 +1000 Subject: [PATCH 07/13] MAINTAINERS: add Alistair as reviewer of mm memory policy I'm particularly familiar with mm/migrate.c and especially mm/migrate_device.c so add myself to MAINTAINERS. Link: https://lkml.kernel.org/r/20250530014917.2946940-1-apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Lorenzo Stoakes Acked-by: Matthew Brost Acked-by: David Hildenbrand Cc: Joshua Hahn Cc: Rakie Kim Cc: Zi Yan Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 98201e1f4ab5..4b4317dd0788 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15744,6 +15744,7 @@ R: Rakie Kim R: Byungchul Park R: Gregory Price R: Ying Huang +R: Alistair Popple L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org From 081056dc00a27bccb55ccc3c6f230a3d5fd3f7e0 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 27 May 2025 23:23:53 +0200 Subject: [PATCH 08/13] mm/hugetlb: unshare page tables during VMA split, not before Currently, __split_vma() triggers hugetlb page table unsharing through vm_ops->may_split(). This happens before the VMA lock and rmap locks are taken - which is too early, it allows racing VMA-locked page faults in our process and racing rmap walks from other processes to cause page tables to be shared again before we actually perform the split. Fix it by explicitly calling into the hugetlb unshare logic from __split_vma() in the same place where THP splitting also happens. At that point, both the VMA and the rmap(s) are write-locked. An annoying detail is that we can now call into the helper hugetlb_unshare_pmds() from two different locking contexts: 1. from hugetlb_split(), holding: - mmap lock (exclusively) - VMA lock - file rmap lock (exclusively) 2. hugetlb_unshare_all_pmds(), which I think is designed to be able to call us with only the mmap lock held (in shared mode), but currently only runs while holding mmap lock (exclusively) and VMA lock Backporting note: This commit fixes a racy protection that was introduced in commit b30c14cd6102 ("hugetlb: unshare some PMDs when splitting VMAs"); that commit claimed to fix an issue introduced in 5.13, but it should actually also go all the way back. 
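For quick reference, the two contexts map onto the new take_locks parameter of
hugetlb_unshare_pmds() like this (see the mm/hugetlb.c hunks below):

  caller                       locks already held by the caller              take_locks
  hugetlb_split()              mmap (write), VMA, file rmap (write)          false
  hugetlb_unshare_all_pmds()   mmap (today: write; designed for read), VMA   true
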
[jannh@google.com: v2] Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-1-1329349bad1a@google.com Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-0-1329349bad1a@google.com Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-1-f4136f5ec58a@google.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Jann Horn Cc: Liam Howlett Reviewed-by: Lorenzo Stoakes Reviewed-by: Oscar Salvador Cc: Lorenzo Stoakes Cc: Vlastimil Babka Cc: [b30c14cd6102: hugetlb: unshare some PMDs when splitting VMAs] Cc: Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 3 ++ mm/hugetlb.c | 60 +++++++++++++++++++++++--------- mm/vma.c | 7 ++++ tools/testing/vma/vma_internal.h | 2 ++ 4 files changed, 56 insertions(+), 16 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0598f36931de..42f374e828a2 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -279,6 +279,7 @@ bool is_hugetlb_entry_migration(pte_t pte); bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); void fixup_hugetlb_reservations(struct vm_area_struct *vma); +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr); #else /* !CONFIG_HUGETLB_PAGE */ @@ -476,6 +477,8 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma) { } +static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {} + #endif /* !CONFIG_HUGETLB_PAGE */ #ifndef pgd_write diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f0b1d53079f9..7ba020d489d4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -121,7 +121,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static void hugetlb_unshare_pmds(struct vm_area_struct *vma, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, bool take_locks); static struct resv_map *vma_resv_map(struct vm_area_struct *vma); static void hugetlb_free_folio(struct folio *folio) @@ -5426,26 +5426,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) { if (addr & ~(huge_page_mask(hstate_vma(vma)))) return -EINVAL; + return 0; +} +void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) +{ /* * PMD sharing is only possible for PUD_SIZE-aligned address ranges * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. + * This function is called in the middle of a VMA split operation, with + * MM, VMA and rmap all write-locked to prevent concurrent page table + * walks (except hardware and gup_fast()). */ + vma_assert_write_locked(vma); + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + if (addr & ~PUD_MASK) { - /* - * hugetlb_vm_op_split is called right before we attempt to - * split the VMA. We will need to unshare PMDs in the old and - * new VMAs, so let's unshare before we split. - */ unsigned long floor = addr & PUD_MASK; unsigned long ceil = floor + PUD_SIZE; - if (floor >= vma->vm_start && ceil <= vma->vm_end) - hugetlb_unshare_pmds(vma, floor, ceil); + if (floor >= vma->vm_start && ceil <= vma->vm_end) { + /* + * Locking: + * Use take_locks=false here. + * The file rmap lock is already held. 
+ * The hugetlb VMA lock can't be taken when we already + * hold the file rmap lock, and we don't need it because + * its purpose is to synchronize against concurrent page + * table walks, which are not possible thanks to the + * locks held by our caller. + */ + hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false); + } } - - return 0; } static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) @@ -7885,9 +7899,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re spin_unlock_irq(&hugetlb_lock); } +/* + * If @take_locks is false, the caller must ensure that no concurrent page table + * access can happen (except for gup_fast() and hardware page walks). + * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like + * concurrent page fault handling) and the file rmap lock. + */ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, + bool take_locks) { struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -7911,8 +7932,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, start, end); mmu_notifier_invalidate_range_start(&range); - hugetlb_vma_lock_write(vma); - i_mmap_lock_write(vma->vm_file->f_mapping); + if (take_locks) { + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(vma->vm_file->f_mapping); + } else { + i_mmap_assert_write_locked(vma->vm_file->f_mapping); + } for (address = start; address < end; address += PUD_SIZE) { ptep = hugetlb_walk(vma, address, sz); if (!ptep) @@ -7922,8 +7947,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, spin_unlock(ptl); } flush_hugetlb_tlb_range(vma, start, end); - i_mmap_unlock_write(vma->vm_file->f_mapping); - hugetlb_vma_unlock_write(vma); + if (take_locks) { + i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); + } /* * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. @@ -7938,7 +7965,8 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), - ALIGN_DOWN(vma->vm_end, PUD_SIZE)); + ALIGN_DOWN(vma->vm_end, PUD_SIZE), + /* take_locks = */ true); } /* diff --git a/mm/vma.c b/mm/vma.c index 1c6595f282e5..7ebc9eb608f4 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -539,7 +539,14 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); + + /* + * Get rid of huge pages and shared page tables straddling the split + * boundary. 
+ */ vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); + if (is_vm_hugetlb_page(vma)) + hugetlb_split(vma, addr); if (new_below) { vma->vm_start = addr; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 441feb21aa5a..4505b1c31be1 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -932,6 +932,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, (void)next; } +static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {} + static inline void vma_iter_free(struct vma_iterator *vmi) { mas_destroy(&vmi->mas); From 1013af4f585fccc4d3e5c5824d174de2257f7d6d Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 27 May 2025 23:23:54 +0200 Subject: [PATCH 09/13] mm/hugetlb: fix huge_pmd_unshare() vs GUP-fast race huge_pmd_unshare() drops a reference on a page table that may have previously been shared across processes, potentially turning it into a normal page table used in another process in which unrelated VMAs can afterwards be installed. If this happens in the middle of a concurrent gup_fast(), gup_fast() could end up walking the page tables of another process. While I don't see any way in which that immediately leads to kernel memory corruption, it is really weird and unexpected. Fix it with an explicit broadcast IPI through tlb_remove_table_sync_one(), just like we do in khugepaged when removing page tables for a THP collapse. Link: https://lkml.kernel.org/r/20250528-hugetlb-fixes-splitrace-v2-2-1329349bad1a@google.com Link: https://lkml.kernel.org/r/20250527-hugetlb-fixes-splitrace-v1-2-f4136f5ec58a@google.com Fixes: 39dde65c9940 ("[PATCH] shared page table for hugetlb page") Signed-off-by: Jann Horn Reviewed-by: Lorenzo Stoakes Cc: Liam Howlett Cc: Muchun Song Cc: Oscar Salvador Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7ba020d489d4..8746ed2fec13 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7629,6 +7629,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, return 0; pud_clear(pud); + /* + * Once our caller drops the rmap lock, some other process might be + * using this page table as a normal, non-hugetlb page table. + * Wait for pending gup_fast() in other threads to finish before letting + * that happen. + */ + tlb_remove_table_sync_one(); ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep)); mm_dec_nr_pmds(mm); return 1; From 1b11a28308923a67fc76b8f7be06e1b2d80d07ca Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Tue, 3 Jun 2025 18:50:17 -0400 Subject: [PATCH 10/13] MAINTAINERS: add tlb trace events to MMU GATHER AND TLB INVALIDATION The MMU GATHER AND TLB INVALIDATION entry lists other TLB-related files. Add the tlb.h tracepoint file there as well. 
Link: https://lore.kernel.org/linux-mm/ce048e11-f79d-44a6-bacc-46e1ebc34b24@redhat.com/ Link: https://lkml.kernel.org/r/20250603-tlb-maintainers-v1-1-726d193c6693@columbia.edu Signed-off-by: Tal Zussman Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Cc: "Aneesh Kumar K.V" Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Will Deacon Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 4b4317dd0788..6b0b5f55ab72 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16654,6 +16654,7 @@ L: linux-mm@kvack.org S: Maintained F: arch/*/include/asm/tlb.h F: include/asm-generic/tlb.h +F: include/trace/events/tlb.h F: mm/mmu_gather.c MN88472 MEDIA DRIVER From ffa74d44f4b95a97b36a038c66f53e4edeaffb99 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 3 Jun 2025 09:53:07 +0200 Subject: [PATCH 11/13] kmsan: test: add module description Every module should have a description, and kbuild now warns for those that don't. WARNING: modpost: missing MODULE_DESCRIPTION() in mm/kmsan/kmsan_test.o Link: https://lkml.kernel.org/r/20250603075323.1839608-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Alexander Potapenko Cc: Andrey Konovalov Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: Macro Elver Cc: Sabyrzhan Tasbolatov Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 9733a22c46c1..c6c5b2bbede0 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -732,3 +732,4 @@ kunit_test_suites(&kmsan_test_suite); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Alexander Potapenko "); +MODULE_DESCRIPTION("Test cases for KMSAN"); From 7266f590ca1fab3adf7c2546894dc418f2d86ba6 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 4 Jun 2025 17:31:39 +0100 Subject: [PATCH 12/13] MAINTAINERS: add mm swap section In furtherance of ongoing efforts to ensure people are aware of who de-facto maintains/has an interest in specific parts of mm, as well trying to avoid get_maintainers.pl listing only Andrew and the mailing list for mm files - establish a swap memory management section and add relevant maintainers/reviewers. Link: https://lkml.kernel.org/r/20250604163139.126630-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Chris Li Acked-by: Kairui Song Acked-by: Kemeng Shi Acked-by: Baoquan He Acked-by: Barry Song Acked-by: Nhat Pham Signed-off-by: Andrew Morton --- MAINTAINERS | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6b0b5f55ab72..99dcf9036b9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15816,6 +15816,25 @@ S: Maintained F: include/linux/secretmem.h F: mm/secretmem.c +MEMORY MANAGEMENT - SWAP +M: Andrew Morton +R: Kemeng Shi +R: Kairui Song +R: Nhat Pham +R: Baoquan He +R: Barry Song +R: Chris Li +L: linux-mm@kvack.org +S: Maintained +F: include/linux/swap.h +F: include/linux/swapfile.h +F: include/linux/swapops.h +F: mm/page_io.c +F: mm/swap.c +F: mm/swap.h +F: mm/swap_state.c +F: mm/swapfile.c + MEMORY MANAGEMENT - THP (TRANSPARENT HUGE PAGE) M: Andrew Morton M: David Hildenbrand From 2da20fd904f87f7bb31b79719bc3dda4093f8cdb Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Sun, 4 May 2025 20:08:31 +0200 Subject: [PATCH 13/13] kernel/rcu/tree_stall: add /sys/kernel/rcu_stall_count Expose a simple counter to userspace for monitoring tools. 
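A minimal userspace consumer of the new attribute could look like this (sketch
only; the attribute is created under /sys/kernel/ by this patch and emits a
single "%u"):

  #include <stdio.h>

  int main(void)
  {
          unsigned int stalls;
          FILE *f = fopen("/sys/kernel/rcu_stall_count", "r");

          if (!f)
                  return 1;       /* pre-patch kernel, or CONFIG_SYSFS=n */
          if (fscanf(f, "%u", &stalls) == 1)
                  printf("RCU stalls since boot: %u\n", stalls);
          fclose(f);
          return 0;
  }
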
(akpm: 2536c5c7d6ae added the documentation but the code changes were lost) Link: https://lkml.kernel.org/r/20250504180831.4190860-3-max.kellermann@ionos.com Fixes: 2536c5c7d6ae ("kernel/rcu/tree_stall: add /sys/kernel/rcu_stall_count") Signed-off-by: Max Kellermann Cc: Core Minyard Cc: Doug Anderson Cc: Joel Granados Cc: Max Kellermann Cc: Song Liu Cc: Sourabh Jain Signed-off-by: Andrew Morton --- kernel/rcu/tree_stall.h | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 56b21219442b..486c00536207 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -20,6 +20,28 @@ int sysctl_panic_on_rcu_stall __read_mostly; int sysctl_max_rcu_stall_to_panic __read_mostly; +#ifdef CONFIG_SYSFS + +static unsigned int rcu_stall_count; + +static ssize_t rcu_stall_count_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + return sysfs_emit(page, "%u\n", rcu_stall_count); +} + +static struct kobj_attribute rcu_stall_count_attr = __ATTR_RO(rcu_stall_count); + +static __init int kernel_rcu_stall_sysfs_init(void) +{ + sysfs_add_file_to_group(kernel_kobj, &rcu_stall_count_attr.attr, NULL); + return 0; +} + +late_initcall(kernel_rcu_stall_sysfs_init); + +#endif // CONFIG_SYSFS + #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) #else @@ -784,6 +806,10 @@ static void check_cpu_stall(struct rcu_data *rdp) if (kvm_check_and_clear_guest_paused()) return; +#ifdef CONFIG_SYSFS + ++rcu_stall_count; +#endif + rcu_stall_notifier_call_chain(RCU_STALL_NOTIFY_NORM, (void *)j - gps); if (READ_ONCE(csd_lock_suppress_rcu_stall) && csd_lock_is_stuck()) { pr_err("INFO: %s detected stall, but suppressed full report due to a stuck CSD-lock.\n", rcu_state.name);