From d5dc831eb36cac45c072145ecc45e193737e7814 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 21 Jan 2026 07:57:10 +0100 Subject: [PATCH 01/39] slab: replace cache_from_obj() with inline checks Eric Dumazet has noticed cache_from_obj() is not inlined with clang and suggested splitting it into two functions, where the smaller inlined one assumes the fastpath is !CONFIG_SLAB_FREELIST_HARDENED. However most distros enable it these days and so this would likely add a function call to the object free fastpaths. Instead take a step back and consider that cache_from_obj() is a relict from when memcgs created their separate kmem_cache copies, as the outdated comment in build_detached_freelist() reminds us. Meanwhile hardening/debugging had reused cache_from_obj() to validate that the freed object really belongs to a slab from the cache we think we are freeing from. In build_detached_freelist() simply remove this, because it did not handle the NULL result from cache_from_obj() failure properly, nor validate objects (for the NULL slab->slab_cache pointer) when called via kfree_bulk(). If anyone is motivated to implement it properly, it should be possible in a similar way to kmem_cache_free(). In kmem_cache_free(), do the hardening/debugging checks directly so they are inlined by definition and virt_to_slab(obj) is performed just once. In case they failed, call a newly introduced warn_free_bad_obj() that performs the warnings outside of the fastpath, and leak the object. As an intentional change, leak the object when slab->slab_cache differs from the cache given to kmem_cache_free(). Previously we would only leak when the object is not in a valid slab page or the slab->slab_cache pointer is NULL, and otherwise trust the slab->slab_cache over the kmem_cache_free() argument. But if those differ, it means something went wrong enough that it's best not to continue freeing. As a result the fastpath should be inlined in all configs and the warnings are moved away. 
Reported-by: Eric Dumazet Closes: https://lore.kernel.org/all/20260115130642.3419324-1-edumazet@google.com/ Reviewed-by: Harry Yoo Reviewed-by: Hao Li Acked-by: Eric Dumazet Signed-off-by: Vlastimil Babka --- mm/slub.c | 56 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f77b7407c51b..28af56acc3ab 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6742,30 +6742,26 @@ void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) } #endif -static inline struct kmem_cache *virt_to_cache(const void *obj) +static noinline void warn_free_bad_obj(struct kmem_cache *s, void *obj) { + struct kmem_cache *cachep; struct slab *slab; slab = virt_to_slab(obj); - if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) - return NULL; - return slab->slab_cache; -} + if (WARN_ONCE(!slab, + "kmem_cache_free(%s, %p): object is not in a slab page\n", + s->name, obj)) + return; -static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) -{ - struct kmem_cache *cachep; + cachep = slab->slab_cache; - if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && - !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) - return s; - - cachep = virt_to_cache(x); - if (WARN(cachep && cachep != s, - "%s: Wrong slab cache. %s but object is from %s\n", - __func__, s->name, cachep->name)) - print_tracking(cachep, x); - return cachep; + if (WARN_ONCE(cachep != s, + "kmem_cache_free(%s, %p): object belongs to different cache %s\n", + s->name, obj, cachep ? 
cachep->name : "(NULL)")) { + if (cachep) + print_tracking(cachep, obj); + return; + } } /** @@ -6778,11 +6774,25 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) */ void kmem_cache_free(struct kmem_cache *s, void *x) { - s = cache_from_obj(s, x); - if (!s) - return; + struct slab *slab; + + slab = virt_to_slab(x); + + if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) || + kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { + + /* + * Intentionally leak the object in these cases, because it + * would be too dangerous to continue. + */ + if (unlikely(!slab || (slab->slab_cache != s))) { + warn_free_bad_obj(s, x); + return; + } + } + trace_kmem_cache_free(_RET_IP_, x, s); - slab_free(s, virt_to_slab(x), x, _RET_IP_); + slab_free(s, slab, x, _RET_IP_); } EXPORT_SYMBOL(kmem_cache_free); @@ -7309,7 +7319,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, df->s = slab->slab_cache; } else { df->slab = slab; - df->s = cache_from_obj(s, object); /* Support for memcg */ + df->s = s; } /* Start new detached freelist */ From b55b423e8518361124ff0a9e15df431b3682ee4f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:39 +0100 Subject: [PATCH 02/39] mm/slab: add rcu_barrier() to kvfree_rcu_barrier_on_cache() After we submit the rcu_free sheaves to call_rcu() we need to make sure the rcu callbacks complete. kvfree_rcu_barrier() does that via flush_all_rcu_sheaves() but kvfree_rcu_barrier_on_cache() doesn't. Fix that. This currently causes no issues because the caches with sheaves we have are never destroyed. The problem flagged by kernel test robot was reported for a patch that enables sheaves for (almost) all caches, and occurred only with CONFIG_KASAN. Harry Yoo found the root cause [1]: It turns out the object freed by sheaf_flush_unused() was in KASAN percpu quarantine list (confirmed by dumping the list) by the time __kmem_cache_shutdown() returns an error. 
Quarantined objects are supposed to be flushed by kasan_cache_shutdown(), but things go wrong if the rcu callback (rcu_free_sheaf_nobarn()) is processed after kasan_cache_shutdown() finishes. That's why rcu_barrier() in __kmem_cache_shutdown() didn't help, because it's called after kasan_cache_shutdown(). Calling rcu_barrier() in kvfree_rcu_barrier_on_cache() guarantees that it'll be added to the quarantine list before kasan_cache_shutdown() is called. So it's a valid fix! [1] https://lore.kernel.org/all/aWd6f3jERlrB5yeF@hyeyoo/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202601121442.c530bed3-lkp@intel.com Fixes: 0f35040de593 ("mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache destruction") Cc: stable@vger.kernel.org Reviewed-by: Harry Yoo Tested-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index eed7ea556cb1..ee994ec7f251 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -2133,8 +2133,11 @@ EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); */ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) { - if (s->cpu_sheaves) + if (s->cpu_sheaves) { flush_rcu_sheaves_on_cache(s); + rcu_barrier(); + } + /* * TODO: Introduce a version of __kvfree_rcu_barrier() that works * on a specific slab cache. From f8b4cd2dad097e4ea5aed3511f42b9eb771e7b19 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Fri, 23 Jan 2026 07:52:40 +0100 Subject: [PATCH 03/39] mm/slab: fix false lockdep warning in __kfree_rcu_sheaf() kvfree_call_rcu() can be called while holding a raw_spinlock_t. Since __kfree_rcu_sheaf() may acquire a spinlock_t (which becomes a sleeping lock on PREEMPT_RT) and violate lock nesting rules, kvfree_call_rcu() bypasses the sheaves layer entirely on PREEMPT_RT. 
However, lockdep still complains about acquiring spinlock_t while holding raw_spinlock_t, even on !PREEMPT_RT where spinlock_t is a spinning lock. This causes a false lockdep warning [1]: ============================= [ BUG: Invalid wait context ] 6.19.0-rc6-next-20260120 #21508 Not tainted ----------------------------- migration/1/23 is trying to lock: ffff8afd01054e98 (&barn->lock){..-.}-{3:3}, at: barn_get_empty_sheaf+0x1d/0xb0 other info that might help us debug this: context-{5:5} 3 locks held by migration/1/23: #0: ffff8afd01fd89a8 (&p->pi_lock){-.-.}-{2:2}, at: __balance_push_cpu_stop+0x3f/0x200 #1: ffffffff9f15c5c8 (rcu_read_lock){....}-{1:3}, at: cpuset_cpus_allowed_fallback+0x27/0x250 #2: ffff8afd1f470be0 ((local_lock_t *)&pcs->lock){+.+.}-{3:3}, at: __kfree_rcu_sheaf+0x52/0x3d0 stack backtrace: CPU: 1 UID: 0 PID: 23 Comm: migration/1 Not tainted 6.19.0-rc6-next-20260120 #21508 PREEMPTLAZY Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 Stopper: __balance_push_cpu_stop+0x0/0x200 <- balance_push+0x118/0x170 Call Trace: __dump_stack+0x22/0x30 dump_stack_lvl+0x60/0x80 dump_stack+0x19/0x24 __lock_acquire+0xd3a/0x28e0 ? __lock_acquire+0x5a9/0x28e0 ? __lock_acquire+0x5a9/0x28e0 ? barn_get_empty_sheaf+0x1d/0xb0 lock_acquire+0xc3/0x270 ? barn_get_empty_sheaf+0x1d/0xb0 ? __kfree_rcu_sheaf+0x52/0x3d0 _raw_spin_lock_irqsave+0x47/0x70 ? barn_get_empty_sheaf+0x1d/0xb0 barn_get_empty_sheaf+0x1d/0xb0 ? __kfree_rcu_sheaf+0x52/0x3d0 __kfree_rcu_sheaf+0x19f/0x3d0 kvfree_call_rcu+0xaf/0x390 set_cpus_allowed_force+0xc8/0xf0 [...] This wasn't triggered until sheaves were enabled for all slab caches, since kfree_rcu() wasn't being called with a raw spinlock held for caches with sheaves (vma, maple node). As suggested by Vlastimil Babka, fix this by using a lockdep map with LD_WAIT_CONFIG wait type to tell lockdep that acquiring spinlock_t is valid in this case, as those spinlocks won't be used on PREEMPT_RT. 
Note that kfree_rcu_sheaf_map should be acquired using _try() variant, otherwise the acquisition of the lockdep map itself will trigger an invalid wait context warning. Reported-by: Paul E. McKenney Closes: https://lore.kernel.org/linux-mm/c858b9af-2510-448b-9ab3-058f7b80dd42@paulmck-laptop [1] Fixes: ec66e0d59952 ("slab: add sheaf support for batching kfree_rcu() operations") Suggested-by: Vlastimil Babka Signed-off-by: Harry Yoo Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Vlastimil Babka --- mm/slub.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 28af56acc3ab..49223795a7d6 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -6265,11 +6265,29 @@ empty: free_empty_sheaf(s, sheaf); } +/* + * kvfree_call_rcu() can be called while holding a raw_spinlock_t. Since + * __kfree_rcu_sheaf() may acquire a spinlock_t (sleeping lock on PREEMPT_RT), + * this would violate lock nesting rules. Therefore, kvfree_call_rcu() avoids + * this problem by bypassing the sheaves layer entirely on PREEMPT_RT. + * + * However, lockdep still complains that it is invalid to acquire spinlock_t + * while holding raw_spinlock_t, even on !PREEMPT_RT where spinlock_t is a + * spinning lock. Tell lockdep that acquiring spinlock_t is valid here + * by temporarily raising the wait-type to LD_WAIT_CONFIG. 
+ */ +static DEFINE_WAIT_OVERRIDE_MAP(kfree_rcu_sheaf_map, LD_WAIT_CONFIG); + bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) { struct slub_percpu_sheaves *pcs; struct slab_sheaf *rcu_sheaf; + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) + return false; + + lock_map_acquire_try(&kfree_rcu_sheaf_map); + if (!local_trylock(&s->cpu_sheaves->lock)) goto fail; @@ -6346,10 +6364,12 @@ do_free: local_unlock(&s->cpu_sheaves->lock); stat(s, FREE_RCU_SHEAF); + lock_map_release(&kfree_rcu_sheaf_map); return true; fail: stat(s, FREE_RCU_SHEAF_FAIL); + lock_map_release(&kfree_rcu_sheaf_map); return false; } From b26e52c523ea871ef1cae6e3955418cfffe2117f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:41 +0100 Subject: [PATCH 04/39] slab: add SLAB_CONSISTENCY_CHECKS to SLAB_NEVER_MERGE All the debug flags prevent merging, except SLAB_CONSISTENCY_CHECKS. This is suboptimal because this flag (like any debug flags) prevents the usage of any fastpaths, and thus affects performance of any aliased cache. Also, objects from an aliased cache other than the one specified for debugging could interfere with the debugging efforts. Fix this by adding the whole SLAB_DEBUG_FLAGS collection to SLAB_NEVER_MERGE instead of individual debug flags, so it now also includes SLAB_CONSISTENCY_CHECKS. 
Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index ee994ec7f251..e691ede0e6a8 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -45,9 +45,8 @@ struct kmem_cache *kmem_cache; /* * Set of flags that will prevent slab merging */ -#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB | SLAB_NO_MERGE) +#define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \ + SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_ACCOUNT) From 8598351edc42f38d2a1eaed9abca39c98e7b0bbf Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:42 +0100 Subject: [PATCH 05/39] mm/slab: move and refactor __kmem_cache_alias() Move __kmem_cache_alias() to slab_common.c since it's called by __kmem_cache_create_args() and calls find_mergeable() that both are in this file. We can remove two slab.h declarations and make them static. Instead declare sysfs_slab_alias() from slub.c so that __kmem_cache_alias() can keep calling it. Add args parameter to __kmem_cache_alias() and find_mergeable() instead of align and ctor. With that we can also move the checks for usersize and sheaf_capacity there from __kmem_cache_create_args() and make the result more symmetric with slab_unmergeable(). No functional changes intended. Reviewed-by: Harry Yoo Reviewed-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Signed-off-by: Vlastimil Babka --- mm/slab.h | 8 +++----- mm/slab_common.c | 44 +++++++++++++++++++++++++++++++++++++------- mm/slub.c | 30 +----------------------------- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index e767aa7e91b0..0993800fcced 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -281,9 +281,12 @@ struct kmem_cache { #define SLAB_SUPPORTS_SYSFS 1 void sysfs_slab_unlink(struct kmem_cache *s); void sysfs_slab_release(struct kmem_cache *s); +int sysfs_slab_alias(struct kmem_cache *s, const char *name); #else static inline void sysfs_slab_unlink(struct kmem_cache *s) { } static inline void sysfs_slab_release(struct kmem_cache *s) { } +static inline int sysfs_slab_alias(struct kmem_cache *s, const char *name) + { return 0; } #endif void *fixup_red_left(struct kmem_cache *s, void *p); @@ -400,11 +403,6 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); -struct kmem_cache *find_mergeable(unsigned size, unsigned align, - slab_flags_t flags, const char *name, void (*ctor)(void *)); -struct kmem_cache * -__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)); slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name); diff --git a/mm/slab_common.c b/mm/slab_common.c index e691ede0e6a8..ee245a880603 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -174,15 +174,22 @@ int slab_unmergeable(struct kmem_cache *s) return 0; } -struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, - slab_flags_t flags, const char *name, void (*ctor)(void *)) +static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags, + const char *name, struct kmem_cache_args *args) { struct kmem_cache *s; + unsigned int align; if (slab_nomerge) return NULL; - if (ctor) + if (args->ctor) + return NULL; + + if 
(IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize) + return NULL; + + if (args->sheaf_capacity) return NULL; flags = kmem_cache_flags(flags, name); @@ -191,7 +198,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, return NULL; size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align, size); + align = calculate_alignment(flags, args->align, size); size = ALIGN(size, align); list_for_each_entry_reverse(s, &slab_caches, list) { @@ -252,6 +259,31 @@ out: return ERR_PTR(err); } +static struct kmem_cache * +__kmem_cache_alias(const char *name, unsigned int size, slab_flags_t flags, + struct kmem_cache_args *args) +{ + struct kmem_cache *s; + + s = find_mergeable(size, flags, name, args); + if (s) { + if (sysfs_slab_alias(s, name)) + pr_err("SLUB: Unable to add cache alias %s to sysfs\n", + name); + + s->refcount++; + + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. + */ + s->object_size = max(s->object_size, size); + s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); + } + + return s; +} + /** * __kmem_cache_create_args - Create a kmem cache. * @name: A string which is used in /proc/slabinfo to identify this cache. 
@@ -323,9 +355,7 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, object_size - args->usersize < args->useroffset)) args->usersize = args->useroffset = 0; - if (!args->usersize && !args->sheaf_capacity) - s = __kmem_cache_alias(name, object_size, args->align, flags, - args->ctor); + s = __kmem_cache_alias(name, object_size, flags, args); if (s) goto out_unlock; diff --git a/mm/slub.c b/mm/slub.c index 49223795a7d6..afc3e511ff39 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -350,11 +350,8 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; #ifdef SLAB_SUPPORTS_SYSFS static int sysfs_slab_add(struct kmem_cache *); -static int sysfs_slab_alias(struct kmem_cache *, const char *); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } -static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) - { return 0; } #endif #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG) @@ -8580,31 +8577,6 @@ void __init kmem_cache_init_late(void) WARN_ON(!flushwq); } -struct kmem_cache * -__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, - slab_flags_t flags, void (*ctor)(void *)) -{ - struct kmem_cache *s; - - s = find_mergeable(size, align, flags, name, ctor); - if (s) { - if (sysfs_slab_alias(s, name)) - pr_err("SLUB: Unable to add cache alias %s to sysfs\n", - name); - - s->refcount++; - - /* - * Adjust the object sizes so that we clear - * the complete object on kzalloc. 
- */ - s->object_size = max(s->object_size, size); - s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); - } - - return s; -} - int do_kmem_cache_create(struct kmem_cache *s, const char *name, unsigned int size, struct kmem_cache_args *args, slab_flags_t flags) @@ -9837,7 +9809,7 @@ struct saved_alias { static struct saved_alias *alias_list; -static int sysfs_slab_alias(struct kmem_cache *s, const char *name) +int sysfs_slab_alias(struct kmem_cache *s, const char *name) { struct saved_alias *al; From aff8518575715b8b902d64a4ad8dfa3e33381efa Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:43 +0100 Subject: [PATCH 06/39] mm/slab: make caches with sheaves mergeable Before enabling sheaves for all caches (with automatically determined capacity), their enablement should no longer prevent merging of caches. Limit this merge prevention only to caches that were created with a specific sheaf capacity, by adding the SLAB_NO_MERGE flag to them. Reviewed-by: Harry Yoo Reviewed-by: Liam R. Howlett Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index ee245a880603..5c15a4ce5743 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -162,9 +162,6 @@ int slab_unmergeable(struct kmem_cache *s) return 1; #endif - if (s->cpu_sheaves) - return 1; - /* * We may have set a slab to be unmergeable during bootstrap. */ @@ -189,9 +186,6 @@ static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags, if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize) return NULL; - if (args->sheaf_capacity) - return NULL; - flags = kmem_cache_flags(flags, name); if (flags & SLAB_NEVER_MERGE) @@ -336,6 +330,13 @@ struct kmem_cache *__kmem_cache_create_args(const char *name, flags &= ~SLAB_DEBUG_FLAGS; #endif + /* + * Caches with specific capacity are special enough. It's simpler to + * make them unmergeable. 
+ */ + if (args->sheaf_capacity) + flags |= SLAB_NO_MERGE; + mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, object_size); From d907bf434fcd64c9609aa2983574e7c1f28e5493 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 27 Jan 2026 19:31:50 +0900 Subject: [PATCH 07/39] mm/slab: factor out slab_args_unmergeable() slab_unmergeable() determines whether a slab cache can be merged, but it should not be used when the cache is not fully created yet. Extract the pre-cache-creation mergeability checks into slab_args_unmergeable(), which evaluates kmem_cache_args, slab flags, and slab_nomerge to determine if a cache will be mergeable before it is created. Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260127103151.21883-2-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/mm/slab_common.c b/mm/slab_common.c index 5c15a4ce5743..b6836f8500b6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -171,24 +171,32 @@ int slab_unmergeable(struct kmem_cache *s) return 0; } +static bool slab_args_unmergeable(struct kmem_cache_args *args, + slab_flags_t flags) +{ + if (slab_nomerge) + return true; + + if (args->ctor) + return true; + + if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize) + return true; + + if (flags & SLAB_NEVER_MERGE) + return true; + + return false; +} + static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags, const char *name, struct kmem_cache_args *args) { struct kmem_cache *s; unsigned int align; - if (slab_nomerge) - return NULL; - - if (args->ctor) - return NULL; - - if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize) - return NULL; - flags = kmem_cache_flags(flags, name); - - if (flags & SLAB_NEVER_MERGE) + if (slab_args_unmergeable(args, flags)) return NULL; size = ALIGN(size, sizeof(void *)); From 4b038a9670154e8bb4832d80f0f2b68b1b812171 Mon Sep 17 00:00:00 2001 From: Hao Li 
Date: Wed, 10 Dec 2025 08:26:31 +0800 Subject: [PATCH 08/39] slub: keep empty main sheaf as spare in __pcs_replace_empty_main() When __pcs_replace_empty_main() fails to obtain a full sheaf directly from the barn, it may either: - Refill an empty sheaf obtained via barn_get_empty_sheaf(), or - Allocate a brand new full sheaf via alloc_full_sheaf(). After reacquiring the per-CPU lock, if pcs->main is still empty and pcs->spare is NULL, the current code donates the empty main sheaf to the barn via barn_put_empty_sheaf() and installs the full sheaf as pcs->main, leaving pcs->spare unpopulated. Instead, keep the existing empty main sheaf locally as the spare: pcs->spare = pcs->main; pcs->main = full; This populates pcs->spare earlier, which can reduce future barn traffic. Suggested-by: Vlastimil Babka Signed-off-by: Hao Li Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka Tested-by: Zhao Liu --- mm/slub.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index afc3e511ff39..e90f3e558ae7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5049,7 +5049,10 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, */ if (pcs->main->size == 0) { - barn_put_empty_sheaf(barn, pcs->main); + if (!pcs->spare) + pcs->spare = pcs->main; + else + barn_put_empty_sheaf(barn, pcs->main); pcs->main = full; return pcs; } From e47c897a29491ade20b27612fdd3107c39a07357 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:44 +0100 Subject: [PATCH 09/39] slab: add sheaves to most caches In the first step to replace cpu (partial) slabs with sheaves, enable sheaves for almost all caches. Treat args->sheaf_capacity as a minimum, and calculate sheaf capacity with a formula that roughly follows the formula for number of objects in cpu partial slabs in set_cpu_partial(). 
This should achieve roughly similar contention on the barn spin lock as there's currently for node list_lock without sheaves, to make benchmarking results comparable. It can be further tuned later. Don't enable sheaves for bootstrap caches as that wouldn't work. In order to recognize them by SLAB_NO_OBJ_EXT, make sure the flag exists even for !CONFIG_SLAB_OBJ_EXT. This limitation will be lifted for kmalloc caches after the necessary bootstrapping changes. Also do not enable sheaves for SLAB_NOLEAKTRACE caches to avoid recursion with kmemleak tracking (thanks to Breno Leitao). Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Reviewed-by: Hao Li Tested-by: Breno Leitao Reviewed-by: Liam R. Howlett Tested-by: Zhao Liu Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 6 ----- mm/slub.c | 56 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 2482992248dc..2682ee57ec90 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -57,9 +57,7 @@ enum _slab_flag_bits { #endif _SLAB_OBJECT_POISON, _SLAB_CMPXCHG_DOUBLE, -#ifdef CONFIG_SLAB_OBJ_EXT _SLAB_NO_OBJ_EXT, -#endif _SLAB_FLAGS_LAST_BIT }; @@ -238,11 +236,7 @@ enum _slab_flag_bits { #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* Slab created using create_boot_cache */ -#ifdef CONFIG_SLAB_OBJ_EXT #define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT) -#else -#define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED -#endif /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. 
diff --git a/mm/slub.c b/mm/slub.c index e90f3e558ae7..b6c307114756 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7893,6 +7893,53 @@ static void set_cpu_partial(struct kmem_cache *s) #endif } +static unsigned int calculate_sheaf_capacity(struct kmem_cache *s, + struct kmem_cache_args *args) + +{ + unsigned int capacity; + size_t size; + + + if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS) + return 0; + + /* + * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT). + * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not + * have sheaves to avoid recursion when sheaf allocation triggers + * kmemleak tracking. + */ + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) + return 0; + + /* + * For now we use roughly similar formula (divided by two as there are + * two percpu sheaves) as what was used for percpu partial slabs, which + * should result in similar lock contention (barn or list_lock) + */ + if (s->size >= PAGE_SIZE) + capacity = 4; + else if (s->size >= 1024) + capacity = 12; + else if (s->size >= 256) + capacity = 26; + else + capacity = 60; + + /* Increment capacity to make sheaf exactly a kmalloc size bucket */ + size = struct_size_t(struct slab_sheaf, objects, capacity); + size = kmalloc_size_roundup(size); + capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *); + + /* + * Respect an explicit request for capacity that's typically motivated by + * expected maximum size of kmem_cache_prefill_sheaf() to not end up + * using low-performance oversize sheaves + */ + return max(capacity, args->sheaf_capacity); +} + /* * calculate_sizes() determines the order and the distribution of data within * a slab object. 
@@ -8027,6 +8074,10 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; + /* kmalloc caches need extra care to support sheaves */ + if (!is_kmalloc_cache(s)) + s->sheaf_capacity = calculate_sheaf_capacity(s, args); + /* * Determine the number of objects per slab */ @@ -8631,15 +8682,12 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, set_cpu_partial(s); - if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) - && !(s->flags & SLAB_DEBUG_FLAGS)) { + if (s->sheaf_capacity) { s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); if (!s->cpu_sheaves) { err = -ENOMEM; goto out; } - // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? - s->sheaf_capacity = args->sheaf_capacity; } #ifdef CONFIG_NUMA From f3421f8d154cc0906da145299c72f4a7f046ffde Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:45 +0100 Subject: [PATCH 10/39] slab: introduce percpu sheaves bootstrap Until now, kmem_cache->cpu_sheaves was !NULL only for caches with sheaves enabled. Since we want to enable them for almost all caches, it's suboptimal to test the pointer in the fast paths, so instead allocate it for all caches in do_kmem_cache_create(). Instead of testing the cpu_sheaves pointer to recognize caches (yet) without sheaves, test kmem_cache->sheaf_capacity for being 0, where needed, using a new cache_has_sheaves() helper. However, for the fast paths sake we also assume that the main sheaf always exists (pcs->main is !NULL), and during bootstrap we cannot allocate sheaves yet. Solve this by introducing a single static bootstrap_sheaf that's assigned as pcs->main during bootstrap. It has a size of 0, so during allocations, the fast path will find it's empty. Since the size of 0 matches sheaf_capacity of 0, the freeing fast paths will find it's "full". 
In the slow path handlers, we use cache_has_sheaves() to recognize that the cache doesn't (yet) have real sheaves, and fall back. Thus sharing the single bootstrap sheaf like this for multiple caches and cpus is safe. Reviewed-by: Harry Yoo Reviewed-by: Hao Li Reviewed-by: Liam R. Howlett Signed-off-by: Vlastimil Babka --- mm/slab.h | 12 +++++ mm/slab_common.c | 9 ++-- mm/slub.c | 123 +++++++++++++++++++++++++++++++---------------- 3 files changed, 97 insertions(+), 47 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 0993800fcced..f833eb12b92a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -277,6 +277,18 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; +/* + * Every cache has !NULL s->cpu_sheaves but they may point to the + * bootstrap_sheaf temporarily during init, or permanently for the boot caches + * and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This + * helper distinguishes whether cache has real non-bootstrap sheaves. + */ +static inline bool cache_has_sheaves(struct kmem_cache *s) +{ + /* Test CONFIG_SLUB_TINY for code elimination purposes */ + return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity; +} + #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) #define SLAB_SUPPORTS_SYSFS 1 void sysfs_slab_unlink(struct kmem_cache *s); diff --git a/mm/slab_common.c b/mm/slab_common.c index b6836f8500b6..cea223e5d5b5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1663,11 +1663,8 @@ static bool kfree_rcu_sheaf(void *obj) return false; s = slab->slab_cache; - if (s->cpu_sheaves) { - if (likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id())) - return __kfree_rcu_sheaf(s, obj); - } + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) + return __kfree_rcu_sheaf(s, obj); return false; } @@ -2171,7 +2168,7 @@ EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); */ void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) { - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { 
flush_rcu_sheaves_on_cache(s); rcu_barrier(); } diff --git a/mm/slub.c b/mm/slub.c index b6c307114756..f40406b4166a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2846,12 +2846,23 @@ static void pcs_destroy(struct kmem_cache *s) { int cpu; + /* + * We may be unwinding cache creation that failed before or during the + * allocation of this. + */ + if (!s->cpu_sheaves) + return; + + /* pcs->main can only point to the bootstrap sheaf, nothing to free */ + if (!cache_has_sheaves(s)) + goto free_pcs; + for_each_possible_cpu(cpu) { struct slub_percpu_sheaves *pcs; pcs = per_cpu_ptr(s->cpu_sheaves, cpu); - /* can happen when unwinding failed create */ + /* This can happen when unwinding failed cache creation. */ if (!pcs->main) continue; @@ -2873,6 +2884,7 @@ static void pcs_destroy(struct kmem_cache *s) } } +free_pcs: free_percpu(s->cpu_sheaves); s->cpu_sheaves = NULL; } @@ -4030,7 +4042,7 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s) { struct slub_percpu_sheaves *pcs; - if (!s->cpu_sheaves) + if (!cache_has_sheaves(s)) return false; pcs = per_cpu_ptr(s->cpu_sheaves, cpu); @@ -4052,7 +4064,7 @@ static void flush_cpu_slab(struct work_struct *w) s = sfw->s; - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) pcs_flush_all(s); flush_this_cpu_slab(s); @@ -4157,7 +4169,7 @@ void flush_all_rcu_sheaves(void) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - if (!s->cpu_sheaves) + if (!cache_has_sheaves(s)) continue; flush_rcu_sheaves_on_cache(s); } @@ -4179,7 +4191,7 @@ static int slub_cpu_dead(unsigned int cpu) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { __flush_cpu_slab(s, cpu); - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) __pcs_flush_all_cpu(s, cpu); } mutex_unlock(&slab_mutex); @@ -4979,6 +4991,12 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + /* Bootstrap or debug cache, back off */ + if (unlikely(!cache_has_sheaves(s))) 
{ + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + if (pcs->spare && pcs->spare->size > 0) { swap(pcs->main, pcs->spare); return pcs; @@ -5165,6 +5183,11 @@ next_batch: struct slab_sheaf *full; struct node_barn *barn; + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return allocated; + } + if (pcs->spare && pcs->spare->size > 0) { swap(pcs->main, pcs->spare); goto do_alloc; @@ -5244,8 +5267,7 @@ static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list if (unlikely(object)) goto out; - if (s->cpu_sheaves) - object = alloc_from_pcs(s, gfpflags, node); + object = alloc_from_pcs(s, gfpflags, node); if (!object) object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); @@ -5353,18 +5375,10 @@ kmem_cache_prefill_sheaf(struct kmem_cache *s, gfp_t gfp, unsigned int size) struct slab_sheaf *sheaf = NULL; struct node_barn *barn; - if (unlikely(size > s->sheaf_capacity)) { + if (unlikely(!size)) + return NULL; - /* - * slab_debug disables cpu sheaves intentionally so all - * prefilled sheaves become "oversize" and we give up on - * performance for the debugging. Same with SLUB_TINY. - * Creating a cache without sheaves and then requesting a - * prefilled sheaf is however not expected, so warn. 
- */ - WARN_ON_ONCE(s->sheaf_capacity == 0 && - !IS_ENABLED(CONFIG_SLUB_TINY) && - !(s->flags & SLAB_DEBUG_FLAGS)); + if (unlikely(size > s->sheaf_capacity)) { sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); if (!sheaf) @@ -6082,6 +6096,12 @@ __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) restart: lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); + /* Bootstrap or debug cache, back off */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + return NULL; + } + barn = get_barn(s); if (!barn) { local_unlock(&s->cpu_sheaves->lock); @@ -6298,6 +6318,12 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) struct slab_sheaf *empty; struct node_barn *barn; + /* Bootstrap or debug cache, fall back */ + if (unlikely(!cache_has_sheaves(s))) { + local_unlock(&s->cpu_sheaves->lock); + goto fail; + } + if (pcs->spare && pcs->spare->size == 0) { pcs->rcu_free = pcs->spare; pcs->spare = NULL; @@ -6694,9 +6720,8 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) return; - if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || - slab_nid(slab) == numa_mem_id()) - && likely(!slab_test_pfmemalloc(slab))) { + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) + && likely(!slab_test_pfmemalloc(slab))) { if (likely(free_to_pcs(s, object))) return; } @@ -7409,7 +7434,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) * freeing to sheaves is so incompatible with the detached freelist so * once we go that way, we have to do everything differently */ - if (s && s->cpu_sheaves) { + if (s && cache_has_sheaves(s)) { free_to_pcs_bulk(s, size, p); return; } @@ -7520,8 +7545,7 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, size--; } - if (s->cpu_sheaves) - i = alloc_from_pcs_bulk(s, size, p); + i = alloc_from_pcs_bulk(s, size, p); if 
(i < size) { /* @@ -7732,6 +7756,7 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) static int init_percpu_sheaves(struct kmem_cache *s) { + static struct slab_sheaf bootstrap_sheaf = {}; int cpu; for_each_possible_cpu(cpu) { @@ -7741,7 +7766,28 @@ static int init_percpu_sheaves(struct kmem_cache *s) local_trylock_init(&pcs->lock); - pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); + /* + * Bootstrap sheaf has zero size so fast-path allocation fails. + * It has also size == s->sheaf_capacity, so fast-path free + * fails. In the slow paths we recognize the situation by + * checking s->sheaf_capacity. This allows fast paths to assume + * s->cpu_sheaves and pcs->main always exists and are valid. + * It's also safe to share the single static bootstrap_sheaf + * with zero-sized objects array as it's never modified. + * + * Bootstrap_sheaf also has NULL pointer to kmem_cache so we + * recognize it and not attempt to free it when destroying the + * cache. + * + * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node, + * caches with debug enabled, and all caches with SLUB_TINY. + * For kmalloc caches it's used temporarily during the initial + * bootstrap. 
+ */ + if (!s->sheaf_capacity) + pcs->main = &bootstrap_sheaf; + else + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); if (!pcs->main) return -ENOMEM; @@ -7816,8 +7862,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); - if (s->cpu_sheaves) - pcs_destroy(s); + pcs_destroy(s); #ifdef CONFIG_PREEMPT_RT if (s->cpu_slab) lockdep_unregister_key(&s->lock_key); @@ -7839,7 +7884,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) continue; } - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); if (!barn) @@ -8162,7 +8207,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) flush_all_cpus_locked(s); /* we might have rcu sheaves in flight */ - if (s->cpu_sheaves) + if (cache_has_sheaves(s)) rcu_barrier(); /* Attempt to free all objects */ @@ -8474,7 +8519,7 @@ static int slab_mem_going_online_callback(int nid) if (get_node(s, nid)) continue; - if (s->cpu_sheaves) { + if (cache_has_sheaves(s)) { barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); if (!barn) { @@ -8682,12 +8727,10 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, set_cpu_partial(s); - if (s->sheaf_capacity) { - s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); - if (!s->cpu_sheaves) { - err = -ENOMEM; - goto out; - } + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); + if (!s->cpu_sheaves) { + err = -ENOMEM; + goto out; } #ifdef CONFIG_NUMA @@ -8706,11 +8749,9 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!alloc_kmem_cache_cpus(s)) goto out; - if (s->cpu_sheaves) { - err = init_percpu_sheaves(s); - if (err) - goto out; - } + err = init_percpu_sheaves(s); + if (err) + goto out; err = 0; From f1427a1d64156bb88d84f364855c364af6f67a3b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:46 +0100 Subject: [PATCH 11/39] slab: make percpu sheaves compatible with 
kmalloc_nolock()/kfree_nolock() Before we enable percpu sheaves for kmalloc caches, we need to make sure kmalloc_nolock() and kfree_nolock() will continue working properly and not spin when not allowed to. Percpu sheaves themselves use local_trylock() so they are already compatible. We just need to be careful with the barn->lock spin_lock. Pass a new allow_spin parameter where necessary to use spin_trylock_irqsave(). In kmalloc_nolock_noprof() we can now attempt alloc_from_pcs() safely, for now it will always fail until we enable sheaves for kmalloc caches next. Similarly in kfree_nolock() we can attempt free_to_pcs(). Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Reviewed-by: Hao Li Reviewed-by: Liam R. Howlett Signed-off-by: Vlastimil Babka --- mm/slub.c | 82 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 22 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f40406b4166a..988433435779 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2889,7 +2889,8 @@ free_pcs: s->cpu_sheaves = NULL; } -static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) +static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn, + bool allow_spin) { struct slab_sheaf *empty = NULL; unsigned long flags; @@ -2897,7 +2898,10 @@ static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) if (!data_race(barn->nr_empty)) return NULL; - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return NULL; if (likely(barn->nr_empty)) { empty = list_first_entry(&barn->sheaves_empty, @@ -2974,7 +2978,8 @@ static struct slab_sheaf *barn_get_full_or_empty_sheaf(struct node_barn *barn) * change. 
*/ static struct slab_sheaf * -barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) +barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty, + bool allow_spin) { struct slab_sheaf *full = NULL; unsigned long flags; @@ -2982,7 +2987,10 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) if (!data_race(barn->nr_full)) return NULL; - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return NULL; if (likely(barn->nr_full)) { full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, @@ -3003,7 +3011,8 @@ barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) * barn. But if there are too many full sheaves, reject this with -E2BIG. */ static struct slab_sheaf * -barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) +barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full, + bool allow_spin) { struct slab_sheaf *empty; unsigned long flags; @@ -3014,7 +3023,10 @@ barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) if (!data_race(barn->nr_empty)) return ERR_PTR(-ENOMEM); - spin_lock_irqsave(&barn->lock, flags); + if (likely(allow_spin)) + spin_lock_irqsave(&barn->lock, flags); + else if (!spin_trylock_irqsave(&barn->lock, flags)) + return ERR_PTR(-EBUSY); if (likely(barn->nr_empty)) { empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, @@ -5008,7 +5020,8 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, return NULL; } - full = barn_replace_empty_sheaf(barn, pcs->main); + full = barn_replace_empty_sheaf(barn, pcs->main, + gfpflags_allow_spinning(gfp)); if (full) { stat(s, BARN_GET); @@ -5025,7 +5038,7 @@ __pcs_replace_empty_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, empty = pcs->spare; pcs->spare = NULL; } else { - empty = barn_get_empty_sheaf(barn); + 
empty = barn_get_empty_sheaf(barn, true); } } @@ -5165,7 +5178,8 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) } static __fastpath_inline -unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) +unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size, + void **p) { struct slub_percpu_sheaves *pcs; struct slab_sheaf *main; @@ -5199,7 +5213,8 @@ next_batch: return allocated; } - full = barn_replace_empty_sheaf(barn, pcs->main); + full = barn_replace_empty_sheaf(barn, pcs->main, + gfpflags_allow_spinning(gfp)); if (full) { stat(s, BARN_GET); @@ -5700,7 +5715,7 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; struct kmem_cache *s; bool can_retry = true; - void *ret = ERR_PTR(-EBUSY); + void *ret; VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | __GFP_NO_OBJ_EXT)); @@ -5731,6 +5746,12 @@ retry: */ return NULL; + ret = alloc_from_pcs(s, alloc_gfp, node); + if (ret) + goto success; + + ret = ERR_PTR(-EBUSY); + /* * Do not call slab_alloc_node(), since trylock mode isn't * compatible with slab_pre_alloc_hook/should_failslab and @@ -5767,6 +5788,7 @@ retry: ret = NULL; } +success: maybe_wipe_obj_freeptr(s, ret); slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, slab_want_init_on_alloc(alloc_gfp, s), size); @@ -6087,7 +6109,8 @@ static void __pcs_install_empty_sheaf(struct kmem_cache *s, * unlocked. 
*/ static struct slub_percpu_sheaves * -__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) +__pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, + bool allow_spin) { struct slab_sheaf *empty; struct node_barn *barn; @@ -6111,7 +6134,7 @@ restart: put_fail = false; if (!pcs->spare) { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, allow_spin); if (empty) { pcs->spare = pcs->main; pcs->main = empty; @@ -6125,7 +6148,7 @@ restart: return pcs; } - empty = barn_replace_full_sheaf(barn, pcs->main); + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin); if (!IS_ERR(empty)) { stat(s, BARN_PUT); @@ -6133,7 +6156,8 @@ restart: return pcs; } - if (PTR_ERR(empty) == -E2BIG) { + /* sheaf_flush_unused() doesn't support !allow_spin */ + if (PTR_ERR(empty) == -E2BIG && allow_spin) { /* Since we got here, spare exists and is full */ struct slab_sheaf *to_flush = pcs->spare; @@ -6158,6 +6182,14 @@ restart: alloc_empty: local_unlock(&s->cpu_sheaves->lock); + /* + * alloc_empty_sheaf() doesn't support !allow_spin and it's + * easier to fall back to freeing directly without sheaves + * than add the support (and to sheaf_flush_unused() above) + */ + if (!allow_spin) + return NULL; + empty = alloc_empty_sheaf(s, GFP_NOWAIT); if (empty) goto got_empty; @@ -6200,7 +6232,7 @@ got_empty: * The object is expected to have passed slab_free_hook() already. 
*/ static __fastpath_inline -bool free_to_pcs(struct kmem_cache *s, void *object) +bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) { struct slub_percpu_sheaves *pcs; @@ -6211,7 +6243,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object) if (unlikely(pcs->main->size == s->sheaf_capacity)) { - pcs = __pcs_replace_full_main(s, pcs); + pcs = __pcs_replace_full_main(s, pcs, allow_spin); if (unlikely(!pcs)) return false; } @@ -6336,7 +6368,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) goto fail; } - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); if (empty) { pcs->rcu_free = empty; @@ -6456,7 +6488,7 @@ next_batch: goto no_empty; if (!pcs->spare) { - empty = barn_get_empty_sheaf(barn); + empty = barn_get_empty_sheaf(barn, true); if (!empty) goto no_empty; @@ -6470,7 +6502,7 @@ next_batch: goto do_free; } - empty = barn_replace_full_sheaf(barn, pcs->main); + empty = barn_replace_full_sheaf(barn, pcs->main, true); if (IS_ERR(empty)) { stat(s, BARN_PUT_FAIL); goto no_empty; @@ -6722,7 +6754,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) && likely(!slab_test_pfmemalloc(slab))) { - if (likely(free_to_pcs(s, object))) + if (likely(free_to_pcs(s, object, true))) return; } @@ -6993,6 +7025,12 @@ void kfree_nolock(const void *object) * since kasan quarantine takes locks and not supported from NMI. 
*/ kasan_slab_free(s, x, false, false, /* skip quarantine */true); + + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { + if (likely(free_to_pcs(s, x, false))) + return; + } + do_slab_free(s, slab, x, x, 0, _RET_IP_); } EXPORT_SYMBOL_GPL(kfree_nolock); @@ -7545,7 +7583,7 @@ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, size--; } - i = alloc_from_pcs_bulk(s, size, p); + i = alloc_from_pcs_bulk(s, flags, size, p); if (i < size) { /* From 913ffd3a1bf5d154995c6cfab44994b07b3c103f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:47 +0100 Subject: [PATCH 12/39] slab: handle kmalloc sheaves bootstrap Enable sheaves for kmalloc caches. For other types than KMALLOC_NORMAL, we can simply allow them in calculate_sizes() as they are created later than KMALLOC_NORMAL caches and can allocate sheaves and barns from those. For KMALLOC_NORMAL caches we perform additional step after first creating them without sheaves. Then bootstrap_cache_sheaves() simply allocates and initializes barns and sheaves and finally sets s->sheaf_capacity to make them actually used. Afterwards the only caches left without sheaves (unless SLUB_TINY or debugging is enabled) are kmem_cache and kmem_cache_node. These are only used when creating or destroying other kmem_caches. Thus they are not performance critical and we can simply leave it that way. Reviewed-by: Harry Yoo Reviewed-by: Hao Li Reviewed-by: Liam R. 
Howlett Signed-off-by: Vlastimil Babka --- mm/slub.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 84 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 988433435779..cd8d3712b195 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2593,7 +2593,8 @@ static void *setup_object(struct kmem_cache *s, void *object) return object; } -static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) +static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, + unsigned int capacity) { struct slab_sheaf *sheaf; size_t sheaf_size; @@ -2611,7 +2612,7 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) if (s->flags & SLAB_KMALLOC) gfp |= __GFP_NO_OBJ_EXT; - sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); + sheaf_size = struct_size(sheaf, objects, capacity); sheaf = kzalloc(sheaf_size, gfp); if (unlikely(!sheaf)) @@ -2624,6 +2625,12 @@ static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) return sheaf; } +static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, + gfp_t gfp) +{ + return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity); +} + static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) { kfree(sheaf); @@ -8157,8 +8164,11 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; - /* kmalloc caches need extra care to support sheaves */ - if (!is_kmalloc_cache(s)) + /* + * For KMALLOC_NORMAL caches we enable sheaves later by + * bootstrap_kmalloc_sheaves() to avoid recursion + */ + if (!is_kmalloc_normal(s)) s->sheaf_capacity = calculate_sheaf_capacity(s, args); /* @@ -8653,6 +8663,74 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) return s; } +/* + * Finish the sheaves initialization done normally by init_percpu_sheaves() and + * init_kmem_cache_nodes(). 
For normal kmalloc caches we have to bootstrap it + * since sheaves and barns are allocated by kmalloc. + */ +static void __init bootstrap_cache_sheaves(struct kmem_cache *s) +{ + struct kmem_cache_args empty_args = {}; + unsigned int capacity; + bool failed = false; + int node, cpu; + + capacity = calculate_sheaf_capacity(s, &empty_args); + + /* capacity can be 0 due to debugging or SLUB_TINY */ + if (!capacity) + return; + + for_each_node_mask(node, slab_nodes) { + struct node_barn *barn; + + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); + + if (!barn) { + failed = true; + goto out; + } + + barn_init(barn); + get_node(s, node)->barn = barn; + } + + for_each_possible_cpu(cpu) { + struct slub_percpu_sheaves *pcs; + + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); + + pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity); + + if (!pcs->main) { + failed = true; + break; + } + } + +out: + /* + * It's still early in boot so treat this like same as a failure to + * create the kmalloc cache in the first place + */ + if (failed) + panic("Out of memory when creating kmem_cache %s\n", s->name); + + s->sheaf_capacity = capacity; +} + +static void __init bootstrap_kmalloc_sheaves(void) +{ + enum kmalloc_cache_type type; + + for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) { + for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) { + if (kmalloc_caches[type][idx]) + bootstrap_cache_sheaves(kmalloc_caches[type][idx]); + } + } +} + void __init kmem_cache_init(void) { static __initdata struct kmem_cache boot_kmem_cache, @@ -8696,6 +8774,8 @@ void __init kmem_cache_init(void) setup_kmalloc_cache_index_table(); create_kmalloc_caches(); + bootstrap_kmalloc_sheaves(); + /* Setup random freelists for each cache */ init_freelist_randomization(); From ed30c4adfc2b56909ca43fb5e4750a646928cbf4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:48 +0100 Subject: [PATCH 13/39] slab: add optimized sheaf refill from partial list At this point we 
have sheaves enabled for all caches, but their refill is done via __kmem_cache_alloc_bulk() which relies on cpu (partial) slabs - now a redundant caching layer that we are about to remove. The refill will thus be done from slabs on the node partial list. Introduce new functions that can do that in an optimized way as it's easier than modifying the __kmem_cache_alloc_bulk() call chain. Introduce struct partial_bulk_context, a variant of struct partial_context that can return a list of slabs from the partial list with the sum of free objects in them within the requested min and max. Introduce get_partial_node_bulk() that removes the slabs from the node partial list and returns them in the list. There is a racy read of slab->counters so make sure the non-atomic write in __update_freelist_slow() is not tearing. Introduce get_freelist_nofreeze() which grabs the freelist without freezing the slab. Introduce alloc_from_new_slab() which can allocate multiple objects from a newly allocated slab where we don't need to synchronize with freeing. In some aspects it's similar to alloc_single_from_new_slab() but assumes the cache is a non-debug one so it can avoid some actions. It supports the allow_spin parameter, which we always set true here, but the followup change will reuse the function in a context where it may be false. Introduce __refill_objects() that uses the functions above to fill an array of objects. It has to handle the possibility that the slabs will contain more objects than were requested, due to concurrent freeing of objects to those slabs. When no more slabs on partial lists are available, it will allocate new slabs. It is intended to be used only in contexts where spinning is allowed, so add a WARN_ON_ONCE check there. Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are only refilled from contexts that allow spinning, or even blocking.
Reviewed-by: Suren Baghdasaryan Reviewed-by: Hao Li Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 293 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 272 insertions(+), 21 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index cd8d3712b195..872340cc5f92 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -248,6 +248,14 @@ struct partial_context { void *object; }; +/* Structure holding parameters for get_partial_node_bulk() */ +struct partial_bulk_context { + gfp_t flags; + unsigned int min_objects; + unsigned int max_objects; + struct list_head slabs; +}; + static inline bool kmem_cache_debug(struct kmem_cache *s) { return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS); @@ -779,7 +787,8 @@ __update_freelist_slow(struct slab *slab, struct freelist_counters *old, if (slab->freelist == old->freelist && slab->counters == old->counters) { slab->freelist = new->freelist; - slab->counters = new->counters; + /* prevent tearing for the read in get_partial_node_bulk() */ + WRITE_ONCE(slab->counters, new->counters); ret = true; } slab_unlock(slab); @@ -2638,9 +2647,9 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) stat(s, SHEAF_FREE); } -static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, - size_t size, void **p); - +static unsigned int +__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max); static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, gfp_t gfp) @@ -2651,8 +2660,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, if (!to_fill) return 0; - filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, - &sheaf->objects[sheaf->size]); + filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp, + to_fill, to_fill); sheaf->size += filled; @@ -3518,6 +3527,57 @@ static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, #endif static inline bool pfmemalloc_match(struct slab *slab, gfp_t 
gfpflags); +static bool get_partial_node_bulk(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_bulk_context *pc) +{ + struct slab *slab, *slab2; + unsigned int total_free = 0; + unsigned long flags; + + /* Racy check to avoid taking the lock unnecessarily. */ + if (!n || data_race(!n->nr_partial)) + return false; + + INIT_LIST_HEAD(&pc->slabs); + + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { + struct freelist_counters flc; + unsigned int slab_free; + + if (!pfmemalloc_match(slab, pc->flags)) + continue; + + /* + * determine the number of free objects in the slab racily + * + * slab_free is a lower bound due to possible subsequent + * concurrent freeing, so the caller may get more objects than + * requested and must handle that + */ + flc.counters = data_race(READ_ONCE(slab->counters)); + slab_free = flc.objects - flc.inuse; + + /* we have already min and this would get us over the max */ + if (total_free >= pc->min_objects + && total_free + slab_free > pc->max_objects) + break; + + remove_partial(n, slab); + + list_add(&slab->slab_list, &pc->slabs); + + total_free += slab_free; + if (total_free >= pc->max_objects) + break; + } + + spin_unlock_irqrestore(&n->list_lock, flags); + return total_free > 0; +} + /* * Try to allocate a partial slab from a specific node. */ @@ -4444,6 +4504,33 @@ static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) return old.freelist; } +/* + * Get the slab's freelist and do not freeze it. + * + * Assumes the slab is isolated from node partial list and not frozen. + * + * Assumes this is performed only for caches without debugging so we + * don't need to worry about adding the slab to the full list. 
+ */ +static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab) +{ + struct freelist_counters old, new; + + do { + old.freelist = slab->freelist; + old.counters = slab->counters; + + new.freelist = NULL; + new.counters = old.counters; + VM_WARN_ON_ONCE(new.frozen); + + new.inuse = old.objects; + + } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); + + return old.freelist; +} + /* * Freeze the partial slab and return the pointer to the freelist. */ @@ -4467,6 +4554,72 @@ static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) return old.freelist; } +/* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + * + * Note that we also wipe custom freelist pointers. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj && + !freeptr_outside_object(s)) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); +} + +static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, + void **p, unsigned int count, bool allow_spin) +{ + unsigned int allocated = 0; + struct kmem_cache_node *n; + bool needs_add_partial; + unsigned long flags; + void *object; + + /* + * Are we going to put the slab on the partial list? + * Note slab->inuse is 0 on a new slab. 
+ */ + needs_add_partial = (slab->objects > count); + + if (!allow_spin && needs_add_partial) { + + n = get_node(s, slab_nid(slab)); + + if (!spin_trylock_irqsave(&n->list_lock, flags)) { + /* Unlucky, discard newly allocated slab */ + defer_deactivate_slab(slab, NULL); + return 0; + } + } + + object = slab->freelist; + while (object && allocated < count) { + p[allocated] = object; + object = get_freepointer(s, object); + maybe_wipe_obj_freeptr(s, p[allocated]); + + slab->inuse++; + allocated++; + } + slab->freelist = object; + + if (needs_add_partial) { + + if (allow_spin) { + n = get_node(s, slab_nid(slab)); + spin_lock_irqsave(&n->list_lock, flags); + } + add_partial(n, slab, DEACTIVATE_TO_HEAD); + spin_unlock_irqrestore(&n->list_lock, flags); + } + + inc_slabs_node(s, slab_nid(slab), slab->objects); + return allocated; +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. @@ -4909,21 +5062,6 @@ redo: return object; } -/* - * If the object has been wiped upon free, make sure it's fully initialized by - * zeroing out freelist pointer. - * - * Note that we also wipe custom freelist pointers. 
- */ -static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, - void *obj) -{ - if (unlikely(slab_want_init_on_free(s)) && obj && - !freeptr_outside_object(s)) - memset((void *)((char *)kasan_reset_tag(obj) + s->offset), - 0, sizeof(void *)); -} - static __fastpath_inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { @@ -5384,6 +5522,9 @@ static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, return ret; } +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p); + /* * returns a sheaf that has at least the requested size * when prefilling is needed, do so with given gfp flags @@ -7497,6 +7638,116 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); +static unsigned int +__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + struct partial_bulk_context pc; + struct slab *slab, *slab2; + unsigned int refilled = 0; + unsigned long flags; + void *object; + int node; + + pc.flags = gfp; + pc.min_objects = min; + pc.max_objects = max; + + node = numa_mem_id(); + + if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) + return 0; + + /* TODO: consider also other nodes? */ + if (!get_partial_node_bulk(s, get_node(s, node), &pc)) + goto new_slab; + + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + list_del(&slab->slab_list); + + object = get_freelist_nofreeze(s, slab); + + while (object && refilled < max) { + p[refilled] = object; + object = get_freepointer(s, object); + maybe_wipe_obj_freeptr(s, p[refilled]); + + refilled++; + } + + /* + * Freelist had more objects than we can accommodate, we need to + * free them back. We can treat it like a detached freelist, just + * need to find the tail object. 
+ */ + if (unlikely(object)) { + void *head = object; + void *tail; + int cnt = 0; + + do { + tail = object; + cnt++; + object = get_freepointer(s, object); + } while (object); + do_slab_free(s, slab, head, tail, cnt, _RET_IP_); + } + + if (refilled >= max) + break; + } + + if (unlikely(!list_empty(&pc.slabs))) { + struct kmem_cache_node *n = get_node(s, node); + + spin_lock_irqsave(&n->list_lock, flags); + + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) + continue; + + list_del(&slab->slab_list); + add_partial(n, slab, DEACTIVATE_TO_HEAD); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + /* any slabs left are completely free and for discard */ + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { + + list_del(&slab->slab_list); + discard_slab(s, slab); + } + } + + + if (likely(refilled >= min)) + goto out; + +new_slab: + + slab = new_slab(s, pc.flags, node); + if (!slab) + goto out; + + stat(s, ALLOC_SLAB); + + /* + * TODO: possible optimization - if we know we will consume the whole + * slab we might skip creating the freelist? + */ + refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled, + /* allow_spin = */ true); + + if (refilled < min) + goto new_slab; +out: + + return refilled; +} + static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) From 17c38c88294d75506c67cae378c9e940d1ce55e3 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:49 +0100 Subject: [PATCH 14/39] slab: remove cpu (partial) slabs usage from allocation paths We now rely on sheaves as the percpu caching layer and can refill them directly from partial or newly allocated slabs. Start removing the cpu (partial) slabs code, first from allocation paths. 
This means that any allocation not satisfied from percpu sheaves will end up in ___slab_alloc(), where we remove the usage of cpu (partial) slabs, so it will only perform get_partial() or new_slab(). In the latter case we reuse alloc_from_new_slab() (when we don't use the debug/tiny alloc_single_from_new_slab() variant). In get_partial_node() we used to return a slab for freezing as the cpu slab and to refill the partial slab. Now we only want to return a single object and leave the slab on the list (unless it became full). We can't simply reuse alloc_single_from_partial() as that assumes freeing uses free_to_partial_list(). Instead we need to use __slab_update_freelist() to work properly against a racing __slab_free(). To reflect the new purpose of get_partial() functions, rename them to get_from_partial(), get_from_partial_node(), and get_from_any_partial(). The rest of the changes is removing functions that no longer have any callers. Reviewed-by: Harry Yoo Reviewed-by: Hao Li Reviewed-by: Suren Baghdasaryan Acked-by: Alexei Starovoitov Signed-off-by: Vlastimil Babka --- mm/slub.c | 628 ++++++++---------------------------------------------- 1 file changed, 87 insertions(+), 541 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 872340cc5f92..75e085b5ad6f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -241,11 +241,10 @@ DEFINE_STATIC_KEY_FALSE(slub_debug_enabled); static DEFINE_STATIC_KEY_FALSE(strict_numa); #endif -/* Structure holding parameters for get_partial() call chain */ +/* Structure holding parameters for get_from_partial() call chain */ struct partial_context { gfp_t flags; unsigned int orig_size; - void *object; }; /* Structure holding parameters for get_partial_node_bulk() */ @@ -604,36 +603,6 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_ptr_decode(s, p, ptr_addr); } -static void prefetch_freepointer(const struct kmem_cache *s, void *object) -{ - prefetchw(object + s->offset); -} - -/* - * When running 
under KMSAN, get_freepointer_safe() may return an uninitialized - * pointer value in the case the current thread loses the race for the next - * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in - * slab_alloc_node() will fail, so the uninitialized value won't be used, but - * KMSAN will still check all arguments of cmpxchg because of imperfect - * handling of inline assembly. - * To work around this problem, we apply __no_kmsan_checks to ensure that - * get_freepointer_safe() returns initialized memory. - */ -__no_kmsan_checks -static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) -{ - unsigned long freepointer_addr; - freeptr_t p; - - if (!debug_pagealloc_enabled_static()) - return get_freepointer(s, object); - - object = kasan_reset_tag(object); - freepointer_addr = (unsigned long)object + s->offset; - copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); - return freelist_ptr_decode(s, p, freepointer_addr); -} - static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) { unsigned long freeptr_addr = (unsigned long)object + s->offset; @@ -713,23 +682,11 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); s->cpu_partial_slabs = nr_slabs; } - -static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) -{ - return s->cpu_partial_slabs; -} -#else -#ifdef SLAB_SUPPORTS_SYSFS +#elif defined(SLAB_SUPPORTS_SYSFS) static inline void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) { } -#endif - -static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) -{ - return 0; -} #endif /* CONFIG_SLUB_CPU_PARTIAL */ /* @@ -1071,7 +1028,7 @@ static void set_track_update(struct kmem_cache *s, void *object, p->handle = handle; #endif p->addr = addr; - p->cpu = smp_processor_id(); + p->cpu = raw_smp_processor_id(); p->pid = current->pid; p->when = jiffies; } @@ -3579,20 
+3536,20 @@ static bool get_partial_node_bulk(struct kmem_cache *s, } /* - * Try to allocate a partial slab from a specific node. + * Try to allocate object from a partial slab on a specific node. */ -static struct slab *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n, - struct partial_context *pc) +static void *get_from_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, + struct partial_context *pc) { - struct slab *slab, *slab2, *partial = NULL; + struct slab *slab, *slab2; unsigned long flags; - unsigned int partial_slabs = 0; + void *object = NULL; /* * Racy check. If we mistakenly see no partial slabs then we * just allocate an empty slab. If we mistakenly try to get a - * partial slab and there is none available then get_partial() + * partial slab and there is none available then get_from_partial() * will return NULL. */ if (!n || !n->nr_partial) @@ -3603,54 +3560,55 @@ static struct slab *get_partial_node(struct kmem_cache *s, else if (!spin_trylock_irqsave(&n->list_lock, flags)) return NULL; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { + + struct freelist_counters old, new; + if (!pfmemalloc_match(slab, pc->flags)) continue; if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - void *object = alloc_single_from_partial(s, n, slab, + object = alloc_single_from_partial(s, n, slab, pc->orig_size); - if (object) { - partial = slab; - pc->object = object; + if (object) break; - } continue; } - remove_partial(n, slab); + /* + * get a single object from the slab. This might race against + * __slab_free(), which however has to take the list_lock if + * it's about to make the slab fully free. 
+ */ + do { + old.freelist = slab->freelist; + old.counters = slab->counters; - if (!partial) { - partial = slab; - stat(s, ALLOC_FROM_PARTIAL); + new.freelist = get_freepointer(s, old.freelist); + new.counters = old.counters; + new.inuse++; - if ((slub_get_cpu_partial(s) == 0)) { - break; - } - } else { - put_cpu_partial(s, slab, 0); - stat(s, CPU_PARTIAL_NODE); + } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node")); - if (++partial_slabs > slub_get_cpu_partial(s) / 2) { - break; - } - } + object = old.freelist; + if (!new.freelist) + remove_partial(n, slab); + + break; } spin_unlock_irqrestore(&n->list_lock, flags); - return partial; + return object; } /* - * Get a slab from somewhere. Search in increasing NUMA distances. + * Get an object from somewhere. Search in increasing NUMA distances. */ -static struct slab *get_any_partial(struct kmem_cache *s, - struct partial_context *pc) +static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(pc->flags); - struct slab *slab; unsigned int cpuset_mems_cookie; /* @@ -3685,8 +3643,10 @@ static struct slab *get_any_partial(struct kmem_cache *s, if (n && cpuset_zone_allowed(zone, pc->flags) && n->nr_partial > s->min_partial) { - slab = get_partial_node(s, n, pc); - if (slab) { + + void *object = get_from_partial_node(s, n, pc); + + if (object) { /* * Don't check read_mems_allowed_retry() * here - if mems_allowed was updated in @@ -3694,7 +3654,7 @@ static struct slab *get_any_partial(struct kmem_cache *s, * between allocation and the cpuset * update */ - return slab; + return object; } } } @@ -3704,22 +3664,22 @@ static struct slab *get_any_partial(struct kmem_cache *s, } /* - * Get a partial slab, lock it and return it. 
+ * Get an object from a partial slab */ -static struct slab *get_partial(struct kmem_cache *s, int node, - struct partial_context *pc) +static void *get_from_partial(struct kmem_cache *s, int node, + struct partial_context *pc) { - struct slab *slab; int searchnode = node; + void *object; if (node == NUMA_NO_NODE) searchnode = numa_mem_id(); - slab = get_partial_node(s, get_node(s, searchnode), pc); - if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) - return slab; + object = get_from_partial_node(s, get_node(s, searchnode), pc); + if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) + return object; - return get_any_partial(s, pc); + return get_from_any_partial(s, pc); } #ifdef CONFIG_PREEMPTION @@ -4277,19 +4237,6 @@ static int slub_cpu_dead(unsigned int cpu) return 0; } -/* - * Check if the objects in a per cpu structure fit numa - * locality expectations. - */ -static inline int node_match(struct slab *slab, int node) -{ -#ifdef CONFIG_NUMA - if (node != NUMA_NO_NODE && slab_nid(slab) != node) - return 0; -#endif - return 1; -} - #ifdef CONFIG_SLUB_DEBUG static int count_free(struct slab *slab) { @@ -4474,36 +4421,6 @@ __update_cpu_freelist_fast(struct kmem_cache *s, &old.freelist_tid, new.freelist_tid); } -/* - * Check the slab->freelist and either transfer the freelist to the - * per cpu freelist or deactivate the slab. - * - * The slab is still frozen if the return value is not NULL. - * - * If this function returns NULL then the slab has been unfrozen. 
- */ -static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) -{ - struct freelist_counters old, new; - - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); - - do { - old.freelist = slab->freelist; - old.counters = slab->counters; - - new.freelist = NULL; - new.counters = old.counters; - - new.inuse = old.objects; - new.frozen = old.freelist != NULL; - - - } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); - - return old.freelist; -} - /* * Get the slab's freelist and do not freeze it. * @@ -4531,29 +4448,6 @@ static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *sla return old.freelist; } -/* - * Freeze the partial slab and return the pointer to the freelist. - */ -static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) -{ - struct freelist_counters old, new; - - do { - old.freelist = slab->freelist; - old.counters = slab->counters; - - new.freelist = NULL; - new.counters = old.counters; - VM_BUG_ON(new.frozen); - - new.inuse = old.objects; - new.frozen = 1; - - } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); - - return old.freelist; -} - /* * If the object has been wiped upon free, make sure it's fully initialized by * zeroing out freelist pointer. @@ -4621,170 +4515,23 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, } /* - * Slow path. The lockless freelist is empty or we need to perform - * debugging duties. + * Slow path. We failed to allocate via percpu sheaves or they are not available + * due to bootstrap or debugging enabled or SLUB_TINY. * - * Processing is still very fast if new objects have been freed to the - * regular freelist. In that case we simply take over the regular freelist - * as the lockless freelist and zap the regular freelist. - * - * If that is not working then we fall back to the partial lists. 
We take the - * first element of the freelist as the object to allocate now and move the - * rest of the freelist to the lockless freelist. - * - * And if we were unable to get a new slab from the partial slab lists then - * we need to allocate a new slab. This is the slowest path since it involves - * a call to the page allocator and the setup of a new slab. - * - * Version of __slab_alloc to use when we know that preemption is - * already disabled (which is the case for bulk allocation). + * We try to allocate from partial slab lists and fall back to allocating a new + * slab. */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) + unsigned long addr, unsigned int orig_size) { bool allow_spin = gfpflags_allow_spinning(gfpflags); - void *freelist; + void *object; struct slab *slab; - unsigned long flags; struct partial_context pc; bool try_thisnode = true; stat(s, ALLOC_SLOWPATH); -reread_slab: - - slab = READ_ONCE(c->slab); - if (!slab) { - /* - * if the node is not online or has no normal memory, just - * ignore the node constraint - */ - if (unlikely(node != NUMA_NO_NODE && - !node_isset(node, slab_nodes))) - node = NUMA_NO_NODE; - goto new_slab; - } - - if (unlikely(!node_match(slab, node))) { - /* - * same as above but node_match() being false already - * implies node != NUMA_NO_NODE. - * - * We don't strictly honor pfmemalloc and NUMA preferences - * when !allow_spin because: - * - * 1. Most kmalloc() users allocate objects on the local node, - * so kmalloc_nolock() tries not to interfere with them by - * deactivating the cpu slab. - * - * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause - * unnecessary slab allocations even when n->partial list - * is not empty. 
- */ - if (!node_isset(node, slab_nodes) || - !allow_spin) { - node = NUMA_NO_NODE; - } else { - stat(s, ALLOC_NODE_MISMATCH); - goto deactivate_slab; - } - } - - /* - * By rights, we should be searching for a slab page that was - * PFMEMALLOC but right now, we are losing the pfmemalloc - * information when the page leaves the per-cpu allocator - */ - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) - goto deactivate_slab; - - /* must check again c->slab in case we got preempted and it changed */ - local_lock_cpu_slab(s, flags); - - if (unlikely(slab != c->slab)) { - local_unlock_cpu_slab(s, flags); - goto reread_slab; - } - freelist = c->freelist; - if (freelist) - goto load_freelist; - - freelist = get_freelist(s, slab); - - if (!freelist) { - c->slab = NULL; - c->tid = next_tid(c->tid); - local_unlock_cpu_slab(s, flags); - stat(s, DEACTIVATE_BYPASS); - goto new_slab; - } - - stat(s, ALLOC_REFILL); - -load_freelist: - - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); - - /* - * freelist is pointing to the list of objects to be used. - * slab is pointing to the slab from which the objects are obtained. - * That slab must be frozen for per cpu allocations to work. 
- */ - VM_BUG_ON(!c->slab->frozen); - c->freelist = get_freepointer(s, freelist); - c->tid = next_tid(c->tid); - local_unlock_cpu_slab(s, flags); - return freelist; - -deactivate_slab: - - local_lock_cpu_slab(s, flags); - if (slab != c->slab) { - local_unlock_cpu_slab(s, flags); - goto reread_slab; - } - freelist = c->freelist; - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - local_unlock_cpu_slab(s, flags); - deactivate_slab(s, slab, freelist); - -new_slab: - -#ifdef CONFIG_SLUB_CPU_PARTIAL - while (slub_percpu_partial(c)) { - local_lock_cpu_slab(s, flags); - if (unlikely(c->slab)) { - local_unlock_cpu_slab(s, flags); - goto reread_slab; - } - if (unlikely(!slub_percpu_partial(c))) { - local_unlock_cpu_slab(s, flags); - /* we were preempted and partial list got empty */ - goto new_objects; - } - - slab = slub_percpu_partial(c); - slub_set_percpu_partial(c, slab); - - if (likely(node_match(slab, node) && - pfmemalloc_match(slab, gfpflags)) || - !allow_spin) { - c->slab = slab; - freelist = get_freelist(s, slab); - VM_BUG_ON(!freelist); - stat(s, CPU_PARTIAL_ALLOC); - goto load_freelist; - } - - local_unlock_cpu_slab(s, flags); - - slab->next = NULL; - __put_partials(s, slab); - } -#endif - new_objects: pc.flags = gfpflags; @@ -4792,12 +4539,12 @@ new_objects: * When a preferred node is indicated but no __GFP_THISNODE * * 1) try to get a partial slab from target node only by having - * __GFP_THISNODE in pc.flags for get_partial() + * __GFP_THISNODE in pc.flags for get_from_partial() * 2) if 1) failed, try to allocate a new slab from target node with * GPF_NOWAIT | __GFP_THISNODE opportunistically * 3) if 2) failed, retry with original gfpflags which will allow - * get_partial() try partial lists of other nodes before potentially - * allocating new page from other nodes + * get_from_partial() try partial lists of other nodes before + * potentially allocating new page from other nodes */ if (unlikely(node != NUMA_NO_NODE && !(gfpflags & 
__GFP_THISNODE) && try_thisnode)) { @@ -4809,33 +4556,11 @@ new_objects: } pc.orig_size = orig_size; - slab = get_partial(s, node, &pc); - if (slab) { - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - freelist = pc.object; - /* - * For debug caches here we had to go through - * alloc_single_from_partial() so just store the - * tracking info and return the object. - * - * Due to disabled preemption we need to disallow - * blocking. The flags are further adjusted by - * gfp_nested_mask() in stack_depot itself. - */ - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr, - gfpflags & ~(__GFP_DIRECT_RECLAIM)); + object = get_from_partial(s, node, &pc); + if (object) + goto success; - return freelist; - } - - freelist = freeze_slab(s, slab); - goto retry_load_slab; - } - - slub_put_cpu_ptr(s->cpu_slab); slab = new_slab(s, pc.flags, node); - c = slub_get_cpu_ptr(s->cpu_slab); if (unlikely(!slab)) { if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) @@ -4850,70 +4575,31 @@ new_objects: stat(s, ALLOC_SLAB); if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); - if (unlikely(!freelist)) { - /* This could cause an endless loop. Fail instead. 
*/ - if (!allow_spin) - return NULL; - goto new_objects; - } + if (likely(object)) + goto success; + } else { + alloc_from_new_slab(s, slab, &object, 1, allow_spin); - if (s->flags & SLAB_STORE_USER) - set_track(s, freelist, TRACK_ALLOC, addr, - gfpflags & ~(__GFP_DIRECT_RECLAIM)); - - return freelist; + /* we don't need to check SLAB_STORE_USER here */ + if (likely(object)) + return object; } - /* - * No other reference to the slab yet so we can - * muck around with it freely without cmpxchg - */ - freelist = slab->freelist; - slab->freelist = NULL; - slab->inuse = slab->objects; - slab->frozen = 1; + if (allow_spin) + goto new_objects; - inc_slabs_node(s, slab_nid(slab), slab->objects); + /* This could cause an endless loop. Fail instead. */ + return NULL; - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { - /* - * For !pfmemalloc_match() case we don't load freelist so that - * we don't make further mismatched allocations easier. - */ - deactivate_slab(s, slab, get_freepointer(s, freelist)); - return freelist; - } +success: + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) + set_track(s, object, TRACK_ALLOC, addr, gfpflags); -retry_load_slab: - - local_lock_cpu_slab(s, flags); - if (unlikely(c->slab)) { - void *flush_freelist = c->freelist; - struct slab *flush_slab = c->slab; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - local_unlock_cpu_slab(s, flags); - - if (unlikely(!allow_spin)) { - /* Reentrant slub cannot take locks, defer */ - defer_deactivate_slab(flush_slab, flush_freelist); - } else { - deactivate_slab(s, flush_slab, flush_freelist); - } - - stat(s, CPUSLAB_FLUSH); - - goto retry_load_slab; - } - c->slab = slab; - - goto load_freelist; + return object; } + /* * We disallow kprobes in ___slab_alloc() to prevent reentrance * @@ -4928,87 +4614,11 @@ retry_load_slab: */ NOKPROBE_SYMBOL(___slab_alloc); -/* - * A wrapper for ___slab_alloc() for contexts where preemption is not yet - * disabled. 
Compensates for possible cpu changes by refetching the per cpu area - * pointer. - */ -static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) -{ - void *p; - -#ifdef CONFIG_PREEMPT_COUNT - /* - * We may have been preempted and rescheduled on a different - * cpu before disabling preemption. Need to reload cpu area - * pointer. - */ - c = slub_get_cpu_ptr(s->cpu_slab); -#endif - if (unlikely(!gfpflags_allow_spinning(gfpflags))) { - if (local_lock_is_locked(&s->cpu_slab->lock)) { - /* - * EBUSY is an internal signal to kmalloc_nolock() to - * retry a different bucket. It's not propagated - * to the caller. - */ - p = ERR_PTR(-EBUSY); - goto out; - } - } - p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); -out: -#ifdef CONFIG_PREEMPT_COUNT - slub_put_cpu_ptr(s->cpu_slab); -#endif - return p; -} - static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - struct kmem_cache_cpu *c; - struct slab *slab; - unsigned long tid; void *object; -redo: - /* - * Must read kmem_cache cpu data via this cpu ptr. Preemption is - * enabled. We may switch back and forth between cpus while - * reading from one cpu area. That does not matter as long - * as we end up on the original cpu again when doing the cmpxchg. - * - * We must guarantee that tid and kmem_cache_cpu are retrieved on the - * same cpu. We read first the kmem_cache_cpu pointer and use it to read - * the tid. If we are preempted and switched to another cpu between the - * two reads, it's OK as the two are still associated with the same cpu - * and cmpxchg later will validate the cpu. - */ - c = raw_cpu_ptr(s->cpu_slab); - tid = READ_ONCE(c->tid); - - /* - * Irqless object alloc/free algorithm used here depends on sequence - * of fetching cpu_slab's data. 
tid should be fetched before anything - * on c to guarantee that object and slab associated with previous tid - * won't be used with current tid. If we fetch tid first, object and - * slab could be one associated with next tid and our alloc/free - * request will be failed. In this case, we will retry. So, no problem. - */ - barrier(); - - /* - * The transaction ids are globally unique per cpu and per operation on - * a per cpu queue. Thus they can be guarantee that the cmpxchg_double - * occurs on the right processor and that there was no operation on the - * linked list in between. - */ - - object = c->freelist; - slab = c->slab; - #ifdef CONFIG_NUMA if (static_branch_unlikely(&strict_numa) && node == NUMA_NO_NODE) { @@ -5017,47 +4627,20 @@ redo: if (mpol) { /* - * Special BIND rule support. If existing slab + * Special BIND rule support. If the local node * is in permitted set then do not redirect * to a particular node. * Otherwise we apply the memory policy to get * the node we need to allocate on. */ - if (mpol->mode != MPOL_BIND || !slab || - !node_isset(slab_nid(slab), mpol->nodes)) - + if (mpol->mode != MPOL_BIND || + !node_isset(numa_mem_id(), mpol->nodes)) node = mempolicy_slab_node(); } } #endif - if (!USE_LOCKLESS_FAST_PATH() || - unlikely(!object || !slab || !node_match(slab, node))) { - object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); - } else { - void *next_object = get_freepointer_safe(s, object); - - /* - * The cmpxchg will only match if there was no additional - * operation and if we are on the right processor. - * - * The cmpxchg does the following atomically (without lock - * semantics!) - * 1. Relocate first pointer to the current per cpu area. - * 2. Verify that tid and freelist have not been changed - * 3. If they were not changed replace tid and freelist - * - * Since this is without lock semantics the protection is only - * against code executing on this cpu *not* from access by - * other cpus. 
- */ - if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { - note_cmpxchg_failure("slab_alloc", s, tid); - goto redo; - } - prefetch_freepointer(s, next_object); - stat(s, ALLOC_FASTPATH); - } + object = ___slab_alloc(s, gfpflags, node, addr, orig_size); return object; } @@ -7752,62 +7335,25 @@ static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { - struct kmem_cache_cpu *c; - unsigned long irqflags; int i; /* - * Drain objects in the per cpu slab, while disabling local - * IRQs, which protects against PREEMPT and interrupts - * handlers invoking normal fastpath. + * TODO: this might be more efficient (if necessary) by reusing + * __refill_objects() */ - c = slub_get_cpu_ptr(s->cpu_slab); - local_lock_irqsave(&s->cpu_slab->lock, irqflags); - for (i = 0; i < size; i++) { - void *object = c->freelist; - if (unlikely(!object)) { - /* - * We may have removed an object from c->freelist using - * the fastpath in the previous iteration; in that case, - * c->tid has not been bumped yet. - * Since ___slab_alloc() may reenable interrupts while - * allocating memory, we should bump c->tid now. 
- */ - c->tid = next_tid(c->tid); + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, + s->object_size); + if (unlikely(!p[i])) + goto error; - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); - - /* - * Invoking slow path likely have side-effect - * of re-populating per CPU c->freelist - */ - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c, s->object_size); - if (unlikely(!p[i])) - goto error; - - c = this_cpu_ptr(s->cpu_slab); - maybe_wipe_obj_freeptr(s, p[i]); - - local_lock_irqsave(&s->cpu_slab->lock, irqflags); - - continue; /* goto for-loop */ - } - c->freelist = get_freepointer(s, object); - p[i] = object; maybe_wipe_obj_freeptr(s, p[i]); - stat(s, ALLOC_FASTPATH); } - c->tid = next_tid(c->tid); - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); - slub_put_cpu_ptr(s->cpu_slab); return i; error: - slub_put_cpu_ptr(s->cpu_slab); __kmem_cache_free_bulk(s, i, p); return 0; From e323b52cf00ffc3f5ac79420af7ab340b4576a5c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:50 +0100 Subject: [PATCH 15/39] slab: remove SLUB_CPU_PARTIAL We have removed the partial slab usage from allocation paths. Now remove the whole config option and associated code. Reviewed-by: Harry Yoo Reviewed-by: Hao Li Reviewed-by: Suren Baghdasaryan Signed-off-by: Vlastimil Babka --- mm/Kconfig | 11 -- mm/slab.h | 29 ----- mm/slub.c | 319 +++-------------------------------------------------- 3 files changed, 18 insertions(+), 341 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index bd0ea5454af8..08593674cd20 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -247,17 +247,6 @@ config SLUB_STATS out which slabs are relevant to a particular load. Try running: slabinfo -DA -config SLUB_CPU_PARTIAL - default y - depends on SMP && !SLUB_TINY - bool "Enable per cpu partial caches" - help - Per cpu partial caches accelerate objects allocation and freeing - that is local to a processor at the price of more indeterminism - in the latency of the free. 
On overflow these caches will be cleared - which requires the taking of locks that may cause latency spikes. - Typically one would choose no for a realtime system. - config RANDOM_KMALLOC_CACHES default n depends on !SLUB_TINY diff --git a/mm/slab.h b/mm/slab.h index f833eb12b92a..565279a68fd0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -77,12 +77,6 @@ struct slab { struct llist_node llnode; void *flush_freelist; }; -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct { - struct slab *next; - int slabs; /* Nr of slabs left */ - }; -#endif }; /* Double-word boundary */ struct freelist_counters; @@ -188,23 +182,6 @@ static inline size_t slab_size(const struct slab *slab) return PAGE_SIZE << slab_order(slab); } -#ifdef CONFIG_SLUB_CPU_PARTIAL -#define slub_percpu_partial(c) ((c)->partial) - -#define slub_set_percpu_partial(c, p) \ -({ \ - slub_percpu_partial(c) = (p)->next; \ -}) - -#define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) -#else -#define slub_percpu_partial(c) NULL - -#define slub_set_percpu_partial(c, p) - -#define slub_percpu_partial_read_once(c) NULL -#endif // CONFIG_SLUB_CPU_PARTIAL - /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the @@ -228,12 +205,6 @@ struct kmem_cache { unsigned int object_size; /* Object size without metadata */ struct reciprocal_value reciprocal_size; unsigned int offset; /* Free pointer offset */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - /* Number of per cpu partial objects to keep around */ - unsigned int cpu_partial; - /* Number of per cpu partial slabs to keep around */ - unsigned int cpu_partial_slabs; -#endif unsigned int sheaf_capacity; struct kmem_cache_order_objects oo; diff --git a/mm/slub.c b/mm/slub.c index 75e085b5ad6f..c254b4b66d1b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -268,15 +268,6 @@ void *fixup_red_left(struct kmem_cache *s, void *p) return p; } -static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) 
-{ -#ifdef CONFIG_SLUB_CPU_PARTIAL - return !kmem_cache_debug(s); -#else - return false; -#endif -} - /* * Issues still to be resolved: * @@ -431,9 +422,6 @@ struct freelist_tid { struct kmem_cache_cpu { struct freelist_tid; struct slab *slab; /* The slab from which we are allocating */ -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct slab *partial; /* Partially allocated slabs */ -#endif local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS unsigned int stat[NR_SLUB_STAT_ITEMS]; @@ -666,29 +654,6 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) -{ - unsigned int nr_slabs; - - s->cpu_partial = nr_objects; - - /* - * We take the number of objects but actually limit the number of - * slabs on the per cpu partial list, in order to limit excessive - * growth of the list. For simplicity we assume that the slabs will - * be half-full. - */ - nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); - s->cpu_partial_slabs = nr_slabs; -} -#elif defined(SLAB_SUPPORTS_SYSFS) -static inline void -slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) -{ -} -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - /* * If network-based swap is enabled, slub must keep track of whether memory * were allocated from pfmemalloc reserves. 
@@ -3476,12 +3441,6 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, return object; } -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); -#else -static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, - int drain) { } -#endif static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); static bool get_partial_node_bulk(struct kmem_cache *s, @@ -3894,131 +3853,6 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, #define local_unlock_cpu_slab(s, flags) \ local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) -#ifdef CONFIG_SLUB_CPU_PARTIAL -static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) -{ - struct kmem_cache_node *n = NULL, *n2 = NULL; - struct slab *slab, *slab_to_discard = NULL; - unsigned long flags = 0; - - while (partial_slab) { - slab = partial_slab; - partial_slab = slab->next; - - n2 = get_node(s, slab_nid(slab)); - if (n != n2) { - if (n) - spin_unlock_irqrestore(&n->list_lock, flags); - - n = n2; - spin_lock_irqsave(&n->list_lock, flags); - } - - if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { - slab->next = slab_to_discard; - slab_to_discard = slab; - } else { - add_partial(n, slab, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } - } - - if (n) - spin_unlock_irqrestore(&n->list_lock, flags); - - while (slab_to_discard) { - slab = slab_to_discard; - slab_to_discard = slab_to_discard->next; - - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, slab); - stat(s, FREE_SLAB); - } -} - -/* - * Put all the cpu partial slabs to the node partial list. 
- */ -static void put_partials(struct kmem_cache *s) -{ - struct slab *partial_slab; - unsigned long flags; - - local_lock_irqsave(&s->cpu_slab->lock, flags); - partial_slab = this_cpu_read(s->cpu_slab->partial); - this_cpu_write(s->cpu_slab->partial, NULL); - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (partial_slab) - __put_partials(s, partial_slab); -} - -static void put_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - struct slab *partial_slab; - - partial_slab = slub_percpu_partial(c); - c->partial = NULL; - - if (partial_slab) - __put_partials(s, partial_slab); -} - -/* - * Put a slab into a partial slab slot if available. - * - * If we did not find a slot then simply move all the partials to the - * per node partial list. - */ -static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) -{ - struct slab *oldslab; - struct slab *slab_to_put = NULL; - unsigned long flags; - int slabs = 0; - - local_lock_cpu_slab(s, flags); - - oldslab = this_cpu_read(s->cpu_slab->partial); - - if (oldslab) { - if (drain && oldslab->slabs >= s->cpu_partial_slabs) { - /* - * Partial array is full. Move the existing set to the - * per node partial list. Postpone the actual unfreezing - * outside of the critical section. 
- */ - slab_to_put = oldslab; - oldslab = NULL; - } else { - slabs = oldslab->slabs; - } - } - - slabs++; - - slab->slabs = slabs; - slab->next = oldslab; - - this_cpu_write(s->cpu_slab->partial, slab); - - local_unlock_cpu_slab(s, flags); - - if (slab_to_put) { - __put_partials(s, slab_to_put); - stat(s, CPU_PARTIAL_DRAIN); - } -} - -#else /* CONFIG_SLUB_CPU_PARTIAL */ - -static inline void put_partials(struct kmem_cache *s) { } -static inline void put_partials_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) { } - -#endif /* CONFIG_SLUB_CPU_PARTIAL */ - static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { unsigned long flags; @@ -4056,8 +3890,6 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) deactivate_slab(s, slab, freelist); stat(s, CPUSLAB_FLUSH); } - - put_partials_cpu(s, c); } static inline void flush_this_cpu_slab(struct kmem_cache *s) @@ -4066,15 +3898,13 @@ static inline void flush_this_cpu_slab(struct kmem_cache *s) if (c->slab) flush_slab(s, c); - - put_partials(s); } static bool has_cpu_slab(int cpu, struct kmem_cache *s) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - return c->slab || slub_percpu_partial(c); + return c->slab; } static bool has_pcs_used(int cpu, struct kmem_cache *s) @@ -5652,13 +5482,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, return; } - /* - * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below - * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) - * is the only other reason it can be false, and it is already handled - * above. - */ - do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); @@ -5683,26 +5506,19 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * Unless it's frozen. 
*/ if ((!new.inuse || was_full) && !was_frozen) { + + n = get_node(s, slab_nid(slab)); /* - * If slab becomes non-full and we have cpu partial - * lists, we put it there unconditionally to avoid - * taking the list_lock. Otherwise we need it. + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. */ - if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { + spin_lock_irqsave(&n->list_lock, flags); - n = get_node(s, slab_nid(slab)); - /* - * Speculatively acquire the list_lock. - * If the cmpxchg does not succeed then we may - * drop the list_lock without any processing. - * - * Otherwise the list_lock will synchronize with - * other processors updating the list of slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); - - on_node_partial = slab_test_node_partial(slab); - } + on_node_partial = slab_test_node_partial(slab); } } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); @@ -5715,13 +5531,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * activity can be necessary. */ stat(s, FREE_FROZEN); - } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { - /* - * If we started with a full slab then put it onto the - * per cpu partial list. - */ - put_cpu_partial(s, slab, 1); - stat(s, CPU_PARTIAL_FREE); } /* @@ -5750,10 +5559,9 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, /* * Objects left in the slab. If it was not on the partial list before - * then add it. This can only happen when cache has no per cpu partial - * list otherwise we would have put it there. + * then add it. 
*/ - if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { + if (unlikely(was_full)) { add_partial(n, slab, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -6422,8 +6230,8 @@ redo: if (unlikely(!allow_spin)) { /* * __slab_free() can locklessly cmpxchg16 into a slab, - * but then it might need to take spin_lock or local_lock - * in put_cpu_partial() for further processing. + * but then it might need to take spin_lock + * for further processing. * Avoid the complexity and simply add to a deferred list. */ defer_free(s, head); @@ -7747,39 +7555,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) return 1; } -static void set_cpu_partial(struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_CPU_PARTIAL - unsigned int nr_objects; - - /* - * cpu_partial determined the maximum number of objects kept in the - * per cpu partial lists of a processor. - * - * Per cpu partial lists mainly contain slabs that just have one - * object freed. If they are used for allocation then they can be - * filled up again with minimal effort. The slab will never hit the - * per node partial lists and therefore no locking will be required. 
- * - * For backwards compatibility reasons, this is determined as number - * of objects, even though we now limit maximum number of pages, see - * slub_set_cpu_partial() - */ - if (!kmem_cache_has_cpu_partial(s)) - nr_objects = 0; - else if (s->size >= PAGE_SIZE) - nr_objects = 6; - else if (s->size >= 1024) - nr_objects = 24; - else if (s->size >= 256) - nr_objects = 52; - else - nr_objects = 120; - - slub_set_cpu_partial(s, nr_objects); -#endif -} - static unsigned int calculate_sheaf_capacity(struct kmem_cache *s, struct kmem_cache_args *args) @@ -8640,8 +8415,6 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); - set_cpu_partial(s); - s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); if (!s->cpu_sheaves) { err = -ENOMEM; @@ -9005,20 +8778,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[node] += x; -#ifdef CONFIG_SLUB_CPU_PARTIAL - slab = slub_percpu_partial_read_once(c); - if (slab) { - node = slab_nid(slab); - if (flags & SO_TOTAL) - WARN_ON_ONCE(1); - else if (flags & SO_OBJECTS) - WARN_ON_ONCE(1); - else - x = data_race(slab->slabs); - total += x; - nodes[node] += x; - } -#endif } } @@ -9153,12 +8912,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - unsigned int nr_partial = 0; -#ifdef CONFIG_SLUB_CPU_PARTIAL - nr_partial = s->cpu_partial; -#endif - - return sysfs_emit(buf, "%u\n", nr_partial); + return sysfs_emit(buf, "0\n"); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -9170,11 +8924,9 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, err = kstrtouint(buf, 10, &objects); if (err) return err; - if (objects && !kmem_cache_has_cpu_partial(s)) + if (objects) return -EINVAL; - slub_set_cpu_partial(s, objects); - flush_all(s); return length; } 
SLAB_ATTR(cpu_partial); @@ -9213,42 +8965,7 @@ SLAB_ATTR_RO(objects_partial); static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) { - int objects = 0; - int slabs = 0; - int cpu __maybe_unused; - int len = 0; - -#ifdef CONFIG_SLUB_CPU_PARTIAL - for_each_online_cpu(cpu) { - struct slab *slab; - - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - - if (slab) - slabs += data_race(slab->slabs); - } -#endif - - /* Approximate half-full slabs, see slub_set_cpu_partial() */ - objects = (slabs * oo_objects(s->oo)) / 2; - len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); - -#ifdef CONFIG_SLUB_CPU_PARTIAL - for_each_online_cpu(cpu) { - struct slab *slab; - - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - if (slab) { - slabs = data_race(slab->slabs); - objects = (slabs * oo_objects(s->oo)) / 2; - len += sysfs_emit_at(buf, len, " C%d=%d(%d)", - cpu, objects, slabs); - } - } -#endif - len += sysfs_emit_at(buf, len, "\n"); - - return len; + return sysfs_emit(buf, "0(0)\n"); } SLAB_ATTR_RO(slabs_cpu_partial); From bdc9282f7809678db34e3d7e094b267a6bdd9dac Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:51 +0100 Subject: [PATCH 16/39] slab: remove the do_slab_free() fastpath We have removed cpu slab usage from allocation paths. Now remove do_slab_free() which was freeing objects to the cpu slab when the object belonged to it. Instead call __slab_free() directly, which was previously the fallback. This simplifies kfree_nolock() - when freeing to percpu sheaf fails, we can call defer_free() directly. Also remove functions that became unused. 
Reviewed-by: Harry Yoo Reviewed-by: Hao Li Reviewed-by: Suren Baghdasaryan Acked-by: Alexei Starovoitov Signed-off-by: Vlastimil Babka --- mm/slub.c | 149 +++++------------------------------------------------- 1 file changed, 13 insertions(+), 136 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index c254b4b66d1b..4346ace499bb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3678,29 +3678,6 @@ static inline unsigned int init_tid(int cpu) return cpu; } -static inline void note_cmpxchg_failure(const char *n, - const struct kmem_cache *s, unsigned long tid) -{ -#ifdef SLUB_DEBUG_CMPXCHG - unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - - pr_info("%s %s: cmpxchg redo ", n, s->name); - - if (IS_ENABLED(CONFIG_PREEMPTION) && - tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { - pr_warn("due to cpu change %d -> %d\n", - tid_to_cpu(tid), tid_to_cpu(actual_tid)); - } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { - pr_warn("due to cpu running other code. Event %ld->%ld\n", - tid_to_event(tid), tid_to_event(actual_tid)); - } else { - pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", - actual_tid, tid, next_tid(tid)); - } -#endif - stat(s, CMPXCHG_DOUBLE_CPU_FAIL); -} - static void init_kmem_cache_cpus(struct kmem_cache *s) { #ifdef CONFIG_PREEMPT_RT @@ -4239,18 +4216,6 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) return true; } -static inline bool -__update_cpu_freelist_fast(struct kmem_cache *s, - void *freelist_old, void *freelist_new, - unsigned long tid) -{ - struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; - struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; - - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, - &old.freelist_tid, new.freelist_tid); -} - /* * Get the slab's freelist and do not freeze it. 
* @@ -6188,99 +6153,6 @@ void defer_free_barrier(void) irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); } -/* - * Fastpath with forced inlining to produce a kfree and kmem_cache_free that - * can perform fastpath freeing without additional function calls. - * - * The fastpath is only possible if we are freeing to the current cpu slab - * of this processor. This typically the case if we have just allocated - * the item before. - * - * If fastpath is not possible then fall back to __slab_free where we deal - * with all sorts of special processing. - * - * Bulk free of a freelist with several objects (all pointing to the - * same slab) possible by specifying head and tail ptr, plus objects - * count (cnt). Bulk free indicated by tail pointer being set. - */ -static __always_inline void do_slab_free(struct kmem_cache *s, - struct slab *slab, void *head, void *tail, - int cnt, unsigned long addr) -{ - /* cnt == 0 signals that it's called from kfree_nolock() */ - bool allow_spin = cnt; - struct kmem_cache_cpu *c; - unsigned long tid; - void **freelist; - -redo: - /* - * Determine the currently cpus per cpu slab. - * The cpu may change afterward. However that does not matter since - * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succeed. - */ - c = raw_cpu_ptr(s->cpu_slab); - tid = READ_ONCE(c->tid); - - /* Same with comment on barrier() in __slab_alloc_node() */ - barrier(); - - if (unlikely(slab != c->slab)) { - if (unlikely(!allow_spin)) { - /* - * __slab_free() can locklessly cmpxchg16 into a slab, - * but then it might need to take spin_lock - * for further processing. - * Avoid the complexity and simply add to a deferred list. 
- */ - defer_free(s, head); - } else { - __slab_free(s, slab, head, tail, cnt, addr); - } - return; - } - - if (unlikely(!allow_spin)) { - if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && - local_lock_is_locked(&s->cpu_slab->lock)) { - defer_free(s, head); - return; - } - cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ - } - - if (USE_LOCKLESS_FAST_PATH()) { - freelist = READ_ONCE(c->freelist); - - set_freepointer(s, tail, freelist); - - if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { - note_cmpxchg_failure("slab_free", s, tid); - goto redo; - } - } else { - __maybe_unused unsigned long flags = 0; - - /* Update the free list under the local lock */ - local_lock_cpu_slab(s, flags); - c = this_cpu_ptr(s->cpu_slab); - if (unlikely(slab != c->slab)) { - local_unlock_cpu_slab(s, flags); - goto redo; - } - tid = c->tid; - freelist = c->freelist; - - set_freepointer(s, tail, freelist); - c->freelist = head; - c->tid = next_tid(tid); - - local_unlock_cpu_slab(s, flags); - } - stat_add(s, FREE_FASTPATH, cnt); -} - static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *object, unsigned long addr) @@ -6297,7 +6169,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, void *object, return; } - do_slab_free(s, slab, object, object, 1, addr); + __slab_free(s, slab, object, object, 1, addr); } #ifdef CONFIG_MEMCG @@ -6306,7 +6178,7 @@ static noinline void memcg_alloc_abort_single(struct kmem_cache *s, void *object) { if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) - do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); + __slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); } #endif @@ -6321,7 +6193,7 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, * to remove objects, whose reuse must be delayed. 
*/ if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) - do_slab_free(s, slab, head, tail, cnt, addr); + __slab_free(s, slab, head, tail, cnt, addr); } #ifdef CONFIG_SLUB_RCU_DEBUG @@ -6347,14 +6219,14 @@ static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) /* resume freeing */ if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) - do_slab_free(s, slab, object, object, 1, _THIS_IP_); + __slab_free(s, slab, object, object, 1, _THIS_IP_); } #endif /* CONFIG_SLUB_RCU_DEBUG */ #ifdef CONFIG_KASAN_GENERIC void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { - do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); + __slab_free(cache, virt_to_slab(x), x, x, 1, addr); } #endif @@ -6570,7 +6442,12 @@ void kfree_nolock(const void *object) return; } - do_slab_free(s, slab, x, x, 0, _RET_IP_); + /* + * __slab_free() can locklessly cmpxchg16 into a slab, but then it might + * need to take spin_lock for further processing. + * Avoid the complexity and simply add to a deferred list. + */ + defer_free(s, x); } EXPORT_SYMBOL_GPL(kfree_nolock); @@ -6996,7 +6873,7 @@ static void __kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (kfence_free(df.freelist)) continue; - do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, + __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_); } while (likely(size)); } @@ -7082,7 +6959,7 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, cnt++; object = get_freepointer(s, object); } while (object); - do_slab_free(s, slab, head, tail, cnt, _RET_IP_); + __slab_free(s, slab, head, tail, cnt, _RET_IP_); } if (refilled >= max) From ab2f752ac31c0a9e0a38d3dec1ec5d8c5f65f4da Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:52 +0100 Subject: [PATCH 17/39] slab: remove defer_deactivate_slab() There are no more cpu slabs so we don't need their deferred deactivation. 
The function is now only used from places where we allocate a new slab but then can't spin on node list_lock to put it on the partial list. Instead of the deferred action we can free it directly via __free_slab(), we just need to tell it to use _nolock() freeing of the underlying pages and take care of the accounting. Since free_frozen_pages_nolock() variant does not yet exist for code outside of the page allocator, create it as a trivial wrapper for __free_frozen_pages(..., FPI_TRYLOCK). Reviewed-by: Harry Yoo Reviewed-by: Hao Li Reviewed-by: Suren Baghdasaryan Acked-by: Alexei Starovoitov Signed-off-by: Vlastimil Babka --- mm/internal.h | 1 + mm/page_alloc.c | 5 +++++ mm/slab.h | 8 +------ mm/slub.c | 58 ++++++++++++++++++------------------------------- 4 files changed, 28 insertions(+), 44 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index e430da900430..1f44ccb4badf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -846,6 +846,7 @@ static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int ord struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); #define alloc_frozen_pages_nolock(...) 
\ alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) +void free_frozen_pages_nolock(struct page *page, unsigned int order); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c380f063e8b7..0127e9d661ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2981,6 +2981,11 @@ void free_frozen_pages(struct page *page, unsigned int order) __free_frozen_pages(page, order, FPI_NONE); } +void free_frozen_pages_nolock(struct page *page, unsigned int order) +{ + __free_frozen_pages(page, order, FPI_TRYLOCK); +} + /* * Free a batch of folios */ diff --git a/mm/slab.h b/mm/slab.h index 565279a68fd0..f88568e31268 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -71,13 +71,7 @@ struct slab { struct kmem_cache *slab_cache; union { struct { - union { - struct list_head slab_list; - struct { /* For deferred deactivate_slab() */ - struct llist_node llnode; - void *flush_freelist; - }; - }; + struct list_head slab_list; /* Double-word boundary */ struct freelist_counters; }; diff --git a/mm/slub.c b/mm/slub.c index 4346ace499bb..883effab47fd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3262,7 +3262,7 @@ static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node) flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); } -static void __free_slab(struct kmem_cache *s, struct slab *slab) +static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin) { struct page *page = slab_page(slab); int order = compound_order(page); @@ -3273,14 +3273,26 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) __ClearPageSlab(page); mm_account_reclaimed_pages(pages); unaccount_slab(slab, order, s); - free_frozen_pages(page, order); + if (allow_spin) + free_frozen_pages(page, order); + else + free_frozen_pages_nolock(page, order); +} + +static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab) +{ + /* + * Since it was just allocated, we can 
skip the actions in + * discard_slab() and free_slab(). + */ + __free_slab(s, slab, false); } static void rcu_free_slab(struct rcu_head *h) { struct slab *slab = container_of(h, struct slab, rcu_head); - __free_slab(slab->slab_cache, slab); + __free_slab(slab->slab_cache, slab, true); } static void free_slab(struct kmem_cache *s, struct slab *slab) @@ -3296,7 +3308,7 @@ static void free_slab(struct kmem_cache *s, struct slab *slab) if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&slab->rcu_head, rcu_free_slab); else - __free_slab(s, slab); + __free_slab(s, slab, true); } static void discard_slab(struct kmem_cache *s, struct slab *slab) @@ -3389,8 +3401,6 @@ static void *alloc_single_from_partial(struct kmem_cache *s, return object; } -static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); - /* * Called only for kmem_cache_debug() caches to allocate from a freshly * allocated slab. Allocate a single object instead of whole freelist @@ -3406,8 +3416,8 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, void *object; if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { - /* Unlucky, discard newly allocated slab */ - defer_deactivate_slab(slab, NULL); + /* Unlucky, discard newly allocated slab. 
*/ + free_new_slab_nolock(s, slab); return NULL; } @@ -4279,7 +4289,7 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, if (!spin_trylock_irqsave(&n->list_lock, flags)) { /* Unlucky, discard newly allocated slab */ - defer_deactivate_slab(slab, NULL); + free_new_slab_nolock(s, slab); return 0; } } @@ -6059,7 +6069,6 @@ flush_remote: struct defer_free { struct llist_head objects; - struct llist_head slabs; struct irq_work work; }; @@ -6067,23 +6076,21 @@ static void free_deferred_objects(struct irq_work *work); static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { .objects = LLIST_HEAD_INIT(objects), - .slabs = LLIST_HEAD_INIT(slabs), .work = IRQ_WORK_INIT(free_deferred_objects), }; /* * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe - * to take sleeping spin_locks from __slab_free() and deactivate_slab(). + * to take sleeping spin_locks from __slab_free(). * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). */ static void free_deferred_objects(struct irq_work *work) { struct defer_free *df = container_of(work, struct defer_free, work); struct llist_head *objs = &df->objects; - struct llist_head *slabs = &df->slabs; struct llist_node *llnode, *pos, *t; - if (llist_empty(objs) && llist_empty(slabs)) + if (llist_empty(objs)) return; llnode = llist_del_all(objs); @@ -6107,16 +6114,6 @@ static void free_deferred_objects(struct irq_work *work) __slab_free(s, slab, x, x, 1, _THIS_IP_); } - - llnode = llist_del_all(slabs); - llist_for_each_safe(pos, t, llnode) { - struct slab *slab = container_of(pos, struct slab, llnode); - - if (slab->frozen) - deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); - else - free_slab(slab->slab_cache, slab); - } } static void defer_free(struct kmem_cache *s, void *head) @@ -6132,19 +6129,6 @@ static void defer_free(struct kmem_cache *s, void *head) irq_work_queue(&df->work); } -static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) -{ - 
struct defer_free *df; - - slab->flush_freelist = flush_freelist; - - guard(preempt)(); - - df = this_cpu_ptr(&defer_free_objects); - if (llist_add(&slab->llnode, &df->slabs)) - irq_work_queue(&df->work); -} - void defer_free_barrier(void) { int cpu; From 073d5f156292201f1e49263a62dfa182eeee273f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:53 +0100 Subject: [PATCH 18/39] slab: simplify kmalloc_nolock() The kmalloc_nolock() implementation has several complications and restrictions due to SLUB's cpu slab locking, lockless fastpath and PREEMPT_RT differences. With cpu slab usage removed, we can simplify things: - relax the PREEMPT_RT context checks as they were before commit 99a3e3a1cfc9 ("slab: fix kmalloc_nolock() context check for PREEMPT_RT") and also reference the explanation comment in the page allocator - the local_lock_cpu_slab() macros became unused, remove them - we no longer need to set up lockdep classes on PREEMPT_RT - we no longer need to annotate ___slab_alloc as NOKPROBE_SYMBOL since there's no lockless cpu freelist manipulation anymore - __slab_alloc_node() can be called from kmalloc_nolock_noprof() unconditionally. It can also no longer return EBUSY. But trylock failures can still happen so retry with the larger bucket if the allocation fails for any reason. Note that we still need __CMPXCHG_DOUBLE, because while we don't use cmpxchg16b on cpu freelist anymore, we still use it on slab freelist, and the alternative is slab_lock() which can be interrupted by an NMI. Clarify the comment to mention it specifically. 
Acked-by: Alexei Starovoitov Reviewed-by: Hao Li Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slab.h | 1 - mm/slub.c | 144 +++++++++++------------------------------------------- 2 files changed, 29 insertions(+), 116 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index f88568e31268..31375198e19c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -190,7 +190,6 @@ struct kmem_cache_order_objects { */ struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; - struct lock_class_key lock_key; struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; diff --git a/mm/slub.c b/mm/slub.c index 883effab47fd..0ab1423ee0fa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3690,29 +3690,12 @@ static inline unsigned int init_tid(int cpu) static void init_kmem_cache_cpus(struct kmem_cache *s) { -#ifdef CONFIG_PREEMPT_RT - /* - * Register lockdep key for non-boot kmem caches to avoid - * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() - */ - bool finegrain_lockdep = !init_section_contains(s, 1); -#else - /* - * Don't bother with different lockdep classes for each - * kmem_cache, since we only use local_trylock_irqsave(). - */ - bool finegrain_lockdep = false; -#endif int cpu; struct kmem_cache_cpu *c; - if (finegrain_lockdep) - lockdep_register_key(&s->lock_key); for_each_possible_cpu(cpu) { c = per_cpu_ptr(s->cpu_slab, cpu); local_trylock_init(&c->lock); - if (finegrain_lockdep) - lockdep_set_class(&c->lock, &s->lock_key); c->tid = init_tid(cpu); } } @@ -3799,47 +3782,6 @@ static void deactivate_slab(struct kmem_cache *s, struct slab *slab, } } -/* - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock - * can be acquired without a deadlock before invoking the function. - * - * Without LOCKDEP we trust the code to be correct. 
kmalloc_nolock() is - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), - * and kmalloc() is not used in an unsupported context. - * - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but - * lockdep_assert() will catch a bug in case: - * #1 - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() - * or - * #2 - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() - * - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt - * disabled context. The lock will always be acquired and if needed it - * block and sleep until the lock is available. - * #1 is possible in !PREEMPT_RT only. - * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> - * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) - * - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B - */ -#if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) -#define local_lock_cpu_slab(s, flags) \ - local_lock_irqsave(&(s)->cpu_slab->lock, flags) -#else -#define local_lock_cpu_slab(s, flags) \ - do { \ - bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ - lockdep_assert(__l); \ - } while (0) -#endif - -#define local_unlock_cpu_slab(s, flags) \ - local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) - static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { unsigned long flags; @@ -4405,20 +4347,6 @@ success: return object; } -/* - * We disallow kprobes in ___slab_alloc() to prevent reentrance - * - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() - * manipulating c->freelist without 
lock. - * - * This does not prevent kprobe in functions called from ___slab_alloc() such as - * local_lock_irqsave() itself, and that is fine, we only need to protect the - * c->freelist manipulation in ___slab_alloc() itself. - */ -NOKPROBE_SYMBOL(___slab_alloc); - static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { @@ -5259,13 +5187,13 @@ void *kmalloc_nolock_noprof(size_t size, gfp_t gfp_flags, int node) if (unlikely(!size)) return ZERO_SIZE_PTR; - if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) - /* - * kmalloc_nolock() in PREEMPT_RT is not supported from - * non-preemptible context because local_lock becomes a - * sleeping lock on RT. - */ + /* + * See the comment for the same check in + * alloc_frozen_pages_nolock_noprof() + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) return NULL; + retry: if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return NULL; @@ -5274,10 +5202,11 @@ retry: if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) /* * kmalloc_nolock() is not supported on architectures that - * don't implement cmpxchg16b, but debug caches don't use - * per-cpu slab and per-cpu partial slabs. They rely on - * kmem_cache_node->list_lock, so kmalloc_nolock() can - * attempt to allocate from debug caches by + * don't implement cmpxchg16b and thus need slab_lock() + * which could be preempted by a nmi. + * But debug caches don't use that and only rely on + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt + * to allocate from debug caches by * spin_trylock_irqsave(&n->list_lock, ...) */ return NULL; @@ -5286,42 +5215,31 @@ retry: if (ret) goto success; - ret = ERR_PTR(-EBUSY); - /* * Do not call slab_alloc_node(), since trylock mode isn't * compatible with slab_pre_alloc_hook/should_failslab and * kfence_alloc. Hence call __slab_alloc_node() (at most twice) * and slab_post_alloc_hook() directly. 
- * - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair - * in irq saved region. It assumes that the same cpu will not - * __update_cpu_freelist_fast() into the same (freelist,tid) pair. - * Therefore use in_nmi() to check whether particular bucket is in - * irq protected section. - * - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that - * this cpu was interrupted somewhere inside ___slab_alloc() after - * it did local_lock_irqsave(&s->cpu_slab->lock, flags). - * In this case fast path with __update_cpu_freelist_fast() is not safe. */ - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); - if (PTR_ERR(ret) == -EBUSY) { - if (can_retry) { - /* pick the next kmalloc bucket */ - size = s->object_size + 1; - /* - * Another alternative is to - * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; - * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; - * to retry from bucket of the same size. - */ - can_retry = false; - goto retry; - } - ret = NULL; + /* + * It's possible we failed due to trylock as we preempted someone with + * the sheaves locked, and the list_lock is also held by another cpu. + * But it should be rare that multiple kmalloc buckets would have + * sheaves locked, so try a larger one. + */ + if (!ret && can_retry) { + /* pick the next kmalloc bucket */ + size = s->object_size + 1; + /* + * Another alternative is to + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; + * to retry from bucket of the same size. 
+ */ + can_retry = false; + goto retry; } success: @@ -7374,10 +7292,6 @@ void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); pcs_destroy(s); -#ifdef CONFIG_PREEMPT_RT - if (s->cpu_slab) - lockdep_unregister_key(&s->lock_key); -#endif free_percpu(s->cpu_slab); free_kmem_cache_nodes(s); } From 32c894c7274b7ce901041ce6dceeca3ec1152205 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:54 +0100 Subject: [PATCH 19/39] slab: remove struct kmem_cache_cpu The cpu slab is not used anymore for allocation or freeing, the remaining code is for flushing, but it's effectively dead. Remove the whole struct kmem_cache_cpu, the flushing code and other orphaned functions. The remaining used field of kmem_cache_cpu is the stat array with CONFIG_SLUB_STATS. Put it instead in a new struct kmem_cache_stats. In struct kmem_cache, the field is cpu_stats and placed near the end of the struct. Reviewed-by: Hao Li Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slab.h | 7 +- mm/slub.c | 304 +++++------------------------------------------------- 2 files changed, 27 insertions(+), 284 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 31375198e19c..e251a2714b0c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -21,14 +21,12 @@ # define system_has_freelist_aba() system_has_cmpxchg128() # define try_cmpxchg_freelist try_cmpxchg128 # endif -#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 typedef u128 freelist_full_t; #else /* CONFIG_64BIT */ # ifdef system_has_cmpxchg64 # define system_has_freelist_aba() system_has_cmpxchg64() # define try_cmpxchg_freelist try_cmpxchg64 # endif -#define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 typedef u64 freelist_full_t; #endif /* CONFIG_64BIT */ @@ -189,7 +187,6 @@ struct kmem_cache_order_objects { * Slab cache management. 
*/ struct kmem_cache { - struct kmem_cache_cpu __percpu *cpu_slab; struct slub_percpu_sheaves __percpu *cpu_sheaves; /* Used for retrieving partial slabs, etc. */ slab_flags_t flags; @@ -238,6 +235,10 @@ struct kmem_cache { unsigned int usersize; /* Usercopy region size */ #endif +#ifdef CONFIG_SLUB_STATS + struct kmem_cache_stats __percpu *cpu_stats; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/mm/slub.c b/mm/slub.c index 0ab1423ee0fa..8e4482e75a5c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -405,28 +405,11 @@ enum stat_item { NR_SLUB_STAT_ITEMS }; -struct freelist_tid { - union { - struct { - void *freelist; /* Pointer to next available object */ - unsigned long tid; /* Globally unique transaction id */ - }; - freelist_full_t freelist_tid; - }; -}; - -/* - * When changing the layout, make sure freelist and tid are still compatible - * with this_cpu_cmpxchg_double() alignment requirements. - */ -struct kmem_cache_cpu { - struct freelist_tid; - struct slab *slab; /* The slab from which we are allocating */ - local_trylock_t lock; /* Protects the fields above */ #ifdef CONFIG_SLUB_STATS +struct kmem_cache_stats { unsigned int stat[NR_SLUB_STAT_ITEMS]; -#endif }; +#endif static inline void stat(const struct kmem_cache *s, enum stat_item si) { @@ -435,7 +418,7 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * The rmw is racy on a preemptible kernel but this is acceptable, so * avoid this_cpu_add()'s irq-disable overhead. */ - raw_cpu_inc(s->cpu_slab->stat[si]); + raw_cpu_inc(s->cpu_stats->stat[si]); #endif } @@ -443,7 +426,7 @@ static inline void stat_add(const struct kmem_cache *s, enum stat_item si, int v) { #ifdef CONFIG_SLUB_STATS - raw_cpu_add(s->cpu_slab->stat[si], v); + raw_cpu_add(s->cpu_stats->stat[si], v); #endif } @@ -532,7 +515,7 @@ static inline struct node_barn *get_barn(struct kmem_cache *s) static nodemask_t slab_nodes; /* - * Workqueue used for flush_cpu_slab(). 
+ * Workqueue used for flushing cpu and kfree_rcu sheaves. */ static struct workqueue_struct *flushwq; @@ -1154,20 +1137,6 @@ static void object_err(struct kmem_cache *s, struct slab *slab, WARN_ON(1); } -static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, - void **freelist, void *nextfree) -{ - if ((s->flags & SLAB_CONSISTENCY_CHECKS) && - !check_valid_pointer(s, slab, nextfree) && freelist) { - object_err(s, slab, *freelist, "Freechain corrupt"); - *freelist = NULL; - slab_fix(s, "Isolate corrupted freechain"); - return true; - } - - return false; -} - static void __slab_err(struct slab *slab) { if (slab_in_kunit_test()) @@ -1949,11 +1918,6 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, - void **freelist, void *nextfree) -{ - return false; -} #endif /* CONFIG_SLUB_DEBUG */ /* @@ -3651,191 +3615,6 @@ static void *get_from_partial(struct kmem_cache *s, int node, return get_from_any_partial(s, pc); } -#ifdef CONFIG_PREEMPTION -/* - * Calculate the next globally unique transaction for disambiguation - * during cmpxchg. The transactions start with the cpu number and are then - * incremented by CONFIG_NR_CPUS. - */ -#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) -#else -/* - * No preemption supported therefore also no need to check for - * different cpus. 
- */ -#define TID_STEP 1 -#endif /* CONFIG_PREEMPTION */ - -static inline unsigned long next_tid(unsigned long tid) -{ - return tid + TID_STEP; -} - -#ifdef SLUB_DEBUG_CMPXCHG -static inline unsigned int tid_to_cpu(unsigned long tid) -{ - return tid % TID_STEP; -} - -static inline unsigned long tid_to_event(unsigned long tid) -{ - return tid / TID_STEP; -} -#endif - -static inline unsigned int init_tid(int cpu) -{ - return cpu; -} - -static void init_kmem_cache_cpus(struct kmem_cache *s) -{ - int cpu; - struct kmem_cache_cpu *c; - - for_each_possible_cpu(cpu) { - c = per_cpu_ptr(s->cpu_slab, cpu); - local_trylock_init(&c->lock); - c->tid = init_tid(cpu); - } -} - -/* - * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, - * unfreezes the slabs and puts it on the proper list. - * Assumes the slab has been already safely taken away from kmem_cache_cpu - * by the caller. - */ -static void deactivate_slab(struct kmem_cache *s, struct slab *slab, - void *freelist) -{ - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - int free_delta = 0; - void *nextfree, *freelist_iter, *freelist_tail; - int tail = DEACTIVATE_TO_HEAD; - unsigned long flags = 0; - struct freelist_counters old, new; - - if (READ_ONCE(slab->freelist)) { - stat(s, DEACTIVATE_REMOTE_FREES); - tail = DEACTIVATE_TO_TAIL; - } - - /* - * Stage one: Count the objects on cpu's freelist as free_delta and - * remember the last object in freelist_tail for later splicing. - */ - freelist_tail = NULL; - freelist_iter = freelist; - while (freelist_iter) { - nextfree = get_freepointer(s, freelist_iter); - - /* - * If 'nextfree' is invalid, it is possible that the object at - * 'freelist_iter' is already corrupted. So isolate all objects - * starting at 'freelist_iter' by skipping them. 
- */ - if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) - break; - - freelist_tail = freelist_iter; - free_delta++; - - freelist_iter = nextfree; - } - - /* - * Stage two: Unfreeze the slab while splicing the per-cpu - * freelist to the head of slab's freelist. - */ - do { - old.freelist = READ_ONCE(slab->freelist); - old.counters = READ_ONCE(slab->counters); - VM_BUG_ON(!old.frozen); - - /* Determine target state of the slab */ - new.counters = old.counters; - new.frozen = 0; - if (freelist_tail) { - new.inuse -= free_delta; - set_freepointer(s, freelist_tail, old.freelist); - new.freelist = freelist; - } else { - new.freelist = old.freelist; - } - } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); - - /* - * Stage three: Manipulate the slab list based on the updated state. - */ - if (!new.inuse && n->nr_partial >= s->min_partial) { - stat(s, DEACTIVATE_EMPTY); - discard_slab(s, slab); - stat(s, FREE_SLAB); - } else if (new.freelist) { - spin_lock_irqsave(&n->list_lock, flags); - add_partial(n, slab, tail); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, tail); - } else { - stat(s, DEACTIVATE_FULL); - } -} - -static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) -{ - unsigned long flags; - struct slab *slab; - void *freelist; - - local_lock_irqsave(&s->cpu_slab->lock, flags); - - slab = c->slab; - freelist = c->freelist; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - local_unlock_irqrestore(&s->cpu_slab->lock, flags); - - if (slab) { - deactivate_slab(s, slab, freelist); - stat(s, CPUSLAB_FLUSH); - } -} - -static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - void *freelist = c->freelist; - struct slab *slab = c->slab; - - c->slab = NULL; - c->freelist = NULL; - c->tid = next_tid(c->tid); - - if (slab) { - deactivate_slab(s, slab, freelist); - stat(s, CPUSLAB_FLUSH); - } -} - -static inline void 
flush_this_cpu_slab(struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); - - if (c->slab) - flush_slab(s, c); -} - -static bool has_cpu_slab(int cpu, struct kmem_cache *s) -{ - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - - return c->slab; -} - static bool has_pcs_used(int cpu, struct kmem_cache *s) { struct slub_percpu_sheaves *pcs; @@ -3849,11 +3628,11 @@ static bool has_pcs_used(int cpu, struct kmem_cache *s) } /* - * Flush cpu slab. + * Flush percpu sheaves * * Called from CPU work handler with migration disabled. */ -static void flush_cpu_slab(struct work_struct *w) +static void flush_cpu_sheaves(struct work_struct *w) { struct kmem_cache *s; struct slub_flush_work *sfw; @@ -3864,8 +3643,6 @@ static void flush_cpu_slab(struct work_struct *w) if (cache_has_sheaves(s)) pcs_flush_all(s); - - flush_this_cpu_slab(s); } static void flush_all_cpus_locked(struct kmem_cache *s) @@ -3878,11 +3655,11 @@ static void flush_all_cpus_locked(struct kmem_cache *s) for_each_online_cpu(cpu) { sfw = &per_cpu(slub_flush, cpu); - if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { + if (!has_pcs_used(cpu, s)) { sfw->skip = true; continue; } - INIT_WORK(&sfw->work, flush_cpu_slab); + INIT_WORK(&sfw->work, flush_cpu_sheaves); sfw->skip = false; sfw->s = s; queue_work_on(cpu, flushwq, &sfw->work); @@ -3988,7 +3765,6 @@ static int slub_cpu_dead(unsigned int cpu) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { - __flush_cpu_slab(s, cpu); if (cache_has_sheaves(s)) __pcs_flush_all_cpu(s, cpu); } @@ -7162,26 +6938,21 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct node_barn *barn) barn_init(barn); } -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +#ifdef CONFIG_SLUB_STATS +static inline int alloc_kmem_cache_stats(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * - sizeof(struct kmem_cache_cpu)); + sizeof(struct kmem_cache_stats)); - /* - * Must 
align to double word boundary for the double cmpxchg - * instructions to work; see __pcpu_double_call_return_bool(). - */ - s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), - 2 * sizeof(void *)); + s->cpu_stats = alloc_percpu(struct kmem_cache_stats); - if (!s->cpu_slab) + if (!s->cpu_stats) return 0; - init_kmem_cache_cpus(s); - return 1; } +#endif static int init_percpu_sheaves(struct kmem_cache *s) { @@ -7292,7 +7063,9 @@ void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); pcs_destroy(s); - free_percpu(s->cpu_slab); +#ifdef CONFIG_SLUB_STATS + free_percpu(s->cpu_stats); +#endif free_kmem_cache_nodes(s); } @@ -7989,12 +7762,6 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) memcpy(s, static_cache, kmem_cache->object_size); - /* - * This runs very early, and only the boot processor is supposed to be - * up. Even if it weren't true, IRQs are not up so we couldn't fire - * IPIs around. - */ - __flush_cpu_slab(s, smp_processor_id()); for_each_kmem_cache_node(s, node, n) { struct slab *p; @@ -8209,8 +7976,10 @@ int do_kmem_cache_create(struct kmem_cache *s, const char *name, if (!init_kmem_cache_nodes(s)) goto out; - if (!alloc_kmem_cache_cpus(s)) +#ifdef CONFIG_SLUB_STATS + if (!alloc_kmem_cache_stats(s)) goto out; +#endif err = init_percpu_sheaves(s); if (err) @@ -8529,33 +8298,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s, if (!nodes) return -ENOMEM; - if (flags & SO_CPU) { - int cpu; - - for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, - cpu); - int node; - struct slab *slab; - - slab = READ_ONCE(c->slab); - if (!slab) - continue; - - node = slab_nid(slab); - if (flags & SO_TOTAL) - x = slab->objects; - else if (flags & SO_OBJECTS) - x = slab->inuse; - else - x = 1; - - total += x; - nodes[node] += x; - - } - } - /* * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" * already held which will conflict with an existing lock order: 
@@ -8926,7 +8668,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; + unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -8952,7 +8694,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si) int cpu; for_each_online_cpu(cpu) - per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; + per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0; } #define STAT_ATTR(si, text) \ From 6c2f307f30edbe2b18a35584f01854a27e375927 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:55 +0100 Subject: [PATCH 20/39] slab: remove unused PREEMPT_RT specific macros The macros slub_get_cpu_ptr()/slub_put_cpu_ptr() are now unused, remove them. USE_LOCKLESS_FAST_PATH() has lost its true meaning with the code being removed. The only remaining usage is in fact testing whether we can assert irqs disabled, because spin_lock_irqsave() only does that on !RT. Test for CONFIG_PREEMPT_RT instead. Reviewed-by: Hao Li Reviewed-by: Suren Baghdasaryan Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 24 +----------------------- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 8e4482e75a5c..697d3d6cebd3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -201,28 +201,6 @@ enum slab_flags { SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ }; -/* - * We could simply use migrate_disable()/enable() but as long as it's a - * function call even on !PREEMPT_RT, use inline preempt_disable() there. 
- */ -#ifndef CONFIG_PREEMPT_RT -#define slub_get_cpu_ptr(var) get_cpu_ptr(var) -#define slub_put_cpu_ptr(var) put_cpu_ptr(var) -#define USE_LOCKLESS_FAST_PATH() (true) -#else -#define slub_get_cpu_ptr(var) \ -({ \ - migrate_disable(); \ - this_cpu_ptr(var); \ -}) -#define slub_put_cpu_ptr(var) \ -do { \ - (void)(var); \ - migrate_enable(); \ -} while (0) -#define USE_LOCKLESS_FAST_PATH() (false) -#endif - #ifndef CONFIG_SLUB_TINY #define __fastpath_inline __always_inline #else @@ -713,7 +691,7 @@ static inline bool __slab_update_freelist(struct kmem_cache *s, struct slab *sla { bool ret; - if (USE_LOCKLESS_FAST_PATH()) + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) lockdep_assert_irqs_disabled(); if (s->flags & __CMPXCHG_DOUBLE) From 46dea1744498a5b8864e21d0b769fd072a4e64bc Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:56 +0100 Subject: [PATCH 21/39] slab: refill sheaves from all nodes __refill_objects() currently only attempts to get partial slabs from the local node and then allocates new slab(s). Expand it to also try other nodes while observing the remote node defrag ratio, similarly to get_any_partial(). This will prevent allocating new slabs on a node while other nodes have many free slabs. It does mean sheaves will contain non-local objects in that case. Allocations that care about a specific node will still be served appropriately, but might get a slowpath allocation. Like get_any_partial() we do observe cpuset_zone_allowed(), although we might be refilling a sheaf that will then be used from a different allocation context. We can also use the resulting refill_objects() in __kmem_cache_alloc_bulk() for non-debug caches. This means kmem_cache_alloc_bulk() will get better performance when sheaves are exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node so it's compatible with sheaves refill in preferring the local node. Its users also have gfp flags that allow spinning, so document that as a requirement.
Reviewed-by: Suren Baghdasaryan Reviewed-by: Hao Li Signed-off-by: Vlastimil Babka --- mm/slub.c | 137 ++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 31 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 697d3d6cebd3..202d103093d8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2512,8 +2512,8 @@ static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) } static unsigned int -__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, - unsigned int max); +refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max); static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, gfp_t gfp) @@ -2524,8 +2524,8 @@ static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, if (!to_fill) return 0; - filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp, - to_fill, to_fill); + filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill, + to_fill); sheaf->size += filled; @@ -6563,29 +6563,22 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) EXPORT_SYMBOL(kmem_cache_free_bulk); static unsigned int -__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, - unsigned int max) +__refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max, struct kmem_cache_node *n) { struct partial_bulk_context pc; struct slab *slab, *slab2; unsigned int refilled = 0; unsigned long flags; void *object; - int node; pc.flags = gfp; pc.min_objects = min; pc.max_objects = max; - node = numa_mem_id(); - - if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) + if (!get_partial_node_bulk(s, n, &pc)) return 0; - /* TODO: consider also other nodes? 
*/ - if (!get_partial_node_bulk(s, get_node(s, node), &pc)) - goto new_slab; - list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { list_del(&slab->slab_list); @@ -6623,8 +6616,6 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, } if (unlikely(!list_empty(&pc.slabs))) { - struct kmem_cache_node *n = get_node(s, node); - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { @@ -6646,13 +6637,92 @@ __refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, } } + return refilled; +} - if (likely(refilled >= min)) - goto out; +#ifdef CONFIG_NUMA +static unsigned int +__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + struct zonelist *zonelist; + struct zoneref *z; + struct zone *zone; + enum zone_type highest_zoneidx = gfp_zone(gfp); + unsigned int cpuset_mems_cookie; + unsigned int refilled = 0; + + /* see get_from_any_partial() for the defrag ratio description */ + if (!s->remote_node_defrag_ratio || + get_cycles() % 1024 > s->remote_node_defrag_ratio) + return 0; + + do { + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), gfp); + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { + struct kmem_cache_node *n; + unsigned int r; + + n = get_node(s, zone_to_nid(zone)); + + if (!n || !cpuset_zone_allowed(zone, gfp) || + n->nr_partial <= s->min_partial) + continue; + + r = __refill_objects_node(s, p, gfp, min, max, n); + refilled += r; + + if (r >= min) { + /* + * Don't check read_mems_allowed_retry() here - + * if mems_allowed was updated in parallel, that + * was a harmless race between allocation and + * the cpuset update + */ + return refilled; + } + p += r; + min -= r; + max -= r; + } + } while (read_mems_allowed_retry(cpuset_mems_cookie)); + + return refilled; +} +#else +static inline unsigned int +__refill_objects_any(struct kmem_cache *s, void 
**p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + return 0; +} +#endif + +static unsigned int +refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, + unsigned int max) +{ + int local_node = numa_mem_id(); + unsigned int refilled; + struct slab *slab; + + if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) + return 0; + + refilled = __refill_objects_node(s, p, gfp, min, max, + get_node(s, local_node)); + if (refilled >= min) + return refilled; + + refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled, + max - refilled); + if (refilled >= min) + return refilled; new_slab: - slab = new_slab(s, pc.flags, node); + slab = new_slab(s, gfp, local_node); if (!slab) goto out; @@ -6667,8 +6737,8 @@ new_slab: if (refilled < min) goto new_slab; -out: +out: return refilled; } @@ -6678,18 +6748,20 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, { int i; - /* - * TODO: this might be more efficient (if necessary) by reusing - * __refill_objects() - */ - for (i = 0; i < size; i++) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + for (i = 0; i < size; i++) { - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, - s->object_size); - if (unlikely(!p[i])) + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, + s->object_size); + if (unlikely(!p[i])) + goto error; + + maybe_wipe_obj_freeptr(s, p[i]); + } + } else { + i = refill_objects(s, p, flags, size, size); + if (i < size) goto error; - - maybe_wipe_obj_freeptr(s, p[i]); } return i; @@ -6700,7 +6772,10 @@ error: } -/* Note that interrupts must be enabled when calling this function. */ +/* + * Note that interrupts must be enabled when calling this function and gfp + * flags must allow spinning. 
+ */ int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { From 0f7075bea8da3b01898712cfec49dd9a2f09be2f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:57 +0100 Subject: [PATCH 22/39] slab: update overview comments The changes related to sheaves made the description of locking and other details outdated. Update it to reflect current state. Also add a new copyright line due to major changes. Reviewed-by: Suren Baghdasaryan Reviewed-by: Hao Li Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 143 ++++++++++++++++++++++++++---------------------------- 1 file changed, 68 insertions(+), 75 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 202d103093d8..d782ceb8a2ba 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1,13 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* - * SLUB: A slab allocator that limits cache line use instead of queuing - * objects in per cpu and per node lists. + * SLUB: A slab allocator with low overhead percpu array caches and mostly + * lockless freeing of objects to slabs in the slowpath. * - * The allocator synchronizes using per slab locks or atomic operations - * and only uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using spin_trylock for percpu arrays in the + * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing. + * Uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter * (C) 2011 Linux Foundation, Christoph Lameter + * (C) 2025 SUSE, Vlastimil Babka */ #include @@ -53,11 +55,13 @@ /* * Lock order: - * 1. slab_mutex (Global Mutex) - * 2. node->list_lock (Spinlock) - * 3. kmem_cache->cpu_slab->lock (Local lock) - * 4. slab_lock(slab) (Only on some arches) - * 5. object_map_lock (Only for debugging) + * 0. cpu_hotplug_lock + * 1. slab_mutex (Global Mutex) + * 2a. kmem_cache->cpu_sheaves->lock (Local trylock) + * 2b. node->barn->lock (Spinlock) + * 2c. 
node->list_lock (Spinlock) + * 3. slab_lock(slab) (Only on some arches) + * 4. object_map_lock (Only for debugging) * * slab_mutex * @@ -78,31 +82,38 @@ * C. slab->objects -> Number of objects in slab * D. slab->frozen -> frozen state * + * SL_partial slabs + * + * Slabs on node partial list have at least one free object. A limited number + * of slabs on the list can be fully free (slab->inuse == 0), until we start + * discarding them. These slabs are marked with SL_partial, and the flag is + * cleared while removing them, usually to grab their freelist afterwards. + * This clearing also exempts them from list management. Please see + * __slab_free() for more details. + * + * Full slabs + * + * For caches without debugging enabled, full slabs (slab->inuse == + * slab->objects and slab->freelist == NULL) are not placed on any list. + * The __slab_free() freeing the first object from such a slab will place + * it on the partial list. Caches with debugging enabled place such slab + * on the full list and use different allocation and freeing paths. + * * Frozen slabs * - * If a slab is frozen then it is exempt from list management. It is - * the cpu slab which is actively allocated from by the processor that - * froze it and it is not on any list. The processor that froze the - * slab is the one who can perform list operations on the slab. Other - * processors may put objects onto the freelist but the processor that - * froze the slab is the only one that can retrieve the objects from the - * slab's freelist. - * - * CPU partial slabs - * - * The partially empty slabs cached on the CPU partial list are used - * for performance reasons, which speeds up the allocation process. - * These slabs are not frozen, but are also exempt from list management, - * by clearing the SL_partial flag when moving out of the node - * partial list. Please see __slab_free() for more details. + * If a slab is frozen then it is exempt from list management. 
It is used to + * indicate a slab that has failed consistency checks and thus cannot be + * allocated from anymore - it is also marked as full. Any previously + * allocated objects will be simply leaked upon freeing instead of attempting + * to modify the potentially corrupted freelist and metadata. * * To sum up, the current scheme is: - * - node partial slab: SL_partial && !frozen - * - cpu partial slab: !SL_partial && !frozen - * - cpu slab: !SL_partial && frozen - * - full slab: !SL_partial && !frozen + * - node partial slab: SL_partial && !full && !frozen + * - taken off partial list: !SL_partial && !full && !frozen + * - full slab, not on any list: !SL_partial && full && !frozen + * - frozen due to inconsistency: !SL_partial && full && frozen * - * list_lock + * node->list_lock (spinlock) * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -112,47 +123,46 @@ * * The list_lock is a centralized lock and thus we avoid taking it as * much as possible. As long as SLUB does not have to handle partial - * slabs, operations can continue without any centralized lock. F.e. - * allocating a long series of objects that fill up slabs does not require - * the list lock. + * slabs, operations can continue without any centralized lock. * * For debug caches, all allocations are forced to go through a list_lock * protected region to serialize against concurrent validation. * - * cpu_slab->lock local lock + * cpu_sheaves->lock (local_trylock) * - * This locks protect slowpath manipulation of all kmem_cache_cpu fields - * except the stat counters. This is a percpu structure manipulated only by - * the local cpu, so the lock protects against being preempted or interrupted - * by an irq. Fast path operations rely on lockless operations instead. + * This lock protects fastpath operations on the percpu sheaves. On !RT it + * only disables preemption and does no atomic operations. 
As long as the main + * or spare sheaf can handle the allocation or free, there is no other + * overhead. * - * On PREEMPT_RT, the local lock neither disables interrupts nor preemption - * which means the lockless fastpath cannot be used as it might interfere with - * an in-progress slow path operations. In this case the local lock is always - * taken but it still utilizes the freelist for the common operations. + * node->barn->lock (spinlock) * - * lockless fastpaths + * This lock protects the operations on per-NUMA-node barn. It can quickly + * serve an empty or full sheaf if available, and avoid more expensive refill + * or flush operation. * - * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) - * are fully lockless when satisfied from the percpu slab (and when - * cmpxchg_double is possible to use, otherwise slab_lock is taken). - * They also don't disable preemption or migration or irqs. They rely on - * the transaction id (tid) field to detect being preempted or moved to - * another cpu. + * Lockless freeing + * + * Objects may have to be freed to their slabs when they are from a remote + * node (where we want to avoid filling local sheaves with remote objects) + * or when there are too many full sheaves. On architectures supporting + * cmpxchg_double this is done by a lockless update of slab's freelist and + * counters, otherwise slab_lock is taken. This only needs to take the + * list_lock if it's a first free to a full slab, or when a slab becomes empty + * after the free. * * irq, preemption, migration considerations * - * Interrupts are disabled as part of list_lock or local_lock operations, or + * Interrupts are disabled as part of list_lock or barn lock operations, or * around the slab_lock operation, in order to make the slab allocator safe * to use in the context of an irq. + * Preemption is disabled as part of local_trylock operations. 
+ * kmalloc_nolock() and kfree_nolock() are safe in NMI context but see + * their limitations. * - * In addition, preemption (or migration on PREEMPT_RT) is disabled in the - * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the - * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer - * doesn't have to be revalidated in each section protected by the local lock. - * - * SLUB assigns one slab for allocation to each processor. - * Allocations only occur from these slabs called cpu slabs. + * SLUB assigns two object arrays called sheaves for caching allocations and + * frees on each cpu, with a NUMA node shared barn for balancing between cpus. + * Allocations and frees are primarily served from these sheaves. * * Slabs with free elements are kept on a partial list and during regular * operations no list for full slabs is used. If an object in a full slab is @@ -160,25 +170,8 @@ * We track full slabs for debugging purposes though because otherwise we * cannot scan all objects. * - * Slabs are freed when they become empty. Teardown and setup is - * minimal so we rely on the page allocators per cpu caches for - * fast frees and allocs. - * - * slab->frozen The slab is frozen and exempt from list processing. - * This means that the slab is dedicated to a purpose - * such as satisfying allocations for a specific - * processor. Objects may be freed in the slab while - * it is frozen but slab_free will then skip the usual - * list operations. It is up to the processor holding - * the slab to integrate the slab into the slab lists - * when the slab is no longer needed. - * - * One use of this flag is to mark slabs that are - * used for allocations. Then such a slab becomes a cpu - * slab. The cpu slab may be equipped with an additional - * freelist that allows lockless access to - * free objects in addition to the regular freelist - * that requires the slab lock. + * Slabs are freed when they become empty. 
Teardown and setup is minimal so we + * rely on the page allocators per cpu caches for fast frees and allocs. * * SLAB_DEBUG_FLAGS Slab requires special handling due to debug * options set. This moves slab handling out of From b16af1c81277dceb96812305b471296af9adf5d0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:58 +0100 Subject: [PATCH 23/39] slab: remove frozen slab checks from __slab_free() Currently slabs are only frozen after consistency checks failed. This can happen only in caches with debugging enabled, and those use free_to_partial_list() for freeing. The non-debug operation of __slab_free() can thus stop considering the frozen field, and we can remove the FREE_FROZEN stat. Reviewed-by: Suren Baghdasaryan Reviewed-by: Hao Li Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index d782ceb8a2ba..bbc9d56484e5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -338,7 +338,6 @@ enum stat_item { FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ FREE_FASTPATH, /* Free to cpu slab */ FREE_SLOWPATH, /* Freeing not to cpu slab */ - FREE_FROZEN, /* Freeing to frozen slab */ FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ FREE_REMOVE_PARTIAL, /* Freeing removes last object */ ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ @@ -5109,7 +5108,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { - bool was_frozen, was_full; + bool was_full; struct freelist_counters old, new; struct kmem_cache_node *n = NULL; unsigned long flags; @@ -5132,7 +5131,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, old.counters = slab->counters; was_full = (old.freelist == NULL); - was_frozen = old.frozen; set_freepointer(s, tail, old.freelist); @@ -5145,7 +5143,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * to (due to 
not being full anymore) the partial list. * Unless it's frozen. */ - if ((!new.inuse || was_full) && !was_frozen) { + if (!new.inuse || was_full) { n = get_node(s, slab_nid(slab)); /* @@ -5164,20 +5162,10 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); if (likely(!n)) { - - if (likely(was_frozen)) { - /* - * The list lock was not taken therefore no list - * activity can be necessary. - */ - stat(s, FREE_FROZEN); - } - /* - * In other cases we didn't take the list_lock because the slab - * was already on the partial list and will remain there. + * We didn't take the list_lock because the slab was already on + * the partial list and will remain there. */ - return; } @@ -8766,7 +8754,6 @@ STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); -STAT_ATTR(FREE_FROZEN, free_frozen); STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); @@ -8871,7 +8858,6 @@ static struct attribute *slab_attrs[] = { &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, - &free_frozen_attr.attr, &free_add_partial_attr.attr, &free_remove_partial_attr.attr, &alloc_from_partial_attr.attr, From fb016a5ec70ea9c734bde73ef9e3e82e201f5ab5 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:52:59 +0100 Subject: [PATCH 24/39] mm/slub: remove DEACTIVATE_TO_* stat items The cpu slabs and their deactivations were removed, so remove the unused stat items. Weirdly enough the values were also used to control __add_partial() adding to head or tail of the list, so replace that with a new enum add_mode, which is cleaner. 
Reviewed-by: Suren Baghdasaryan Reviewed-by: Hao Li Reviewed-by: Harry Yoo Signed-off-by: Vlastimil Babka --- mm/slub.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index bbc9d56484e5..2ec4bcfb3759 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -329,6 +329,11 @@ static void debugfs_slab_add(struct kmem_cache *); static inline void debugfs_slab_add(struct kmem_cache *s) { } #endif +enum add_mode { + ADD_TO_HEAD, + ADD_TO_TAIL, +}; + enum stat_item { ALLOC_PCS, /* Allocation from percpu sheaf */ ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -348,8 +353,6 @@ enum stat_item { CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ DEACTIVATE_BYPASS, /* Implicit deactivation */ ORDER_FALLBACK, /* Number of times fallback was necessary */ @@ -3270,10 +3273,10 @@ static inline void slab_clear_node_partial(struct slab *slab) * Management of partially allocated slabs. 
*/ static inline void -__add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) +__add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode) { n->nr_partial++; - if (tail == DEACTIVATE_TO_TAIL) + if (mode == ADD_TO_TAIL) list_add_tail(&slab->slab_list, &n->partial); else list_add(&slab->slab_list, &n->partial); @@ -3281,10 +3284,10 @@ __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) } static inline void add_partial(struct kmem_cache_node *n, - struct slab *slab, int tail) + struct slab *slab, enum add_mode mode) { lockdep_assert_held(&n->list_lock); - __add_partial(n, slab, tail); + __add_partial(n, slab, mode); } static inline void remove_partial(struct kmem_cache_node *n, @@ -3377,7 +3380,7 @@ static void *alloc_single_from_new_slab(struct kmem_cache *s, struct slab *slab, if (slab->inuse == slab->objects) add_full(s, n, slab); else - add_partial(n, slab, DEACTIVATE_TO_HEAD); + add_partial(n, slab, ADD_TO_HEAD); inc_slabs_node(s, nid, slab->objects); spin_unlock_irqrestore(&n->list_lock, flags); @@ -3999,7 +4002,7 @@ static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, n = get_node(s, slab_nid(slab)); spin_lock_irqsave(&n->list_lock, flags); } - add_partial(n, slab, DEACTIVATE_TO_HEAD); + add_partial(n, slab, ADD_TO_HEAD); spin_unlock_irqrestore(&n->list_lock, flags); } @@ -5070,7 +5073,7 @@ static noinline void free_to_partial_list( /* was on full list */ remove_full(s, n, slab); if (!slab_free) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); + add_partial(n, slab, ADD_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } else if (slab_free) { @@ -5190,7 +5193,7 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, * then add it. 
*/ if (unlikely(was_full)) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); + add_partial(n, slab, ADD_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } spin_unlock_irqrestore(&n->list_lock, flags); @@ -6605,7 +6608,7 @@ __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int mi continue; list_del(&slab->slab_list); - add_partial(n, slab, DEACTIVATE_TO_HEAD); + add_partial(n, slab, ADD_TO_HEAD); } spin_unlock_irqrestore(&n->list_lock, flags); @@ -7072,7 +7075,7 @@ static void early_kmem_cache_node_alloc(int node) * No locks need to be taken here as it has just been * initialized and there is no concurrent access. */ - __add_partial(n, slab, DEACTIVATE_TO_HEAD); + __add_partial(n, slab, ADD_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -8764,8 +8767,6 @@ STAT_ATTR(FREE_SLAB, free_slab); STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); STAT_ATTR(DEACTIVATE_FULL, deactivate_full); STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); -STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); -STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); @@ -8868,8 +8869,6 @@ static struct attribute *slab_attrs[] = { &cpuslab_flush_attr.attr, &deactivate_full_attr.attr, &deactivate_empty_attr.attr, - &deactivate_to_head_attr.attr, - &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, &deactivate_bypass_attr.attr, &order_fallback_attr.attr, From 6f1912181ddfcf851a6670b4fa9c7dfdaf3ed46d Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Fri, 23 Jan 2026 07:53:00 +0100 Subject: [PATCH 25/39] mm/slub: cleanup and repurpose some stat items A number of stat items related to cpu slabs became unused, remove them. Two of those were ALLOC_FASTPATH and FREE_FASTPATH. 
But instead of removing those, use them instead of ALLOC_PCS and FREE_PCS, since sheaves are the new (and only) fastpaths. Remove the recently added _PCS variants instead. Change where FREE_SLOWPATH is counted so that it only counts freeing of objects by slab users that (for whatever reason) do not go to a percpu sheaf, and not all (including internal) callers of __slab_free(). Thus sheaf flushing (already counted by SHEAF_FLUSH) does not affect FREE_SLOWPATH anymore. This matches how ALLOC_SLOWPATH doesn't count sheaf refills (counted by SHEAF_REFILL). Reviewed-by: Hao Li Signed-off-by: Vlastimil Babka --- mm/slub.c | 83 +++++++++++++++++-------------------------------------- 1 file changed, 26 insertions(+), 57 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 2ec4bcfb3759..eb1f52a79999 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -335,33 +335,19 @@ enum add_mode { }; enum stat_item { - ALLOC_PCS, /* Allocation from percpu sheaf */ - ALLOC_FASTPATH, /* Allocation from cpu slab */ - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ - FREE_PCS, /* Free to percpu sheaf */ + ALLOC_FASTPATH, /* Allocation from percpu sheaves */ + ALLOC_SLOWPATH, /* Allocation from partial or new slab */ FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ - FREE_FASTPATH, /* Free to cpu slab */ - FREE_SLOWPATH, /* Freeing not to cpu slab */ + FREE_FASTPATH, /* Free to percpu sheaves */ + FREE_SLOWPATH, /* Free to a slab */ FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ FREE_REMOVE_PARTIAL, /* Freeing removes last object */ - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ + ALLOC_SLAB, /* New slab acquired from page allocator */ + ALLOC_NODE_MISMATCH, /* Requested node different from cpu sheaf */ FREE_SLAB, /* Slab freed to the
page allocator */ - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ - DEACTIVATE_BYPASS, /* Implicit deactivation */ ORDER_FALLBACK, /* Number of times fallback was necessary */ - CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ SHEAF_FLUSH, /* Objects flushed from a sheaf */ SHEAF_REFILL, /* Objects refilled to a sheaf */ SHEAF_ALLOC, /* Allocation of an empty sheaf */ @@ -4350,8 +4336,10 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) * We assume the percpu sheaves contain only local objects although it's * not completely guaranteed, so we verify later. 
*/ - if (unlikely(node_requested && node != numa_mem_id())) + if (unlikely(node_requested && node != numa_mem_id())) { + stat(s, ALLOC_NODE_MISMATCH); return NULL; + } if (!local_trylock(&s->cpu_sheaves->lock)) return NULL; @@ -4374,6 +4362,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) */ if (page_to_nid(virt_to_page(object)) != node) { local_unlock(&s->cpu_sheaves->lock); + stat(s, ALLOC_NODE_MISMATCH); return NULL; } } @@ -4382,7 +4371,7 @@ void *alloc_from_pcs(struct kmem_cache *s, gfp_t gfp, int node) local_unlock(&s->cpu_sheaves->lock); - stat(s, ALLOC_PCS); + stat(s, ALLOC_FASTPATH); return object; } @@ -4454,7 +4443,7 @@ do_alloc: local_unlock(&s->cpu_sheaves->lock); - stat_add(s, ALLOC_PCS, batch); + stat_add(s, ALLOC_FASTPATH, batch); allocated += batch; @@ -5117,8 +5106,6 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, unsigned long flags; bool on_node_partial; - stat(s, FREE_SLOWPATH); - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { free_to_partial_list(s, slab, head, tail, cnt, addr); return; @@ -5422,7 +5409,7 @@ bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) local_unlock(&s->cpu_sheaves->lock); - stat(s, FREE_PCS); + stat(s, FREE_FASTPATH); return true; } @@ -5690,7 +5677,7 @@ do_free: local_unlock(&s->cpu_sheaves->lock); - stat_add(s, FREE_PCS, batch); + stat_add(s, FREE_FASTPATH, batch); if (batch < size) { p += batch; @@ -5712,10 +5699,12 @@ no_empty: */ fallback: __kmem_cache_free_bulk(s, size, p); + stat_add(s, FREE_SLOWPATH, size); flush_remote: if (remote_nr) { __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); + stat_add(s, FREE_SLOWPATH, remote_nr); if (i < size) { remote_nr = 0; goto next_remote_batch; @@ -5769,6 +5758,7 @@ static void free_deferred_objects(struct irq_work *work) set_freepointer(s, x, NULL); __slab_free(s, slab, x, x, 1, _THIS_IP_); + stat(s, FREE_SLOWPATH); } } @@ -5810,6 +5800,7 @@ void slab_free(struct kmem_cache *s, struct slab *slab, 
void *object, } __slab_free(s, slab, object, object, 1, addr); + stat(s, FREE_SLOWPATH); } #ifdef CONFIG_MEMCG @@ -5832,8 +5823,10 @@ void slab_free_bulk(struct kmem_cache *s, struct slab *slab, void *head, * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. */ - if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) { __slab_free(s, slab, head, tail, cnt, addr); + stat_add(s, FREE_SLOWPATH, cnt); + } } #ifdef CONFIG_SLUB_RCU_DEBUG @@ -5858,8 +5851,10 @@ static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) return; /* resume freeing */ - if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) { __slab_free(s, slab, object, object, 1, _THIS_IP_); + stat(s, FREE_SLOWPATH); + } } #endif /* CONFIG_SLUB_RCU_DEBUG */ @@ -5867,6 +5862,7 @@ static void slab_free_after_rcu_debug(struct rcu_head *rcu_head) void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) { __slab_free(cache, virt_to_slab(x), x, x, 1, addr); + stat(cache, FREE_SLOWPATH); } #endif @@ -6746,6 +6742,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, i = refill_objects(s, p, flags, size, size); if (i < size) goto error; + stat_add(s, ALLOC_SLOWPATH, i); } return i; @@ -8749,33 +8746,19 @@ static ssize_t text##_store(struct kmem_cache *s, \ } \ SLAB_ATTR(text); \ -STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); -STAT_ATTR(FREE_PCS, free_cpu_sheaf); STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); STAT_ATTR(FREE_FASTPATH, free_fastpath); STAT_ATTR(FREE_SLOWPATH, free_slowpath); STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); -STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 
STAT_ATTR(ALLOC_SLAB, alloc_slab); -STAT_ATTR(ALLOC_REFILL, alloc_refill); STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); -STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); -STAT_ATTR(DEACTIVATE_FULL, deactivate_full); -STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); -STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); -STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); -STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); -STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); -STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); -STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); -STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); STAT_ATTR(SHEAF_FLUSH, sheaf_flush); STAT_ATTR(SHEAF_REFILL, sheaf_refill); STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); @@ -8851,33 +8834,19 @@ static struct attribute *slab_attrs[] = { &remote_node_defrag_ratio_attr.attr, #endif #ifdef CONFIG_SLUB_STATS - &alloc_cpu_sheaf_attr.attr, &alloc_fastpath_attr.attr, &alloc_slowpath_attr.attr, - &free_cpu_sheaf_attr.attr, &free_rcu_sheaf_attr.attr, &free_rcu_sheaf_fail_attr.attr, &free_fastpath_attr.attr, &free_slowpath_attr.attr, &free_add_partial_attr.attr, &free_remove_partial_attr.attr, - &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, - &alloc_refill_attr.attr, &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, - &cpuslab_flush_attr.attr, - &deactivate_full_attr.attr, - &deactivate_empty_attr.attr, - &deactivate_remote_frees_attr.attr, - &deactivate_bypass_attr.attr, &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, - &cmpxchg_double_cpu_fail_attr.attr, - &cpu_partial_alloc_attr.attr, - &cpu_partial_free_attr.attr, - &cpu_partial_node_attr.attr, - &cpu_partial_drain_attr.attr, &sheaf_flush_attr.attr, &sheaf_refill_attr.attr, &sheaf_alloc_attr.attr, From 40fd0acc45d06709b3b1eea77e50e13f4145dff0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 29 Jan 
2026 10:07:57 +0100 Subject: [PATCH 26/39] slub: avoid list_lock contention from __refill_objects_any() Kernel test robot has reported a regression in the patch "slab: refill sheaves from all nodes". When taken in isolation like this, there is indeed a tradeoff - we prefer to use remote objects prior to allocating new local slabs. It is replicating a behavior that existed before sheaves for replenishing cpu (partial) slabs - now called get_from_any_partial() to allocate a single object. So the possibility of allocating remote objects is intended even if remote accesses are then slower. But the profiles in the report also suggested a contention on the list_lock spinlock. And that's something we can try to avoid without much tradeoff - if someone else has the spin_lock, it's more likely they are allocating from the node than freeing to it, so we can skip it even if it means allocating a new local slab - contributing to that lock's contention isn't worth it. It should not result in partial slabs accumulating on the remote node. Thus add an allow_spin parameter to __refill_objects_node() and get_partial_node_bulk() to make the attempts from __refill_objects_any() use only a trylock. 
Reported-by: kernel test robot Link: https://lore.kernel.org/oe-lkp/202601132136.77efd6d7-lkp@intel.com Link: https://patch.msgid.link/20260129-b4-refill_any_trylock-v1-1-de7420b25840@suse.cz Signed-off-by: Vlastimil Babka --- mm/slub.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index eb1f52a79999..ca3db3ae1afb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3378,7 +3378,8 @@ static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); static bool get_partial_node_bulk(struct kmem_cache *s, struct kmem_cache_node *n, - struct partial_bulk_context *pc) + struct partial_bulk_context *pc, + bool allow_spin) { struct slab *slab, *slab2; unsigned int total_free = 0; @@ -3390,7 +3391,10 @@ static bool get_partial_node_bulk(struct kmem_cache *s, INIT_LIST_HEAD(&pc->slabs); - spin_lock_irqsave(&n->list_lock, flags); + if (allow_spin) + spin_lock_irqsave(&n->list_lock, flags); + else if (!spin_trylock_irqsave(&n->list_lock, flags)) + return false; list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { struct freelist_counters flc; @@ -6544,7 +6548,8 @@ EXPORT_SYMBOL(kmem_cache_free_bulk); static unsigned int __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, - unsigned int max, struct kmem_cache_node *n) + unsigned int max, struct kmem_cache_node *n, + bool allow_spin) { struct partial_bulk_context pc; struct slab *slab, *slab2; @@ -6556,7 +6561,7 @@ __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int mi pc.min_objects = min; pc.max_objects = max; - if (!get_partial_node_bulk(s, n, &pc)) + if (!get_partial_node_bulk(s, n, &pc, allow_spin)) return 0; list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { @@ -6650,7 +6655,8 @@ __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min n->nr_partial <= s->min_partial) continue; - r = __refill_objects_node(s, p, gfp, min, max, n); + r = __refill_objects_node(s, 
p, gfp, min, max, n, + /* allow_spin = */ false); refilled += r; if (r >= min) { @@ -6691,7 +6697,8 @@ refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, return 0; refilled = __refill_objects_node(s, p, gfp, min, max, - get_node(s, local_node)); + get_node(s, local_node), + /* allow_spin = */ true); if (refilled >= min) return refilled; From 280ea9c3154b2af7d841f992c9fc79e9d6534e03 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Mon, 26 Jan 2026 21:57:14 +0900 Subject: [PATCH 27/39] mm/slab: avoid allocating slabobj_ext array from its own slab When allocating slabobj_ext array in alloc_slab_obj_exts(), the array can be allocated from the same slab we're allocating the array for. This led to obj_exts_in_slab() incorrectly returning true [1], although the array is not allocated from wasted space of the slab. Vlastimil Babka observed that this problem should be fixed even when ignoring its incompatibility with obj_exts_in_slab(), because it creates slabs that are never freed as there is always at least one allocated object. To avoid this, use the next kmalloc size or large kmalloc when the array can be allocated from the same cache we're allocating the array for. In case of random kmalloc caches, there are multiple kmalloc caches for the same size and the cache is selected based on the caller address. Because it is fragile to ensure the same caller address is passed to kmalloc_slab(), kmalloc_noprof(), and kmalloc_node_noprof(), bump the size to (s->object_size + 1) when the sizes are equal, instead of directly comparing the kmem_cache pointers. Note that this doesn't happen when memory allocation profiling is disabled, as when the allocation of the array is triggered by memory cgroup (KMALLOC_CGROUP), the array is allocated from KMALLOC_NORMAL. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202601231457.f7b31e09-lkp@intel.com [1] Cc: stable@vger.kernel.org Fixes: 4b8736964640 ("mm/slab: add allocation accounting into slab allocation and free paths") Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260126125714.88008-1-harry.yoo@oracle.com Reviewed-by: Hao Li Signed-off-by: Vlastimil Babka --- mm/slub.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index afc3e511ff39..65b6d07ef20e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2092,6 +2092,49 @@ static inline void init_slab_obj_exts(struct slab *slab) slab->obj_exts = 0; } +/* + * Calculate the allocation size for slabobj_ext array. + * + * When memory allocation profiling is enabled, the obj_exts array + * could be allocated from the same slab cache it's being allocated for. + * This would prevent the slab from ever being freed because it would + * always contain at least one allocated object (its own obj_exts array). + * + * To avoid this, increase the allocation size when we detect the array + * may come from the same cache, forcing it to use a different cache. + */ +static inline size_t obj_exts_alloc_size(struct kmem_cache *s, + struct slab *slab, gfp_t gfp) +{ + size_t sz = sizeof(struct slabobj_ext) * slab->objects; + struct kmem_cache *obj_exts_cache; + + /* + * slabobj_ext arrays for KMALLOC_CGROUP allocations + * are served from KMALLOC_NORMAL caches. + */ + if (!mem_alloc_profiling_enabled()) + return sz; + + if (sz > KMALLOC_MAX_CACHE_SIZE) + return sz; + + if (!is_kmalloc_normal(s)) + return sz; + + obj_exts_cache = kmalloc_slab(sz, NULL, gfp, 0); + /* + * We can't simply compare s with obj_exts_cache, because random kmalloc + * caches have multiple caches per size, selected by caller address. + * Since caller address may differ between kmalloc_slab() and actual + * allocation, bump size when sizes are equal.
+ */ + if (s->object_size == obj_exts_cache->object_size) + return obj_exts_cache->object_size + 1; + + return sz; +} + int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, gfp_t gfp, bool new_slab) { @@ -2100,26 +2143,26 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, unsigned long new_exts; unsigned long old_exts; struct slabobj_ext *vec; + size_t sz; gfp &= ~OBJCGS_CLEAR_MASK; /* Prevent recursive extension vector allocation */ gfp |= __GFP_NO_OBJ_EXT; + sz = obj_exts_alloc_size(s, slab, gfp); + /* * Note that allow_spin may be false during early boot and its * restricted GFP_BOOT_MASK. Due to kmalloc_nolock() only supporting * architectures with cmpxchg16b, early obj_exts will be missing for * very early allocations on those. */ - if (unlikely(!allow_spin)) { - size_t sz = objects * sizeof(struct slabobj_ext); - + if (unlikely(!allow_spin)) vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT, slab_nid(slab)); - } else { - vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, - slab_nid(slab)); - } + else + vec = kmalloc_node(sz, gfp | __GFP_ZERO, slab_nid(slab)); + if (!vec) { /* * Try to mark vectors which failed to allocate. @@ -2133,6 +2176,9 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, return -ENOMEM; } + VM_WARN_ON_ONCE(virt_to_slab(vec) != NULL && + virt_to_slab(vec)->slab_cache == s); + new_exts = (unsigned long)vec; if (unlikely(!allow_spin)) new_exts |= OBJEXTS_NOSPIN_ALLOC; From 9346ee2b53936758afe49519318865dd7c2b1843 Mon Sep 17 00:00:00 2001 From: Hao Li Date: Mon, 29 Dec 2025 20:24:39 +0800 Subject: [PATCH 28/39] slub: clarify object field layout comments The comments above check_pad_bytes() document the field layout of a single object. Rewrite them to improve clarity and precision. Also update an outdated comment in calculate_sizes(). 
Suggested-by: Harry Yoo Acked-by: Harry Yoo Signed-off-by: Hao Li Link: https://patch.msgid.link/20251229122415.192377-1-hao.li@linux.dev Signed-off-by: Vlastimil Babka --- mm/slub.c | 82 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 65b6d07ef20e..152fe53d0fb5 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1344,44 +1344,60 @@ skip_bug_print: } /* - * Object layout: + * Object field layout: * - * object address - * Bytes of the object to be managed. - * If the freepointer may overlay the object then the free - * pointer is at the middle of the object. + * [Left redzone padding] (if SLAB_RED_ZONE) + * - Field size: s->red_left_pad + * - Immediately precedes each object when SLAB_RED_ZONE is set. + * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and + * 0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE. * - * Poisoning uses 0x6b (POISON_FREE) and the last byte is - * 0xa5 (POISON_END) + * [Object bytes] (object address starts here) + * - Field size: s->object_size + * - Object payload bytes. + * - If the freepointer may overlap the object, it is stored inside + * the object (typically near the middle). + * - Poisoning uses 0x6b (POISON_FREE) and the last byte is + * 0xa5 (POISON_END) when __OBJECT_POISON is enabled. * - * object + s->object_size - * Padding to reach word boundary. This is also used for Redzoning. - * Padding is extended by another word if Redzoning is enabled and - * object_size == inuse. + * [Word-align padding] (right redzone when SLAB_RED_ZONE is set) + * - Field size: s->inuse - s->object_size + * - If redzoning is enabled and ALIGN(size, sizeof(void *)) adds no + * padding, explicitly extend by one word so the right redzone is + * non-empty. + * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and + * 0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE. 
* - * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with - * 0xcc (SLUB_RED_ACTIVE) for objects in use. + * [Metadata starts at object + s->inuse] + * - A. freelist pointer (if freeptr_outside_object) + * - B. alloc tracking (SLAB_STORE_USER) + * - C. free tracking (SLAB_STORE_USER) + * - D. original request size (SLAB_KMALLOC && SLAB_STORE_USER) + * - E. KASAN metadata (if enabled) * - * object + s->inuse - * Meta data starts here. + * [Mandatory padding] (if CONFIG_SLUB_DEBUG && SLAB_RED_ZONE) + * - One mandatory debug word to guarantee a minimum poisoned gap + * between metadata and the next object, independent of alignment. + * - Filled with 0x5a (POISON_INUSE) when SLAB_POISON is set. + * [Final alignment padding] + * - Any bytes added by ALIGN(size, s->align) to reach s->size. + * - Filled with 0x5a (POISON_INUSE) when SLAB_POISON is set. * - * A. Free pointer (if we cannot overwrite object on free) - * B. Tracking data for SLAB_STORE_USER - * C. Original request size for kmalloc object (SLAB_STORE_USER enabled) - * D. Padding to reach required alignment boundary or at minimum - * one word if debugging is on to be able to detect writes - * before the word boundary. + * Notes: + * - Redzones are filled by init_object() with SLUB_RED_ACTIVE/INACTIVE. + * - Object contents are poisoned with POISON_FREE/END when __OBJECT_POISON. + * - The trailing padding is pre-filled with POISON_INUSE by + * setup_slab_debug() when SLAB_POISON is set, and is validated by + * check_pad_bytes(). + * - The first object pointer is slab_address(slab) + + * (s->red_left_pad if redzoning); subsequent objects are reached by + * adding s->size each time. * - * Padding is done using 0x5a (POISON_INUSE) - * - * object + s->size - * Nothing is used beyond s->size. - * - * If slabcaches are merged then the object_size and inuse boundaries are mostly - * ignored. And therefore no slab options that rely on these boundaries - * may be used with merged slabcaches. 
+ * If a slab cache flag relies on specific metadata to exist at a fixed + * offset, the flag must be included in SLAB_NEVER_MERGE to prevent merging. + * Otherwise, the cache would misbehave as s->object_size and s->inuse are + * adjusted during cache merging (see __kmem_cache_alias()). */ - static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) { unsigned long off = get_info_end(s); /* The end of info */ @@ -7967,9 +7983,9 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) /* - * If we are Redzoning then check if there is some space between the - * end of the object and the free pointer. If not then add an - * additional word to have some bytes to store Redzone information. + * If we are Redzoning and there is no space between the end of the + * object and the following fields, add one word so the right Redzone + * is non-empty. */ if ((flags & SLAB_RED_ZONE) && size == s->object_size) size += sizeof(void *); From b85f369b81aed457acbea4ad3314218254a72fd2 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 13 Jan 2026 15:18:37 +0900 Subject: [PATCH 29/39] mm/slab: use unsigned long for orig_size to ensure proper metadata align When both KASAN and SLAB_STORE_USER are enabled, accesses to struct kasan_alloc_meta fields can be misaligned on 64-bit architectures. This occurs because orig_size is currently defined as unsigned int, which only guarantees 4-byte alignment. When struct kasan_alloc_meta is placed after orig_size, it may end up at a 4-byte boundary rather than the required 8-byte boundary on 64-bit systems. Note that 64-bit architectures without HAVE_EFFICIENT_UNALIGNED_ACCESS are assumed to require 64-bit accesses to be 64-bit aligned. See HAVE_64BIT_ALIGNED_ACCESS and commit adab66b71abf ("Revert: "ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS"") for more details. Change orig_size from unsigned int to unsigned long to ensure proper alignment for any subsequent metadata. 
This should not waste additional memory because kmalloc objects are already aligned to at least ARCH_KMALLOC_MINALIGN. Closes: https://lore.kernel.org/all/aPrLF0OUK651M4dk@hyeyoo Suggested-by: Andrey Ryabinin Cc: stable@vger.kernel.org Fixes: 6edf2576a6cc ("mm/slub: enable debugging memory wasting of kmalloc") Signed-off-by: Harry Yoo Closes: https://lore.kernel.org/all/aPrLF0OUK651M4dk@hyeyoo/ Link: https://patch.msgid.link/20260113061845.159790-2-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- mm/slub.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 152fe53d0fb5..2c000dddcf74 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -854,7 +854,7 @@ static inline bool slab_update_freelist(struct kmem_cache *s, struct slab *slab, * request size in the meta data area, for better debug and sanity check. */ static inline void set_orig_size(struct kmem_cache *s, - void *object, unsigned int orig_size) + void *object, unsigned long orig_size) { void *p = kasan_reset_tag(object); @@ -864,10 +864,10 @@ static inline void set_orig_size(struct kmem_cache *s, p += get_info_end(s); p += sizeof(struct track) * 2; - *(unsigned int *)p = orig_size; + *(unsigned long *)p = orig_size; } -static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) +static inline unsigned long get_orig_size(struct kmem_cache *s, void *object) { void *p = kasan_reset_tag(object); @@ -880,7 +880,7 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) p += get_info_end(s); p += sizeof(struct track) * 2; - return *(unsigned int *)p; + return *(unsigned long *)p; } #ifdef CONFIG_SLUB_DEBUG @@ -1195,7 +1195,7 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) off += 2 * sizeof(struct track); if (slub_debug_orig_size(s)) - off += sizeof(unsigned int); + off += sizeof(unsigned long); off += kasan_metadata_size(s, false); @@ -1407,7 +1407,7 @@ static int check_pad_bytes(struct 
kmem_cache *s, struct slab *slab, u8 *p) off += 2 * sizeof(struct track); if (s->flags & SLAB_KMALLOC) - off += sizeof(unsigned int); + off += sizeof(unsigned long); } off += kasan_metadata_size(s, false); @@ -8040,7 +8040,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) /* Save the original kmalloc request size */ if (flags & SLAB_KMALLOC) - size += sizeof(unsigned int); + size += sizeof(unsigned long); } #endif From a13b68d79d5caa5ec0d34b4c0fb2dedf3259fc32 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 13 Jan 2026 15:18:38 +0900 Subject: [PATCH 30/39] mm/slab: allow specifying free pointer offset when using constructor When a slab cache has a constructor, the free pointer is placed after the object because certain fields must not be overwritten even after the object is freed. However, some fields that the constructor does not initialize can safely be overwritten after free. Allow specifying the free pointer offset within the object, reducing the overall object size when some fields can be reused for the free pointer. Adjust the document accordingly. Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260113061845.159790-3-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 30 ++++++++++++++++-------------- mm/slab_common.c | 2 +- mm/slub.c | 6 ++++-- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 2482992248dc..4554c04a9bd7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -299,24 +299,26 @@ struct kmem_cache_args { unsigned int usersize; /** * @freeptr_offset: Custom offset for the free pointer - * in &SLAB_TYPESAFE_BY_RCU caches + * in caches with &SLAB_TYPESAFE_BY_RCU or @ctor * - * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer - * outside of the object. This might cause the object to grow in size. 
- * Cache creators that have a reason to avoid this can specify a custom - * free pointer offset in their struct where the free pointer will be - * placed. + * By default, &SLAB_TYPESAFE_BY_RCU and @ctor caches place the free + * pointer outside of the object. This might cause the object to grow + * in size. Cache creators that have a reason to avoid this can specify + * a custom free pointer offset in their data structure where the free + * pointer will be placed. * - * Note that placing the free pointer inside the object requires the - * caller to ensure that no fields are invalidated that are required to - * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for - * details). + * For caches with &SLAB_TYPESAFE_BY_RCU, the caller must ensure that + * the free pointer does not overlay fields required to guard against + * object recycling (See &SLAB_TYPESAFE_BY_RCU for details). + * + * For caches with @ctor, the caller must ensure that the free pointer + * does not overlay fields initialized by the constructor. + * + * Currently, only caches with &SLAB_TYPESAFE_BY_RCU or @ctor + * may specify @freeptr_offset. * * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset - * is specified, %use_freeptr_offset must be set %true. - * - * Note that @ctor currently isn't supported with custom free pointers - * as a @ctor requires an external free pointer. + * is specified, @use_freeptr_offset must be set %true. 
*/ unsigned int freeptr_offset; /** diff --git a/mm/slab_common.c b/mm/slab_common.c index b6836f8500b6..027bf64c2e35 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -239,7 +239,7 @@ static struct kmem_cache *create_cache(const char *name, err = -EINVAL; if (args->use_freeptr_offset && (args->freeptr_offset >= object_size || - !(flags & SLAB_TYPESAFE_BY_RCU) || + (!(flags & SLAB_TYPESAFE_BY_RCU) && !args->ctor) || !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t)))) goto out; diff --git a/mm/slub.c b/mm/slub.c index 2c000dddcf74..1b7ed91a2f15 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7998,7 +7998,8 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) s->inuse = size; if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || - (flags & SLAB_POISON) || s->ctor || + (flags & SLAB_POISON) || + (s->ctor && !args->use_freeptr_offset) || ((flags & SLAB_RED_ZONE) && (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* @@ -8019,7 +8020,8 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) */ s->offset = size; size += sizeof(void *); - } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { + } else if (((flags & SLAB_TYPESAFE_BY_RCU) || s->ctor) && + args->use_freeptr_offset) { s->offset = args->freeptr_offset; } else { /* From 43d9bb4236fd1dd2e4646bee7f556542eefa422a Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 13 Jan 2026 15:18:39 +0900 Subject: [PATCH 31/39] ext4: specify the free pointer offset for ext4_inode_cache Convert ext4_inode_cache to use the kmem_cache_args interface and specify a free pointer offset. Since ext4_inode_cache uses a constructor, the free pointer would be placed after the object to prevent overwriting fields used by the constructor. However, some fields such as ->i_flags are not used by the constructor and can safely be repurposed for the free pointer. Specify the free pointer offset at i_flags to reduce the object size. 
Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260113061845.159790-4-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- fs/ext4/super.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 87205660c5d0..6f1c2c497871 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1491,12 +1491,19 @@ static void init_once(void *foo) static int __init init_inodecache(void) { - ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache", - sizeof(struct ext4_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, - offsetof(struct ext4_inode_info, i_data), - sizeof_field(struct ext4_inode_info, i_data), - init_once); + struct kmem_cache_args args = { + .useroffset = offsetof(struct ext4_inode_info, i_data), + .usersize = sizeof_field(struct ext4_inode_info, i_data), + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct ext4_inode_info, i_flags), + .ctor = init_once, + }; + + ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", + sizeof(struct ext4_inode_info), + &args, + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT); + if (ext4_inode_cachep == NULL) return -ENOMEM; return 0; From 52f1ca8a459a73cf423a0b71b59f0b950e522cab Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 13 Jan 2026 15:18:40 +0900 Subject: [PATCH 32/39] mm/slab: abstract slabobj_ext access via new slab_obj_ext() helper Currently, the slab allocator assumes that slab->obj_exts is a pointer to an array of struct slabobj_ext objects. However, to support storage methods where struct slabobj_ext is embedded within objects, the slab allocator should not make this assumption. Instead of directly dereferencing the slabobj_exts array, abstract access to struct slabobj_ext via helper functions. Introduce a new API for slabobj_ext metadata access: slab_obj_ext(slab, obj_exts, index) - returns the pointer to struct slabobj_ext element at the given index.
Directly dereferencing the return value of slab_obj_exts() is no longer allowed. Instead, slab_obj_ext() must always be used to access individual struct slabobj_ext objects. Convert all users to use these APIs. No functional changes intended. Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260113061845.159790-5-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- mm/memcontrol.c | 23 ++++++++++++++++------- mm/slab.h | 43 +++++++++++++++++++++++++++++++++++++------ mm/slub.c | 45 ++++++++++++++++++++++++++------------------- 3 files changed, 79 insertions(+), 32 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 86f43b7e5f71..276e87090a75 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2596,7 +2596,8 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) * Memcg membership data for each individual object is saved in * slab->obj_exts. */ - struct slabobj_ext *obj_exts; + unsigned long obj_exts; + struct slabobj_ext *obj_ext; unsigned int off; obj_exts = slab_obj_exts(slab); @@ -2604,8 +2605,9 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) return NULL; off = obj_to_index(slab->slab_cache, slab, p); - if (obj_exts[off].objcg) - return obj_cgroup_memcg(obj_exts[off].objcg); + obj_ext = slab_obj_ext(slab, obj_exts, off); + if (obj_ext->objcg) + return obj_cgroup_memcg(obj_ext->objcg); return NULL; } @@ -3191,6 +3193,9 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, } for (i = 0; i < size; i++) { + unsigned long obj_exts; + struct slabobj_ext *obj_ext; + slab = virt_to_slab(p[i]); if (!slab_obj_exts(slab) && @@ -3213,29 +3218,33 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, slab_pgdat(slab), cache_vmstat_idx(s))) return false; + obj_exts = slab_obj_exts(slab); off = obj_to_index(s, slab, p[i]); + obj_ext = slab_obj_ext(slab, obj_exts, off); obj_cgroup_get(objcg); - slab_obj_exts(slab)[off].objcg = objcg; + obj_ext->objcg = 
objcg; } return true; } void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects, struct slabobj_ext *obj_exts) + void **p, int objects, unsigned long obj_exts) { size_t obj_size = obj_full_size(s); for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; + struct slabobj_ext *obj_ext; unsigned int off; off = obj_to_index(s, slab, p[i]); - objcg = obj_exts[off].objcg; + obj_ext = slab_obj_ext(slab, obj_exts, off); + objcg = obj_ext->objcg; if (!objcg) continue; - obj_exts[off].objcg = NULL; + obj_ext->objcg = NULL; refill_obj_stock(objcg, obj_size, true, -obj_size, slab_pgdat(slab), cache_vmstat_idx(s)); obj_cgroup_put(objcg); diff --git a/mm/slab.h b/mm/slab.h index 0993800fcced..4935602b3fce 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -507,10 +507,12 @@ static inline bool slab_in_kunit_test(void) { return false; } * associated with a slab. * @slab: a pointer to the slab struct * - * Returns a pointer to the object extension vector associated with the slab, - * or NULL if no such vector has been associated yet. + * Returns the address of the object extension vector associated with the slab, + * or zero if no such vector has been associated yet. + * Do not dereference the return value directly; use slab_obj_ext() to access + * its elements. */ -static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) +static inline unsigned long slab_obj_exts(struct slab *slab) { unsigned long obj_exts = READ_ONCE(slab->obj_exts); @@ -523,7 +525,29 @@ static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab)); VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); #endif - return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); + + return obj_exts & ~OBJEXTS_FLAGS_MASK; +} + +/* + * slab_obj_ext - get the pointer to the slab object extension metadata + * associated with an object in a slab. 
+ * @slab: a pointer to the slab struct + * @obj_exts: a pointer to the object extension vector + * @index: an index of the object + * + * Returns a pointer to the object extension associated with the object. + */ +static inline struct slabobj_ext *slab_obj_ext(struct slab *slab, + unsigned long obj_exts, + unsigned int index) +{ + struct slabobj_ext *obj_ext; + + VM_WARN_ON_ONCE(obj_exts != slab_obj_exts(slab)); + + obj_ext = (struct slabobj_ext *)obj_exts; + return &obj_ext[index]; } int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, @@ -531,7 +555,14 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, #else /* CONFIG_SLAB_OBJ_EXT */ -static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) +static inline unsigned long slab_obj_exts(struct slab *slab) +{ + return 0; +} + +static inline struct slabobj_ext *slab_obj_ext(struct slab *slab, + unsigned long obj_exts, + unsigned int index) { return NULL; } @@ -548,7 +579,7 @@ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, gfp_t flags, size_t size, void **p); void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, - void **p, int objects, struct slabobj_ext *obj_exts); + void **p, int objects, unsigned long obj_exts); #endif void kvfree_rcu_cb(struct rcu_head *head); diff --git a/mm/slub.c b/mm/slub.c index 1b7ed91a2f15..09af619c2c69 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2055,7 +2055,7 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) { - struct slabobj_ext *slab_exts; + unsigned long slab_exts; struct slab *obj_exts_slab; obj_exts_slab = virt_to_slab(obj_exts); @@ -2063,13 +2063,15 @@ static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) if (slab_exts) { unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, obj_exts_slab, obj_exts); + struct 
slabobj_ext *ext = slab_obj_ext(obj_exts_slab, + slab_exts, offs); - if (unlikely(is_codetag_empty(&slab_exts[offs].ref))) + if (unlikely(is_codetag_empty(&ext->ref))) return; /* codetag should be NULL here */ - WARN_ON(slab_exts[offs].ref.ct); - set_codetag_empty(&slab_exts[offs].ref); + WARN_ON(ext->ref.ct); + set_codetag_empty(&ext->ref); } } @@ -2237,7 +2239,7 @@ static inline void free_slab_obj_exts(struct slab *slab) { struct slabobj_ext *obj_exts; - obj_exts = slab_obj_exts(slab); + obj_exts = (struct slabobj_ext *)slab_obj_exts(slab); if (!obj_exts) { /* * If obj_exts allocation failed, slab->obj_exts is set to @@ -2284,26 +2286,29 @@ static inline void free_slab_obj_exts(struct slab *slab) #ifdef CONFIG_MEM_ALLOC_PROFILING static inline struct slabobj_ext * -prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) +prepare_slab_obj_ext_hook(struct kmem_cache *s, gfp_t flags, void *p) { struct slab *slab; + unsigned long obj_exts; slab = virt_to_slab(p); - if (!slab_obj_exts(slab) && + obj_exts = slab_obj_exts(slab); + if (!obj_exts && alloc_slab_obj_exts(slab, s, flags, false)) { pr_warn_once("%s, %s: Failed to create slab extension vector!\n", __func__, s->name); return NULL; } - return slab_obj_exts(slab) + obj_to_index(s, slab, p); + obj_exts = slab_obj_exts(slab); + return slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, p)); } /* Should be called only if mem_alloc_profiling_enabled() */ static noinline void __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { - struct slabobj_ext *obj_exts; + struct slabobj_ext *obj_ext; if (!object) return; @@ -2314,14 +2319,14 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) if (flags & __GFP_NO_OBJ_EXT) return; - obj_exts = prepare_slab_obj_exts_hook(s, flags, object); + obj_ext = prepare_slab_obj_ext_hook(s, flags, object); /* * Currently obj_exts is used only for allocation profiling. 
* If other users appear then mem_alloc_profiling_enabled() * check should be added before alloc_tag_add(). */ - if (likely(obj_exts)) - alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); + if (likely(obj_ext)) + alloc_tag_add(&obj_ext->ref, current->alloc_tag, s->size); else alloc_tag_set_inaccurate(current->alloc_tag); } @@ -2338,8 +2343,8 @@ static noinline void __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { - struct slabobj_ext *obj_exts; int i; + unsigned long obj_exts; /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */ if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) @@ -2352,7 +2357,7 @@ __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p for (i = 0; i < objects; i++) { unsigned int off = obj_to_index(s, slab, p[i]); - alloc_tag_sub(&obj_exts[off].ref, s->size); + alloc_tag_sub(&slab_obj_ext(slab, obj_exts, off)->ref, s->size); } } @@ -2411,7 +2416,7 @@ static __fastpath_inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { - struct slabobj_ext *obj_exts; + unsigned long obj_exts; if (!memcg_kmem_online()) return; @@ -2426,7 +2431,8 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, static __fastpath_inline bool memcg_slab_post_charge(void *p, gfp_t flags) { - struct slabobj_ext *slab_exts; + unsigned long obj_exts; + struct slabobj_ext *obj_ext; struct kmem_cache *s; struct page *page; struct slab *slab; @@ -2467,10 +2473,11 @@ bool memcg_slab_post_charge(void *p, gfp_t flags) return true; /* Ignore already charged objects. 
*/ - slab_exts = slab_obj_exts(slab); - if (slab_exts) { + obj_exts = slab_obj_exts(slab); + if (obj_exts) { off = obj_to_index(s, slab, p); - if (unlikely(slab_exts[off].objcg)) + obj_ext = slab_obj_ext(slab, obj_exts, off); + if (unlikely(obj_ext->objcg)) return true; } From 7a8e71bc619d34c7607adef0e368d10421b7d4f6 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 13 Jan 2026 15:18:41 +0900 Subject: [PATCH 33/39] mm/slab: use stride to access slabobj_ext Use a configurable stride value when accessing slab object extension metadata instead of assuming a fixed sizeof(struct slabobj_ext). Store stride value in free bits of slab->counters field. This allows for flexibility in cases where the extension is embedded within slab objects. Since these free bits exist only on 64-bit, any future optimizations that need to change stride value cannot be enabled on 32-bit architectures. Suggested-by: Vlastimil Babka Reviewed-by: Suren Baghdasaryan Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260113061845.159790-6-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- mm/slab.h | 37 +++++++++++++++++++++++++++++++++---- mm/slub.c | 2 ++ 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 4935602b3fce..be0e62c98da0 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -55,6 +55,14 @@ struct freelist_counters { * that the slab was corrupted */ unsigned frozen:1; +#ifdef CONFIG_64BIT + /* + * Some optimizations use free bits in 'counters' field + * to save memory. In case ->stride field is not available, + * such optimizations are disabled. 
+ */ + unsigned short stride; +#endif }; }; }; @@ -529,6 +537,26 @@ static inline unsigned long slab_obj_exts(struct slab *slab) return obj_exts & ~OBJEXTS_FLAGS_MASK; } +#ifdef CONFIG_64BIT +static inline void slab_set_stride(struct slab *slab, unsigned short stride) +{ + slab->stride = stride; +} +static inline unsigned short slab_get_stride(struct slab *slab) +{ + return slab->stride; +} +#else +static inline void slab_set_stride(struct slab *slab, unsigned short stride) +{ + VM_WARN_ON_ONCE(stride != sizeof(struct slabobj_ext)); +} +static inline unsigned short slab_get_stride(struct slab *slab) +{ + return sizeof(struct slabobj_ext); +} +#endif + /* * slab_obj_ext - get the pointer to the slab object extension metadata * associated with an object in a slab. @@ -542,12 +570,9 @@ static inline struct slabobj_ext *slab_obj_ext(struct slab *slab, unsigned long obj_exts, unsigned int index) { - struct slabobj_ext *obj_ext; - VM_WARN_ON_ONCE(obj_exts != slab_obj_exts(slab)); - obj_ext = (struct slabobj_ext *)obj_exts; - return &obj_ext[index]; + return (struct slabobj_ext *)(obj_exts + slab_get_stride(slab) * index); } int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, @@ -567,6 +592,10 @@ static inline struct slabobj_ext *slab_obj_ext(struct slab *slab, return NULL; } +static inline void slab_set_stride(struct slab *slab, unsigned int stride) { } +static inline unsigned int slab_get_stride(struct slab *slab) { return 0; } + + #endif /* CONFIG_SLAB_OBJ_EXT */ static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) diff --git a/mm/slub.c b/mm/slub.c index 09af619c2c69..020a2c4afb45 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2206,6 +2206,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, retry: old_exts = READ_ONCE(slab->obj_exts); handle_failed_objexts_alloc(old_exts, vec, objects); + slab_set_stride(slab, sizeof(struct slabobj_ext)); + if (new_slab) { /* * If the slab is brand new and nobody can yet access its From 
4b1530f89c28dfbc3ec10b0cb860ec11e4538dbe Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 13 Jan 2026 15:18:42 +0900 Subject: [PATCH 34/39] mm/memcontrol,alloc_tag: handle slabobj_ext access under KASAN poison In the near future, slabobj_ext may reside outside the allocated slab object range within a slab, which could be reported as an out-of-bounds access by KASAN. As suggested by Andrey Konovalov [1], explicitly disable KASAN and KMSAN checks when accessing slabobj_ext within slab allocator, memory profiling, and memory cgroup code. While an alternative approach could be to unpoison slabobj_ext, out-of-bounds accesses outside the slab allocator are generally more common. Move metadata_access_enable()/disable() helpers to mm/slab.h so that they can be used outside mm/slub.c. However, as suggested by Suren Baghdasaryan [2], instead of calling them directly from mm code (which is more prone to errors), change users to access slabobj_ext via get/put APIs: - Users should call get_slab_obj_exts() to access slabobj_ext metadata and call put_slab_obj_exts() when done. - From now on, accessing it outside the section covered by get_slab_obj_exts() ~ put_slab_obj_exts() is illegal. This ensures that accesses to slabobj_ext metadata won't be reported as access violations. Call kasan_reset_tag() in slab_obj_ext() before returning the address to prevent SW or HW tag-based KASAN from reporting false positives.
Suggested-by: Andrey Konovalov Suggested-by: Suren Baghdasaryan Link: https://lore.kernel.org/linux-mm/CA+fCnZezoWn40BaS3cgmCeLwjT+5AndzcQLc=wH3BjMCu6_YCw@mail.gmail.com [1] Link: https://lore.kernel.org/linux-mm/CAJuCfpG=Lb4WhYuPkSpdNO4Ehtjm1YcEEK0OM=3g9i=LxmpHSQ@mail.gmail.com [2] Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260113061845.159790-7-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- mm/memcontrol.c | 12 +++++++-- mm/slab.h | 54 +++++++++++++++++++++++++++++++++++--- mm/slub.c | 69 ++++++++++++++++++++++++------------------------- 3 files changed, 95 insertions(+), 40 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 276e87090a75..2d6dfba540d4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2604,10 +2604,16 @@ struct mem_cgroup *mem_cgroup_from_obj_slab(struct slab *slab, void *p) if (!obj_exts) return NULL; + get_slab_obj_exts(obj_exts); off = obj_to_index(slab->slab_cache, slab, p); obj_ext = slab_obj_ext(slab, obj_exts, off); - if (obj_ext->objcg) - return obj_cgroup_memcg(obj_ext->objcg); + if (obj_ext->objcg) { + struct obj_cgroup *objcg = obj_ext->objcg; + + put_slab_obj_exts(obj_exts); + return obj_cgroup_memcg(objcg); + } + put_slab_obj_exts(obj_exts); return NULL; } @@ -3219,10 +3225,12 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, return false; obj_exts = slab_obj_exts(slab); + get_slab_obj_exts(obj_exts); off = obj_to_index(s, slab, p[i]); obj_ext = slab_obj_ext(slab, obj_exts, off); obj_cgroup_get(objcg); obj_ext->objcg = objcg; + put_slab_obj_exts(obj_exts); } return true; diff --git a/mm/slab.h b/mm/slab.h index be0e62c98da0..d5da2f69e2d5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -508,6 +508,24 @@ bool slab_in_kunit_test(void); static inline bool slab_in_kunit_test(void) { return false; } #endif +/* + * slub is about to manipulate internal object metadata. 
This memory lies + * outside the range of the allocated object, so accessing it would normally + * be reported by kasan as a bounds error. metadata_access_enable() is used + * to tell kasan that these accesses are OK. + */ +static inline void metadata_access_enable(void) +{ + kasan_disable_current(); + kmsan_disable_current(); +} + +static inline void metadata_access_disable(void) +{ + kmsan_enable_current(); + kasan_enable_current(); +} + #ifdef CONFIG_SLAB_OBJ_EXT /* @@ -517,8 +535,22 @@ static inline bool slab_in_kunit_test(void) { return false; } * * Returns the address of the object extension vector associated with the slab, * or zero if no such vector has been associated yet. - * Do not dereference the return value directly; use slab_obj_ext() to access - * its elements. + * Do not dereference the return value directly; use get/put_slab_obj_exts() + * pair and slab_obj_ext() to access individual elements. + * + * Example usage: + * + * obj_exts = slab_obj_exts(slab); + * if (obj_exts) { + * get_slab_obj_exts(obj_exts); + * obj_ext = slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, obj)); + * // do something with obj_ext + * put_slab_obj_exts(obj_exts); + * } + * + * Note that the get/put semantics does not involve reference counting. + * Instead, it updates kasan/kmsan depth so that accesses to slabobj_ext + * won't be reported as access violations. 
*/ static inline unsigned long slab_obj_exts(struct slab *slab) { @@ -537,6 +569,17 @@ static inline unsigned long slab_obj_exts(struct slab *slab) return obj_exts & ~OBJEXTS_FLAGS_MASK; } +static inline void get_slab_obj_exts(unsigned long obj_exts) +{ + VM_WARN_ON_ONCE(!obj_exts); + metadata_access_enable(); +} + +static inline void put_slab_obj_exts(unsigned long obj_exts) +{ + metadata_access_disable(); +} + #ifdef CONFIG_64BIT static inline void slab_set_stride(struct slab *slab, unsigned short stride) { @@ -565,14 +608,19 @@ static inline unsigned short slab_get_stride(struct slab *slab) * @index: an index of the object * * Returns a pointer to the object extension associated with the object. + * Must be called within a section covered by get/put_slab_obj_exts(). */ static inline struct slabobj_ext *slab_obj_ext(struct slab *slab, unsigned long obj_exts, unsigned int index) { + struct slabobj_ext *obj_ext; + VM_WARN_ON_ONCE(obj_exts != slab_obj_exts(slab)); - return (struct slabobj_ext *)(obj_exts + slab_get_stride(slab) * index); + obj_ext = (struct slabobj_ext *)(obj_exts + + slab_get_stride(slab) * index); + return kasan_reset_tag(obj_ext); } int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, diff --git a/mm/slub.c b/mm/slub.c index 020a2c4afb45..b0c50df49cf3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -972,24 +972,6 @@ static slab_flags_t slub_debug; static const char *slub_debug_string __ro_after_init; static int disable_higher_order_debug; -/* - * slub is about to manipulate internal object metadata. This memory lies - * outside the range of the allocated object, so accessing it would normally - * be reported by kasan as a bounds error. metadata_access_enable() is used - * to tell kasan that these accesses are OK. 
- */ -static inline void metadata_access_enable(void) -{ - kasan_disable_current(); - kmsan_disable_current(); -} - -static inline void metadata_access_disable(void) -{ - kmsan_enable_current(); - kasan_enable_current(); -} - /* * Object debugging */ @@ -2055,23 +2037,27 @@ static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) { - unsigned long slab_exts; struct slab *obj_exts_slab; + unsigned long slab_exts; obj_exts_slab = virt_to_slab(obj_exts); slab_exts = slab_obj_exts(obj_exts_slab); if (slab_exts) { + get_slab_obj_exts(slab_exts); unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, obj_exts_slab, obj_exts); struct slabobj_ext *ext = slab_obj_ext(obj_exts_slab, slab_exts, offs); - if (unlikely(is_codetag_empty(&ext->ref))) + if (unlikely(is_codetag_empty(&ext->ref))) { + put_slab_obj_exts(slab_exts); return; + } /* codetag should be NULL here */ WARN_ON(ext->ref.ct); set_codetag_empty(&ext->ref); + put_slab_obj_exts(slab_exts); } } @@ -2287,30 +2273,28 @@ static inline void free_slab_obj_exts(struct slab *slab) #ifdef CONFIG_MEM_ALLOC_PROFILING -static inline struct slabobj_ext * -prepare_slab_obj_ext_hook(struct kmem_cache *s, gfp_t flags, void *p) +static inline unsigned long +prepare_slab_obj_exts_hook(struct kmem_cache *s, struct slab *slab, + gfp_t flags, void *p) { - struct slab *slab; - unsigned long obj_exts; - - slab = virt_to_slab(p); - obj_exts = slab_obj_exts(slab); - if (!obj_exts && + if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { pr_warn_once("%s, %s: Failed to create slab extension vector!\n", __func__, s->name); - return NULL; + return 0; } - obj_exts = slab_obj_exts(slab); - return slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, p)); + return slab_obj_exts(slab); } + /* Should be called only if mem_alloc_profiling_enabled() */ static noinline void __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t 
flags) { + unsigned long obj_exts; struct slabobj_ext *obj_ext; + struct slab *slab; if (!object) return; @@ -2321,16 +2305,23 @@ __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) if (flags & __GFP_NO_OBJ_EXT) return; - obj_ext = prepare_slab_obj_ext_hook(s, flags, object); + slab = virt_to_slab(object); + obj_exts = prepare_slab_obj_exts_hook(s, slab, flags, object); /* * Currently obj_exts is used only for allocation profiling. * If other users appear then mem_alloc_profiling_enabled() * check should be added before alloc_tag_add(). */ - if (likely(obj_ext)) + if (obj_exts) { + unsigned int obj_idx = obj_to_index(s, slab, object); + + get_slab_obj_exts(obj_exts); + obj_ext = slab_obj_ext(slab, obj_exts, obj_idx); alloc_tag_add(&obj_ext->ref, current->alloc_tag, s->size); - else + put_slab_obj_exts(obj_exts); + } else { alloc_tag_set_inaccurate(current->alloc_tag); + } } static inline void @@ -2356,11 +2347,13 @@ __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p if (!obj_exts) return; + get_slab_obj_exts(obj_exts); for (i = 0; i < objects; i++) { unsigned int off = obj_to_index(s, slab, p[i]); alloc_tag_sub(&slab_obj_ext(slab, obj_exts, off)->ref, s->size); } + put_slab_obj_exts(obj_exts); } static inline void @@ -2427,7 +2420,9 @@ void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, if (likely(!obj_exts)) return; + get_slab_obj_exts(obj_exts); __memcg_slab_free_hook(s, slab, p, objects, obj_exts); + put_slab_obj_exts(obj_exts); } static __fastpath_inline @@ -2477,10 +2472,14 @@ bool memcg_slab_post_charge(void *p, gfp_t flags) /* Ignore already charged objects. 
*/ obj_exts = slab_obj_exts(slab); if (obj_exts) { + get_slab_obj_exts(obj_exts); off = obj_to_index(s, slab, p); obj_ext = slab_obj_ext(slab, obj_exts, off); - if (unlikely(obj_ext->objcg)) + if (unlikely(obj_ext->objcg)) { + put_slab_obj_exts(obj_exts); return true; + } + put_slab_obj_exts(obj_exts); } return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); From 70089d018807506e8a6acd03eede33a0619ec417 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 13 Jan 2026 15:18:43 +0900 Subject: [PATCH 35/39] mm/slab: save memory by allocating slabobj_ext array from leftover The leftover space in a slab is always smaller than s->size, and kmem caches for large objects that are not power-of-two sizes tend to have a greater amount of leftover space per slab. In some cases, the leftover space is larger than the size of the slabobj_ext array for the slab. An excellent example of such a cache is ext4_inode_cache. On my system, the object size is 1136, with a preferred order of 3, 28 objects per slab, and 960 bytes of leftover space per slab. Since the size of the slabobj_ext array is only 224 bytes (w/o mem profiling) or 448 bytes (w/ mem profiling) per slab, the entire array fits within the leftover space. Allocate the slabobj_exts array from this unused space instead of using kcalloc() when it is large enough. The array is allocated from unused space only when creating new slabs, and it doesn't try to utilize unused space if alloc_slab_obj_exts() is called after slab creation because implementing lazy allocation involves more expensive synchronization. The implementation and evaluation of lazy allocation from unused space are left as future work. As pointed out by Vlastimil Babka [1], it could be beneficial when a slab cache without SLAB_ACCOUNT can be created, and some of the allocations from the cache use __GFP_ACCOUNT. For example, xarray does that.
To avoid unnecessary overhead when MEMCG (with SLAB_ACCOUNT) and MEM_ALLOC_PROFILING are not used for the cache, allocate the slabobj_ext array only when either of them is enabled on slab allocation. [ MEMCG=y, MEM_ALLOC_PROFILING=n ] Before patch (creating ~2.64M directories on ext4): Slab: 4747880 kB SReclaimable: 4169652 kB SUnreclaim: 578228 kB After patch (creating ~2.64M directories on ext4): Slab: 4724020 kB SReclaimable: 4169188 kB SUnreclaim: 554832 kB (-22.84 MiB) Enjoy the memory savings! Link: https://lore.kernel.org/linux-mm/48029aab-20ea-4d90-bfd1-255592b2018e@suse.cz [1] Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260113061845.159790-8-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- mm/slub.c | 155 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 150 insertions(+), 5 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b0c50df49cf3..7b6d8df06ad9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -883,6 +883,97 @@ static inline unsigned long get_orig_size(struct kmem_cache *s, void *object) return *(unsigned long *)p; } +#ifdef CONFIG_SLAB_OBJ_EXT + +/* + * Check if memory cgroup or memory allocation profiling is enabled. + * If enabled, SLUB tries to reduce memory overhead of accounting + * slab objects. If neither is enabled when this function is called, + * the optimization is simply skipped to avoid affecting caches that do not + * need slabobj_ext metadata. + * + * However, this may disable optimization when memory cgroup or memory + * allocation profiling is used, but slabs are created too early + * even before those subsystems are initialized. 
+ */ +static inline bool need_slab_obj_exts(struct kmem_cache *s) +{ + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) + return true; + + if (mem_alloc_profiling_enabled()) + return true; + + return false; +} + +static inline unsigned int obj_exts_size_in_slab(struct slab *slab) +{ + return sizeof(struct slabobj_ext) * slab->objects; +} + +static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s, + struct slab *slab) +{ + unsigned long objext_offset; + + objext_offset = s->size * slab->objects; + objext_offset = ALIGN(objext_offset, sizeof(struct slabobj_ext)); + return objext_offset; +} + +static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s, + struct slab *slab) +{ + unsigned long objext_offset = obj_exts_offset_in_slab(s, slab); + unsigned long objext_size = obj_exts_size_in_slab(slab); + + return objext_offset + objext_size <= slab_size(slab); +} + +static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab) +{ + unsigned long obj_exts; + unsigned long start; + unsigned long end; + + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return false; + + start = (unsigned long)slab_address(slab); + end = start + slab_size(slab); + return (obj_exts >= start) && (obj_exts < end); +} +#else +static inline bool need_slab_obj_exts(struct kmem_cache *s) +{ + return false; +} + +static inline unsigned int obj_exts_size_in_slab(struct slab *slab) +{ + return 0; +} + +static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s, + struct slab *slab) +{ + return 0; +} + +static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s, + struct slab *slab) +{ + return false; +} + +static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab) +{ + return false; +} +#endif + #ifdef CONFIG_SLUB_DEBUG /* @@ -1418,7 +1509,15 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab) start = slab_address(slab); length = slab_size(slab); end = start + length; - remainder = length % 
s->size; + + if (obj_exts_in_slab(s, slab)) { + remainder = length; + remainder -= obj_exts_offset_in_slab(s, slab); + remainder -= obj_exts_size_in_slab(slab); + } else { + remainder = length % s->size; + } + if (!remainder) return; @@ -2238,6 +2337,11 @@ static inline void free_slab_obj_exts(struct slab *slab) return; } + if (obj_exts_in_slab(slab->slab_cache, slab)) { + slab->obj_exts = 0; + return; + } + /* * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its * corresponding extension will be NULL. alloc_tag_sub() will throw a @@ -2253,6 +2357,36 @@ static inline void free_slab_obj_exts(struct slab *slab) slab->obj_exts = 0; } +/* + * Try to allocate slabobj_ext array from unused space. + * This function must be called on a freshly allocated slab to prevent + * concurrency problems. + */ +static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab) +{ + void *addr; + unsigned long obj_exts; + + if (!need_slab_obj_exts(s)) + return; + + if (obj_exts_fit_within_slab_leftover(s, slab)) { + addr = slab_address(slab) + obj_exts_offset_in_slab(s, slab); + addr = kasan_reset_tag(addr); + obj_exts = (unsigned long)addr; + + get_slab_obj_exts(obj_exts); + memset(addr, 0, obj_exts_size_in_slab(slab)); + put_slab_obj_exts(obj_exts); + +#ifdef CONFIG_MEMCG + obj_exts |= MEMCG_DATA_OBJEXTS; +#endif + slab->obj_exts = obj_exts; + slab_set_stride(slab, sizeof(struct slabobj_ext)); + } +} + #else /* CONFIG_SLAB_OBJ_EXT */ static inline void init_slab_obj_exts(struct slab *slab) @@ -2269,6 +2403,11 @@ static inline void free_slab_obj_exts(struct slab *slab) { } +static inline void alloc_slab_obj_exts_early(struct kmem_cache *s, + struct slab *slab) +{ +} + #endif /* CONFIG_SLAB_OBJ_EXT */ #ifdef CONFIG_MEM_ALLOC_PROFILING @@ -3265,7 +3404,9 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { - if (memcg_kmem_online() 
&& (s->flags & SLAB_ACCOUNT)) + if (memcg_kmem_online() && + (s->flags & SLAB_ACCOUNT) && + !slab_obj_exts(slab)) alloc_slab_obj_exts(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), @@ -3329,9 +3470,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) slab->objects = oo_objects(oo); slab->inuse = 0; slab->frozen = 0; - init_slab_obj_exts(slab); - - account_slab(slab, oo_order(oo), s, flags); slab->slab_cache = s; @@ -3340,6 +3478,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) start = slab_address(slab); setup_slab_debug(s, slab, start); + init_slab_obj_exts(slab); + /* + * Poison the slab before initializing the slabobj_ext array + * to prevent the array from being overwritten. + */ + alloc_slab_obj_exts_early(s, slab); + account_slab(slab, oo_order(oo), s, flags); shuffle = shuffle_freelist(s, slab); From fab0694646d75d5b03e9898ffb85899fb23320ea Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 13 Jan 2026 15:18:44 +0900 Subject: [PATCH 36/39] mm/slab: move [__]ksize and slab_ksize() to mm/slub.c To access SLUB's internal implementation details beyond cache flags in ksize(), move __ksize(), ksize(), and slab_ksize() to mm/slub.c. 
[vbabka@suse.cz: also make __ksize() static and move its kerneldoc to ksize() ] Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260113061845.159790-9-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 1 - mm/slab.h | 27 -------------- mm/slab_common.c | 61 ------------------------------- mm/slub.c | 86 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 89 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 4554c04a9bd7..93e367b6a5f6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -509,7 +509,6 @@ void * __must_check krealloc_node_align_noprof(const void *objp, size_t new_size void kfree(const void *objp); void kfree_nolock(const void *objp); void kfree_sensitive(const void *objp); -size_t __ksize(const void *objp); DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T)) DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T)) diff --git a/mm/slab.h b/mm/slab.h index d5da2f69e2d5..43b7c5ababb5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -661,33 +661,6 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void kvfree_rcu_cb(struct rcu_head *head); -size_t __ksize(const void *objp); - -static inline size_t slab_ksize(const struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->object_size; -#endif - if (s->flags & SLAB_KASAN) - return s->object_size; - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. 
- */ - if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; -} - static inline unsigned int large_kmalloc_order(const struct page *page) { return page[1].flags.f & 0xff; diff --git a/mm/slab_common.c b/mm/slab_common.c index 027bf64c2e35..b2db8f8f3cf0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1021,43 +1021,6 @@ void __init create_kmalloc_caches(void) 0, SLAB_NO_MERGE, NULL); } -/** - * __ksize -- Report full size of underlying allocation - * @object: pointer to the object - * - * This should only be used internally to query the true size of allocations. - * It is not meant to be a way to discover the usable size of an allocation - * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond - * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, - * and/or FORTIFY_SOURCE. - * - * Return: size of the actual memory used by @object in bytes - */ -size_t __ksize(const void *object) -{ - const struct page *page; - const struct slab *slab; - - if (unlikely(object == ZERO_SIZE_PTR)) - return 0; - - page = virt_to_page(object); - - if (unlikely(PageLargeKmalloc(page))) - return large_kmalloc_size(page); - - slab = page_slab(page); - /* Delete this after we're sure there are no users */ - if (WARN_ON(!slab)) - return page_size(page); - -#ifdef CONFIG_SLUB_DEBUG - skip_orig_size_check(slab->slab_cache, object); -#endif - - return slab_ksize(slab->slab_cache); -} - gfp_t kmalloc_fix_flags(gfp_t flags) { gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; @@ -1273,30 +1236,6 @@ void kfree_sensitive(const void *p) } EXPORT_SYMBOL(kfree_sensitive); -size_t ksize(const void *objp) -{ - /* - * We need to first check that the pointer to the object is valid. 
- * The KASAN report printed from ksize() is more useful, then when - * it's printed later when the behaviour could be undefined due to - * a potential use-after-free or double-free. - * - * We use kasan_check_byte(), which is supported for the hardware - * tag-based KASAN mode, unlike kasan_check_read/write(). - * - * If the pointed to memory is invalid, we return 0 to avoid users of - * ksize() writing to and potentially corrupting the memory region. - * - * We want to perform the check before __ksize(), to avoid potentially - * crashing in __ksize() due to accessing invalid metadata. - */ - if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) - return 0; - - return kfence_ksize(objp) ?: __ksize(objp); -} -EXPORT_SYMBOL(ksize); - #ifdef CONFIG_BPF_SYSCALL #include diff --git a/mm/slub.c b/mm/slub.c index 7b6d8df06ad9..782685433580 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -7028,6 +7028,92 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +#endif + if (s->flags & SLAB_KASAN) + return s->object_size; + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. 
+ */ + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +} + +static size_t __ksize(const void *object) +{ + const struct page *page; + const struct slab *slab; + + if (unlikely(object == ZERO_SIZE_PTR)) + return 0; + + page = virt_to_page(object); + + if (unlikely(PageLargeKmalloc(page))) + return large_kmalloc_size(page); + + slab = page_slab(page); + /* Delete this after we're sure there are no users */ + if (WARN_ON(!slab)) + return page_size(page); + +#ifdef CONFIG_SLUB_DEBUG + skip_orig_size_check(slab->slab_cache, object); +#endif + + return slab_ksize(slab->slab_cache); +} + +/** + * ksize -- Report full size of underlying allocation + * @objp: pointer to the object + * + * This should only be used internally to query the true size of allocations. + * It is not meant to be a way to discover the usable size of an allocation + * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond + * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, + * and/or FORTIFY_SOURCE. + * + * Return: size of the actual memory used by @objp in bytes + */ +size_t ksize(const void *objp) +{ + /* + * We need to first check that the pointer to the object is valid. + * The KASAN report printed from ksize() is more useful, then when + * it's printed later when the behaviour could be undefined due to + * a potential use-after-free or double-free. + * + * We use kasan_check_byte(), which is supported for the hardware + * tag-based KASAN mode, unlike kasan_check_read/write(). + * + * If the pointed to memory is invalid, we return 0 to avoid users of + * ksize() writing to and potentially corrupting the memory region. + * + * We want to perform the check before __ksize(), to avoid potentially + * crashing in __ksize() due to accessing invalid metadata. 
+ */ + if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) + return 0; + + return kfence_ksize(objp) ?: __ksize(objp); +} +EXPORT_SYMBOL(ksize); + static void free_large_kmalloc(struct page *page, void *object) { unsigned int order = compound_order(page); From a77d6d338685025cbf84f6e3abd92a8e59a4d894 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 13 Jan 2026 15:18:45 +0900 Subject: [PATCH 37/39] mm/slab: place slabobj_ext metadata in unused space within s->size When a cache has a high s->align value and s->object_size is not aligned to it, each object ends up with some unused space because of alignment. If this wasted space is big enough, we can use it to store the slabobj_ext metadata instead of wasting it. On my system, this happens with caches like kmem_cache, mm_struct, pid, task_struct, sighand_cache, xfs_inode, and others. To place the slabobj_ext metadata within each object, the existing slab_obj_ext() logic can still be used by setting: - slab->obj_exts = slab_address(slab) + (slabobj_ext offset) - stride = s->size slab_obj_ext() doesn't need to know where the metadata is stored, so this method works without adding extra overhead to slab_obj_ext(). A good example benefiting from this optimization is xfs_inode (object_size: 992, align: 64). To measure memory savings, 2 million files were created on XFS. [ MEMCG=y, MEM_ALLOC_PROFILING=n ] Before patch (creating ~2.64M directories on xfs): Slab: 5175976 kB SReclaimable: 3837524 kB SUnreclaim: 1338452 kB After patch (creating ~2.64M directories on xfs): Slab: 5152912 kB SReclaimable: 3838568 kB SUnreclaim: 1314344 kB (-23.54 MiB) Enjoy the memory savings!
Suggested-by: Vlastimil Babka Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260113061845.159790-10-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 9 +++++ mm/slab_common.c | 7 +++- mm/slub.c | 96 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 101 insertions(+), 11 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 93e367b6a5f6..34db237319c1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -59,6 +59,9 @@ enum _slab_flag_bits { _SLAB_CMPXCHG_DOUBLE, #ifdef CONFIG_SLAB_OBJ_EXT _SLAB_NO_OBJ_EXT, +#endif +#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) + _SLAB_OBJ_EXT_IN_OBJ, #endif _SLAB_FLAGS_LAST_BIT }; @@ -244,6 +247,12 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif +#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) +#define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ) +#else +#define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_UNUSED +#endif + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * diff --git a/mm/slab_common.c b/mm/slab_common.c index b2db8f8f3cf0..886d02fa94fb 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -43,10 +43,13 @@ DEFINE_MUTEX(slab_mutex); struct kmem_cache *kmem_cache; /* - * Set of flags that will prevent slab merging + * Set of flags that will prevent slab merging. + * Any flag that adds per-object metadata should be included, + * since slab merging can update s->inuse that affects the metadata layout. 
*/ #define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \ - SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE) + SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \ + SLAB_OBJ_EXT_IN_OBJ) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_ACCOUNT) diff --git a/mm/slub.c b/mm/slub.c index 782685433580..0805c09d4b55 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -972,6 +972,46 @@ static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab) { return false; } + +#endif + +#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) +static bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab) +{ + /* + * Note we cannot rely on the SLAB_OBJ_EXT_IN_OBJ flag here and need to + * check the stride. A cache can have SLAB_OBJ_EXT_IN_OBJ set, but + * allocations within_slab_leftover are preferred. And those may be + * possible or not depending on the particular slab's size. + */ + return obj_exts_in_slab(s, slab) && + (slab_get_stride(slab) == s->size); +} + +static unsigned int obj_exts_offset_in_object(struct kmem_cache *s) +{ + unsigned int offset = get_info_end(s); + + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) + offset += sizeof(struct track) * 2; + + if (slub_debug_orig_size(s)) + offset += sizeof(unsigned long); + + offset += kasan_metadata_size(s, false); + + return offset; +} +#else +static inline bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab) +{ + return false; +} + +static inline unsigned int obj_exts_offset_in_object(struct kmem_cache *s) +{ + return 0; +} #endif #ifdef CONFIG_SLUB_DEBUG @@ -1272,6 +1312,9 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) off += kasan_metadata_size(s, false); + if (obj_exts_in_object(s, slab)) + off += sizeof(struct slabobj_ext); + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ print_section(KERN_ERR, "Padding ", p + off, @@ -1453,8 +1496,11 @@ skip_bug_print: * between 
metadata and the next object, independent of alignment. * - Filled with 0x5a (POISON_INUSE) when SLAB_POISON is set. * [Final alignment padding] - * - Any bytes added by ALIGN(size, s->align) to reach s->size. - * - Filled with 0x5a (POISON_INUSE) when SLAB_POISON is set. + * - Bytes added by ALIGN(size, s->align) to reach s->size. + * - When the padding is large enough, it can be used to store + * struct slabobj_ext for accounting metadata (obj_exts_in_object()). + * - The remaining bytes (if any) are filled with 0x5a (POISON_INUSE) + * when SLAB_POISON is set. * * Notes: * - Redzones are filled by init_object() with SLUB_RED_ACTIVE/INACTIVE. @@ -1485,6 +1531,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) off += kasan_metadata_size(s, false); + if (obj_exts_in_object(s, slab)) + off += sizeof(struct slabobj_ext); + if (size_from_object(s) == off) return 1; @@ -1510,7 +1559,7 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab) length = slab_size(slab); end = start + length; - if (obj_exts_in_slab(s, slab)) { + if (obj_exts_in_slab(s, slab) && !obj_exts_in_object(s, slab)) { remainder = length; remainder -= obj_exts_offset_in_slab(s, slab); remainder -= obj_exts_size_in_slab(slab); @@ -2384,6 +2433,24 @@ static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab) #endif slab->obj_exts = obj_exts; slab_set_stride(slab, sizeof(struct slabobj_ext)); + } else if (s->flags & SLAB_OBJ_EXT_IN_OBJ) { + unsigned int offset = obj_exts_offset_in_object(s); + + obj_exts = (unsigned long)slab_address(slab); + obj_exts += s->red_left_pad; + obj_exts += offset; + + get_slab_obj_exts(obj_exts); + for_each_object(addr, s, slab_address(slab), slab->objects) + memset(kasan_reset_tag(addr) + offset, 0, + sizeof(struct slabobj_ext)); + put_slab_obj_exts(obj_exts); + +#ifdef CONFIG_MEMCG + obj_exts |= MEMCG_DATA_OBJEXTS; +#endif + slab->obj_exts = obj_exts; + slab_set_stride(slab, s->size); } } @@ -7028,8 +7095,10 @@ void 
kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); -static inline size_t slab_ksize(const struct kmem_cache *s) +static inline size_t slab_ksize(struct slab *slab) { + struct kmem_cache *s = slab->slab_cache; + #ifdef CONFIG_SLUB_DEBUG /* * Debugging requires use of the padding between object @@ -7042,11 +7111,13 @@ static inline size_t slab_ksize(const struct kmem_cache *s) return s->object_size; /* * If we have the need to store the freelist pointer - * back there or track user information then we can + * or any other metadata back there then we can * only use the space before that information. */ if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) return s->inuse; + else if (obj_exts_in_object(s, slab)) + return s->inuse; /* * Else we can use all the padding etc for the allocation */ @@ -7055,8 +7126,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s) static size_t __ksize(const void *object) { - const struct page *page; - const struct slab *slab; + struct page *page; + struct slab *slab; if (unlikely(object == ZERO_SIZE_PTR)) return 0; @@ -7075,7 +7146,7 @@ static size_t __ksize(const void *object) skip_orig_size_check(slab->slab_cache, object); #endif - return slab_ksize(slab->slab_cache); + return slab_ksize(slab); } /** @@ -8199,6 +8270,7 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; + unsigned int aligned_size; unsigned int order; /* @@ -8308,7 +8380,13 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) * offset 0. In order to align the objects we have to simply size * each object to conform to the alignment. 
*/ - size = ALIGN(size, s->align); + aligned_size = ALIGN(size, s->align); +#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) + if (aligned_size - size >= sizeof(struct slabobj_ext)) + s->flags |= SLAB_OBJ_EXT_IN_OBJ; +#endif + size = aligned_size; + s->size = size; s->reciprocal_size = reciprocal_value(size); order = calculate_order(size); From 2f35fee943435b5b1a3e403c7fb9bd19727754d8 Mon Sep 17 00:00:00 2001 From: Harry Yoo Date: Tue, 27 Jan 2026 19:31:51 +0900 Subject: [PATCH 38/39] mm/slab: only allow SLAB_OBJ_EXT_IN_OBJ for unmergeable caches While SLAB_OBJ_EXT_IN_OBJ reduces the memory overhead of accounting slab objects, it prevents slab merging because merging can change the metadata layout. As pointed out by Vlastimil Babka, disabling merging solely for this memory optimization may not be a net win, because disabling slab merging tends to increase overall memory usage. Restrict SLAB_OBJ_EXT_IN_OBJ to caches that are already unmergeable for other reasons (e.g., those with constructors or SLAB_TYPESAFE_BY_RCU).
Suggested-by: Vlastimil Babka Signed-off-by: Harry Yoo Link: https://patch.msgid.link/20260127103151.21883-3-harry.yoo@oracle.com Signed-off-by: Vlastimil Babka --- mm/slab.h | 1 + mm/slab_common.c | 3 +-- mm/slub.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 43b7c5ababb5..3f49666e943c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -411,6 +411,7 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); +bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags); slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name); diff --git a/mm/slab_common.c b/mm/slab_common.c index 886d02fa94fb..094afa2792d0 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -174,8 +174,7 @@ int slab_unmergeable(struct kmem_cache *s) return 0; } -static bool slab_args_unmergeable(struct kmem_cache_args *args, - slab_flags_t flags) +bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags) { if (slab_nomerge) return true; diff --git a/mm/slub.c b/mm/slub.c index 0805c09d4b55..18ac9460f9e9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -8382,7 +8382,8 @@ static int calculate_sizes(struct kmem_cache_args *args, struct kmem_cache *s) */ aligned_size = ALIGN(size, s->align); #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) - if (aligned_size - size >= sizeof(struct slabobj_ext)) + if (slab_args_unmergeable(args, s->flags) && + (aligned_size - size >= sizeof(struct slabobj_ext))) s->flags |= SLAB_OBJ_EXT_IN_OBJ; #endif size = aligned_size; From 98e99fc4ad4b30dd28c09ba19686ec583af345b4 Mon Sep 17 00:00:00 2001 From: Hao Li Date: Thu, 5 Feb 2026 20:07:23 +0800 Subject: [PATCH 39/39] slub: let need_slab_obj_exts() return false if SLAB_NO_OBJ_EXT is set SLAB_NO_OBJ_EXT is set for boot caches, but need_slab_obj_exts() doesn't check this flag. 
We should return false unconditionally when SLAB_NO_OBJ_EXT is set. Signed-off-by: Hao Li Acked-by: Harry Yoo Link: https://patch.msgid.link/20260205120709.425719-1-hao.li@linux.dev Signed-off-by: Vlastimil Babka --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 18ac9460f9e9..6fac2b123b42 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -898,6 +898,9 @@ static inline unsigned long get_orig_size(struct kmem_cache *s, void *object) */ static inline bool need_slab_obj_exts(struct kmem_cache *s) { + if (s->flags & SLAB_NO_OBJ_EXT) + return false; + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) return true;