mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache destruction
Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab
caches when a cache is destroyed. This is unnecessary; only the RCU
sheaves belonging to the cache being destroyed need to be flushed.
As suggested by Vlastimil Babka, introduce a weaker form of
kvfree_rcu_barrier() that operates on a specific slab cache.
Factor out flush_rcu_sheaves_on_cache() from flush_all_rcu_sheaves() and
call it from flush_all_rcu_sheaves() and kvfree_rcu_barrier_on_cache().
Call kvfree_rcu_barrier_on_cache() instead of kvfree_rcu_barrier() on
cache destruction.
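For illustration only, a minimal caller-side sketch (foo_cache and foo_exit()
are hypothetical names, not part of this patch): a module that frees its
objects with kfree_rcu() now only has its own cache's RCU sheaves flushed on
teardown, while the batched kvfree_rcu() work is still drained globally:

  #include <linux/module.h>
  #include <linux/slab.h>

  static struct kmem_cache *foo_cache;	/* hypothetical cache owned by the module */

  static void __exit foo_exit(void)
  {
  	/*
  	 * kmem_cache_destroy() now calls kvfree_rcu_barrier_on_cache(foo_cache)
  	 * instead of the global kvfree_rcu_barrier(), so RCU sheaves of
  	 * unrelated caches are no longer flushed on this path.
  	 */
  	kmem_cache_destroy(foo_cache);
  }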
The performance benefit was evaluated on a 12-core, 24-thread AMD Ryzen
5900X machine (1 socket) by loading the slub_kunit module.
Before:
Total calls: 19
Average latency (us): 18127
Total time (us): 344414
After:
Total calls: 19
Average latency (us): 10066
Total time (us): 191264
Two performance regressions have been reported:
- the stress module loader test's runtime increases by 50-60% (Daniel)
- an internal graphics test's runtime on Tegra234 increases by 35% (Jon)
Both are fixed by this change.
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Fixes: ec66e0d599 ("slab: add sheaf support for batching kfree_rcu() operations")
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz
Reported-and-tested-by: Daniel Gomez <da.gomez@samsung.com>
Closes: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org
Reported-and-tested-by: Jon Hunter <jonathanh@nvidia.com>
Closes: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
Link: https://patch.msgid.link/20251207154148.117723-1-harry.yoo@oracle.com
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
commit 0f35040de5 (parent b687034b1a, branch master)
@@ -1150,10 +1150,17 @@ static inline void kvfree_rcu_barrier(void)
 	rcu_barrier();
 }
 
+static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+{
+	rcu_barrier();
+}
+
 static inline void kfree_rcu_scheduler_running(void) { }
 #else
 void kvfree_rcu_barrier(void);
 
+void kvfree_rcu_barrier_on_cache(struct kmem_cache *s);
+
 void kfree_rcu_scheduler_running(void);
 #endif
 
@@ -422,6 +422,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s)
 
 bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj);
 void flush_all_rcu_sheaves(void);
+void flush_rcu_sheaves_on_cache(struct kmem_cache *s);
 
 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_PANIC | \
@@ -492,7 +492,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 		return;
 
 	/* in-flight kfree_rcu()'s may include objects from our cache */
-	kvfree_rcu_barrier();
+	kvfree_rcu_barrier_on_cache(s);
 
 	if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) &&
 	    (s->flags & SLAB_TYPESAFE_BY_RCU)) {
@@ -2038,25 +2038,13 @@ unlock_return:
 }
 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
-/**
- * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
- *
- * Note that a single argument of kvfree_rcu() call has a slow path that
- * triggers synchronize_rcu() following by freeing a pointer. It is done
- * before the return from the function. Therefore for any single-argument
- * call that will result in a kfree() to a cache that is to be destroyed
- * during module exit, it is developer's responsibility to ensure that all
- * such calls have returned before the call to kmem_cache_destroy().
- */
-void kvfree_rcu_barrier(void)
+static inline void __kvfree_rcu_barrier(void)
 {
 	struct kfree_rcu_cpu_work *krwp;
 	struct kfree_rcu_cpu *krcp;
 	bool queued;
 	int i, cpu;
 
-	flush_all_rcu_sheaves();
-
 	/*
 	 * Firstly we detach objects and queue them over an RCU-batch
 	 * for all CPUs. Finally queued works are flushed for each CPU.
@@ -2118,8 +2106,43 @@ void kvfree_rcu_barrier(void)
 		}
 	}
 }
+
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that a single argument of kvfree_rcu() call has a slow path that
+ * triggers synchronize_rcu() following by freeing a pointer. It is done
+ * before the return from the function. Therefore for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is developer's responsibility to ensure that all
+ * such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+	flush_all_rcu_sheaves();
+	__kvfree_rcu_barrier();
+}
 EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
 
+/**
+ * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a
+ * specific slab cache.
+ * @s: slab cache to wait for
+ *
+ * See the description of kvfree_rcu_barrier() for details.
+ */
+void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+{
+	if (s->cpu_sheaves)
+		flush_rcu_sheaves_on_cache(s);
+	/*
+	 * TODO: Introduce a version of __kvfree_rcu_barrier() that works
+	 * on a specific slab cache.
+	 */
+	__kvfree_rcu_barrier();
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache);
+
 static unsigned long
 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
@@ -2215,4 +2238,3 @@ void __init kvfree_rcu_init(void)
 }
 
 #endif /* CONFIG_KVFREE_RCU_BATCHED */
-
mm/slub.c (57 changed lines)
@@ -4122,42 +4122,47 @@ static void flush_rcu_sheaf(struct work_struct *w)
 
 
 /* needed for kvfree_rcu_barrier() */
-void flush_all_rcu_sheaves(void)
+void flush_rcu_sheaves_on_cache(struct kmem_cache *s)
 {
 	struct slub_flush_work *sfw;
-	struct kmem_cache *s;
 	unsigned int cpu;
 
+	mutex_lock(&flush_lock);
+
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
+
+		/*
+		 * we don't check if rcu_free sheaf exists - racing
+		 * __kfree_rcu_sheaf() might have just removed it.
+		 * by executing flush_rcu_sheaf() on the cpu we make
+		 * sure the __kfree_rcu_sheaf() finished its call_rcu()
+		 */
+
+		INIT_WORK(&sfw->work, flush_rcu_sheaf);
+		sfw->s = s;
+		queue_work_on(cpu, flushwq, &sfw->work);
+	}
+
+	for_each_online_cpu(cpu) {
+		sfw = &per_cpu(slub_flush, cpu);
+		flush_work(&sfw->work);
+	}
+
+	mutex_unlock(&flush_lock);
+}
+
+void flush_all_rcu_sheaves(void)
+{
+	struct kmem_cache *s;
+
 	cpus_read_lock();
 	mutex_lock(&slab_mutex);
 
 	list_for_each_entry(s, &slab_caches, list) {
 		if (!s->cpu_sheaves)
 			continue;
 
-		mutex_lock(&flush_lock);
-
-		for_each_online_cpu(cpu) {
-			sfw = &per_cpu(slub_flush, cpu);
-
-			/*
-			 * we don't check if rcu_free sheaf exists - racing
-			 * __kfree_rcu_sheaf() might have just removed it.
-			 * by executing flush_rcu_sheaf() on the cpu we make
-			 * sure the __kfree_rcu_sheaf() finished its call_rcu()
-			 */
-
-			INIT_WORK(&sfw->work, flush_rcu_sheaf);
-			sfw->s = s;
-			queue_work_on(cpu, flushwq, &sfw->work);
-		}
-
-		for_each_online_cpu(cpu) {
-			sfw = &per_cpu(slub_flush, cpu);
-			flush_work(&sfw->work);
-		}
-
-		mutex_unlock(&flush_lock);
+		flush_rcu_sheaves_on_cache(s);
 	}
 
 	mutex_unlock(&slab_mutex);