bpf_try_alloc_pages

Merge tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf try_alloc_pages() support from Alexei Starovoitov:
 "The pull includes work from Sebastian, Vlastimil and myself with a lot
  of help from Michal and Shakeel.

  This is a first step towards making kmalloc reentrant so we can get
  rid of slab wrappers: bpf_mem_alloc, kretprobe's objpool, etc. These
  patches make the page allocator safe to call from any context.

  Vlastimil kicked off this effort at LSFMM 2024:

    https://lwn.net/Articles/974138/

  and we continued at LSFMM 2025:

    https://lore.kernel.org/all/CAADnVQKfkGxudNUkcPJgwe3nTZ=xohnRshx9kLZBTmR_E1DFEg@mail.gmail.com/

  Why:

  SLAB wrappers bind memory to a particular subsystem, making it
  unavailable to the rest of the kernel. Some BPF maps in production
  consume gigabytes of preallocated memory. Top 5 in Meta: 1.5G, 1.2G,
  1.1G, 300M, 200M. Once we have a kmalloc that works in any context,
  BPF map preallocation won't be necessary.

  How:

  The synchronous kmalloc/page-alloc stack has multiple stages going
  from fast to slow: cmpxchg16 -> slab_alloc -> new_slab -> alloc_pages
  -> rmqueue_pcplist -> __rmqueue, where rmqueue_pcplist was already
  relying on a trylock.

  This set changes rmqueue_bulk()/rmqueue_buddy() to attempt a trylock
  and fail the allocation if alloc_flags & ALLOC_TRYLOCK is set
  (sketched below). It then wraps this functionality into the
  try_alloc_pages() helper and makes sure the logic is sane under
  PREEMPT_RT.

  End result: try_alloc_pages()/free_pages_nolock() are safe to call
  from any context (usage sketched below).

  A try_kmalloc() for any context, using a similar trylock approach,
  will follow. It will use try_alloc_pages() when slab needs a new
  page. Though such a try_kmalloc()/page_alloc() is an opportunistic
  allocator, this design ensures that the probability of successfully
  allocating small objects (up to one page in size) is high.

  Even before we have try_kmalloc(), we already use try_alloc_pages()
  in the BPF arena implementation, and it's going to be used more
  extensively in BPF"

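A minimal usage sketch of the resulting API, based on the signatures added
in the include/linux/gfp.h hunk below (try_alloc_pages() hands back a
zeroed, memcg-accounted page or NULL; free_pages_nolock() never spins and
defers the free to a per-zone llist when the zone lock cannot be taken):

	struct page *page;

	/* Safe to call from atomic, hard IRQ and NMI context; may fail easily. */
	page = try_alloc_pages(NUMA_NO_NODE, 0);
	if (!page)
		return -ENOMEM;	/* opportunistic: callers must tolerate failure */

	/* ... use page_address(page) ... */

	free_pages_nolock(page, 0);
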
* tag 'bpf_try_alloc_pages' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next:
  mm: Fix the flipped condition in gfpflags_allow_spinning()
  bpf: Use try_alloc_pages() to allocate pages for bpf needs.
  mm, bpf: Use memcg in try_alloc_pages().
  memcg: Use trylock to access memcg stock_lock.
  mm, bpf: Introduce free_pages_nolock()
  mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation
  locking/local_lock: Introduce localtry_lock_t
Linus Torvalds 2025-03-30 13:45:28 -07:00
commit aa918db707
13 changed files with 511 additions and 44 deletions

include/linux/bpf.h

@ -2385,7 +2385,7 @@ int generic_map_delete_batch(struct bpf_map *map,
struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);
int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long nr_pages, struct page **page_array);
#ifdef CONFIG_MEMCG
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,

include/linux/gfp.h

@ -39,6 +39,25 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
}
static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
{
/*
* !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
* !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
* All GFP_* flags including GFP_NOWAIT use one or both flags.
* try_alloc_pages() is the only API that doesn't specify either flag.
*
* This is stronger than GFP_NOWAIT or GFP_ATOMIC because
* those are guaranteed to never block on a sleeping lock.
* Here we are enforcing that the allocation doesn't ever spin
* on any locks (i.e. only trylocks). There is no high level
* GFP_$FOO flag for this use in try_alloc_pages() as the
* regular page allocator doesn't fully support this
* allocation mode.
*/
return !!(gfp_flags & __GFP_RECLAIM);
}
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
@ -335,6 +354,9 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
}
#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))
struct page *try_alloc_pages_noprof(int nid, unsigned int order);
#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))
extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__))
@ -357,6 +379,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
__get_free_pages((gfp_mask) | GFP_DMA, (order))
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages_nolock(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);
#define __free_page(page) __free_pages((page), 0)

include/linux/local_lock.h

@ -51,6 +51,76 @@
#define local_unlock_irqrestore(lock, flags) \
__local_unlock_irqrestore(lock, flags)
/**
* localtry_lock_init - Runtime initialize a lock instance
*/
#define localtry_lock_init(lock) __localtry_lock_init(lock)
/**
* localtry_lock - Acquire a per CPU local lock
* @lock: The lock variable
*/
#define localtry_lock(lock) __localtry_lock(lock)
/**
* localtry_lock_irq - Acquire a per CPU local lock and disable interrupts
* @lock: The lock variable
*/
#define localtry_lock_irq(lock) __localtry_lock_irq(lock)
/**
* localtry_lock_irqsave - Acquire a per CPU local lock, save and disable
* interrupts
* @lock: The lock variable
* @flags: Storage for interrupt flags
*/
#define localtry_lock_irqsave(lock, flags) \
__localtry_lock_irqsave(lock, flags)
/**
* localtry_trylock - Try to acquire a per CPU local lock.
* @lock: The lock variable
*
* The function can be used in any context such as NMI or HARDIRQ. Due to
* locking constraints it will _always_ fail to acquire the lock in NMI or
* HARDIRQ context on PREEMPT_RT.
*/
#define localtry_trylock(lock) __localtry_trylock(lock)
/**
* localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable
* interrupts if acquired
* @lock: The lock variable
* @flags: Storage for interrupt flags
*
* The function can be used in any context such as NMI or HARDIRQ. Due to
* locking constraints it will _always_ fail to acquire the lock in NMI or
* HARDIRQ context on PREEMPT_RT.
*/
#define localtry_trylock_irqsave(lock, flags) \
__localtry_trylock_irqsave(lock, flags)
/**
* localtry_unlock - Release a per CPU local lock
* @lock: The lock variable
*/
#define localtry_unlock(lock) __localtry_unlock(lock)
/**
* localtry_unlock_irq - Release a per CPU local lock and enable interrupts
* @lock: The lock variable
*/
#define localtry_unlock_irq(lock) __localtry_unlock_irq(lock)
/**
* localtry_unlock_irqrestore - Release a per CPU local lock and restore
* interrupt flags
* @lock: The lock variable
* @flags: Interrupt flags to restore
*/
#define localtry_unlock_irqrestore(lock, flags) \
__localtry_unlock_irqrestore(lock, flags)
DEFINE_GUARD(local_lock, local_lock_t __percpu*,
local_lock(_T),
local_unlock(_T))

include/linux/local_lock_internal.h

@ -15,6 +15,11 @@ typedef struct {
#endif
} local_lock_t;
typedef struct {
local_lock_t llock;
unsigned int acquired;
} localtry_lock_t;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define LOCAL_LOCK_DEBUG_INIT(lockname) \
.dep_map = { \
@ -31,6 +36,13 @@ static inline void local_lock_acquire(local_lock_t *l)
l->owner = current;
}
static inline void local_trylock_acquire(local_lock_t *l)
{
lock_map_acquire_try(&l->dep_map);
DEBUG_LOCKS_WARN_ON(l->owner);
l->owner = current;
}
static inline void local_lock_release(local_lock_t *l)
{
DEBUG_LOCKS_WARN_ON(l->owner != current);
@ -45,11 +57,13 @@ static inline void local_lock_debug_init(local_lock_t *l)
#else /* CONFIG_DEBUG_LOCK_ALLOC */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_trylock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */
#define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) }
#define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }}
#define __local_lock_init(lock) \
do { \
@ -118,6 +132,104 @@ do { \
#define __local_unlock_nested_bh(lock) \
local_lock_release(this_cpu_ptr(lock))
/* localtry_lock_t variants */
#define __localtry_lock_init(lock) \
do { \
__local_lock_init(&(lock)->llock); \
WRITE_ONCE((lock)->acquired, 0); \
} while (0)
#define __localtry_lock(lock) \
do { \
localtry_lock_t *lt; \
preempt_disable(); \
lt = this_cpu_ptr(lock); \
local_lock_acquire(&lt->llock); \
WRITE_ONCE(lt->acquired, 1); \
} while (0)
#define __localtry_lock_irq(lock) \
do { \
localtry_lock_t *lt; \
local_irq_disable(); \
lt = this_cpu_ptr(lock); \
local_lock_acquire(&lt->llock); \
WRITE_ONCE(lt->acquired, 1); \
} while (0)
#define __localtry_lock_irqsave(lock, flags) \
do { \
localtry_lock_t *lt; \
local_irq_save(flags); \
lt = this_cpu_ptr(lock); \
local_lock_acquire(&lt->llock); \
WRITE_ONCE(lt->acquired, 1); \
} while (0)
#define __localtry_trylock(lock) \
({ \
localtry_lock_t *lt; \
bool _ret; \
\
preempt_disable(); \
lt = this_cpu_ptr(lock); \
if (!READ_ONCE(lt->acquired)) { \
WRITE_ONCE(lt->acquired, 1); \
local_trylock_acquire(&lt->llock); \
_ret = true; \
} else { \
_ret = false; \
preempt_enable(); \
} \
_ret; \
})
#define __localtry_trylock_irqsave(lock, flags) \
({ \
localtry_lock_t *lt; \
bool _ret; \
\
local_irq_save(flags); \
lt = this_cpu_ptr(lock); \
if (!READ_ONCE(lt->acquired)) { \
WRITE_ONCE(lt->acquired, 1); \
local_trylock_acquire(&lt->llock); \
_ret = true; \
} else { \
_ret = false; \
local_irq_restore(flags); \
} \
_ret; \
})
#define __localtry_unlock(lock) \
do { \
localtry_lock_t *lt; \
lt = this_cpu_ptr(lock); \
WRITE_ONCE(lt->acquired, 0); \
local_lock_release(&lt->llock); \
preempt_enable(); \
} while (0)
#define __localtry_unlock_irq(lock) \
do { \
localtry_lock_t *lt; \
lt = this_cpu_ptr(lock); \
WRITE_ONCE(lt->acquired, 0); \
local_lock_release(&lt->llock); \
local_irq_enable(); \
} while (0)
#define __localtry_unlock_irqrestore(lock, flags) \
do { \
localtry_lock_t *lt; \
lt = this_cpu_ptr(lock); \
WRITE_ONCE(lt->acquired, 0); \
local_lock_release(&lt->llock); \
local_irq_restore(flags); \
} while (0)
#else /* !CONFIG_PREEMPT_RT */
/*
@ -125,8 +237,10 @@ do { \
* critical section while staying preemptible.
*/
typedef spinlock_t local_lock_t;
typedef spinlock_t localtry_lock_t;
#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname)
#define __local_lock_init(l) \
do { \
@ -169,4 +283,36 @@ do { \
spin_unlock(this_cpu_ptr((lock))); \
} while (0)
/* localtry_lock_t variants */
#define __localtry_lock_init(lock) __local_lock_init(lock)
#define __localtry_lock(lock) __local_lock(lock)
#define __localtry_lock_irq(lock) __local_lock(lock)
#define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags)
#define __localtry_unlock(lock) __local_unlock(lock)
#define __localtry_unlock_irq(lock) __local_unlock(lock)
#define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags)
#define __localtry_trylock(lock) \
({ \
int __locked; \
\
if (in_nmi() | in_hardirq()) { \
__locked = 0; \
} else { \
migrate_disable(); \
__locked = spin_trylock(this_cpu_ptr((lock))); \
if (!__locked) \
migrate_enable(); \
} \
__locked; \
})
#define __localtry_trylock_irqsave(lock, flags) \
({ \
typecheck(unsigned long, flags); \
flags = 0; \
__localtry_trylock(lock); \
})
#endif /* CONFIG_PREEMPT_RT */

include/linux/mm_types.h

@ -99,6 +99,10 @@ struct page {
/* Or, free page */
struct list_head buddy_list;
struct list_head pcp_list;
struct {
struct llist_node pcp_llist;
unsigned int order;
};
};
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;

include/linux/mmzone.h

@ -972,6 +972,9 @@ struct zone {
/* Primarily protects free_area */
spinlock_t lock;
/* Pages to be freed when next trylock succeeds */
struct llist_head trylock_free_pages;
/* Write-intensive fields used by compaction and vmstats. */
CACHELINE_PADDING(_pad2_);

kernel/bpf/arena.c

@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
return VM_FAULT_SIGSEGV;
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
return VM_FAULT_SIGSEGV;
@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
if (ret)
goto out_free_pages;
ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO,
node_id, page_cnt, pages);
ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
if (ret)
goto out;

kernel/bpf/syscall.c

@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map)
}
#endif
int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
static bool can_alloc_pages(void)
{
return preempt_count() == 0 && !irqs_disabled() &&
!IS_ENABLED(CONFIG_PREEMPT_RT);
}
static struct page *__bpf_alloc_page(int nid)
{
if (!can_alloc_pages())
return try_alloc_pages(nid, 0);
return alloc_pages_node(nid,
GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT
| __GFP_NOWARN,
0);
}
int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long nr_pages, struct page **pages)
{
unsigned long i, j;
@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid,
old_memcg = set_active_memcg(memcg);
#endif
for (i = 0; i < nr_pages; i++) {
pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0);
pg = __bpf_alloc_page(nid);
if (pg) {
pages[i] = pg;
continue;
}
for (j = 0; j < i; j++)
__free_page(pages[j]);
free_pages_nolock(pages[j], 0);
ret = -ENOMEM;
break;
}

lib/stackdepot.c

@ -591,7 +591,8 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
depot_stack_handle_t handle = 0;
struct page *page = NULL;
void *prealloc = NULL;
bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC;
bool allow_spin = gfpflags_allow_spinning(alloc_flags);
bool can_alloc = (depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC) && allow_spin;
unsigned long flags;
u32 hash;
@ -630,7 +631,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
prealloc = page_address(page);
}
if (in_nmi()) {
if (in_nmi() || !allow_spin) {
/* We can never allocate in NMI context. */
WARN_ON_ONCE(can_alloc);
/* Best effort; bail if we fail to take the lock. */
@ -671,7 +672,10 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
exit:
if (prealloc) {
/* Stack depot didn't use this memory, free it. */
free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
if (!allow_spin)
free_pages_nolock(virt_to_page(prealloc), DEPOT_POOL_ORDER);
else
free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
}
if (found)
handle = found->handle.handle;

mm/internal.h

@ -1198,6 +1198,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_NOFRAGMENT 0x0
#endif
#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */
#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */
#define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
/* Flags that allow allocations below the min watermark. */

mm/memcontrol.c

@ -1739,7 +1739,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
struct memcg_stock_pcp {
local_lock_t stock_lock;
localtry_lock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@ -1754,7 +1754,7 @@ struct memcg_stock_pcp {
#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
.stock_lock = INIT_LOCAL_LOCK(stock_lock),
.stock_lock = INIT_LOCALTRY_LOCK(stock_lock),
};
static DEFINE_MUTEX(percpu_charge_mutex);
@ -1766,6 +1766,7 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
* @nr_pages: how many pages to charge.
* @gfp_mask: allocation mask.
*
* The charges will only happen if @memcg matches the current cpu's memcg
* stock, and at least @nr_pages are available in that stock. Failure to
@ -1773,7 +1774,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
*
* returns true if successful, false otherwise.
*/
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
gfp_t gfp_mask)
{
struct memcg_stock_pcp *stock;
unsigned int stock_pages;
@ -1783,7 +1785,11 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
local_lock_irqsave(&memcg_stock.stock_lock, flags);
if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
if (!gfpflags_allow_spinning(gfp_mask))
return ret;
localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
}
stock = this_cpu_ptr(&memcg_stock);
stock_pages = READ_ONCE(stock->nr_pages);
@ -1792,7 +1798,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
ret = true;
}
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@ -1831,14 +1837,14 @@ static void drain_local_stock(struct work_struct *dummy)
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
local_lock_irqsave(&memcg_stock.stock_lock, flags);
localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
old = drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@ -1868,9 +1874,20 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long flags;
local_lock_irqsave(&memcg_stock.stock_lock, flags);
if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
/*
* In case of unlikely failure to lock percpu stock_lock
* uncharge memcg directly.
*/
if (mem_cgroup_is_root(memcg))
return;
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
return;
}
__refill_stock(memcg, nr_pages);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
/*
@ -1927,9 +1944,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
stock = &per_cpu(memcg_stock, cpu);
/* drain_obj_stock requires stock_lock */
local_lock_irqsave(&memcg_stock.stock_lock, flags);
localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
old = drain_obj_stock(stock);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
drain_stock(stock);
obj_cgroup_put(old);
@ -2222,9 +2239,13 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned long pflags;
retry:
if (consume_stock(memcg, nr_pages))
if (consume_stock(memcg, nr_pages, gfp_mask))
return 0;
if (!gfpflags_allow_spinning(gfp_mask))
/* Avoid the refill and flush of the older stock */
batch = nr_pages;
if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
@ -2708,7 +2729,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
unsigned long flags;
int *bytes;
local_lock_irqsave(&memcg_stock.stock_lock, flags);
localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
/*
@ -2761,7 +2782,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
__mod_objcg_mlstate(objcg, pgdat, idx, nr);
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@ -2771,7 +2792,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
unsigned long flags;
bool ret = false;
local_lock_irqsave(&memcg_stock.stock_lock, flags);
localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
@ -2779,7 +2800,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
ret = true;
}
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@ -2871,7 +2892,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
unsigned long flags;
unsigned int nr_pages = 0;
local_lock_irqsave(&memcg_stock.stock_lock, flags);
localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
@ -2889,7 +2910,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
if (nr_pages)

mm/page_alloc.c

@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
/* Free the page without taking locks. Rely on trylock only. */
#define FPI_TRYLOCK ((__force fpi_t)BIT(2))
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@ -1249,13 +1252,44 @@ static void split_large_buddy(struct zone *zone, struct page *page,
} while (1);
}
static void add_page_to_zone_llist(struct zone *zone, struct page *page,
unsigned int order)
{
/* Remember the order */
page->order = order;
/* Add the page to the free list */
llist_add(&page->pcp_llist, &zone->trylock_free_pages);
}
static void free_one_page(struct zone *zone, struct page *page,
unsigned long pfn, unsigned int order,
fpi_t fpi_flags)
{
struct llist_head *llhead;
unsigned long flags;
spin_lock_irqsave(&zone->lock, flags);
if (!spin_trylock_irqsave(&zone->lock, flags)) {
if (unlikely(fpi_flags & FPI_TRYLOCK)) {
add_page_to_zone_llist(zone, page, order);
return;
}
spin_lock_irqsave(&zone->lock, flags);
}
/* The lock succeeded. Process deferred pages. */
llhead = &zone->trylock_free_pages;
if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
struct llist_node *llnode;
struct page *p, *tmp;
llnode = llist_del_all(llhead);
llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
unsigned int p_order = p->order;
split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
__count_vm_events(PGFREE, 1 << p_order);
}
}
split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
@ -2307,7 +2341,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long flags;
int i;
spin_lock_irqsave(&zone->lock, flags);
if (!spin_trylock_irqsave(&zone->lock, flags)) {
if (unlikely(alloc_flags & ALLOC_TRYLOCK))
return 0;
spin_lock_irqsave(&zone->lock, flags);
}
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
alloc_flags);
@ -2595,7 +2633,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
static void free_frozen_page_commit(struct zone *zone,
struct per_cpu_pages *pcp, struct page *page, int migratetype,
unsigned int order)
unsigned int order, fpi_t fpi_flags)
{
int high, batch;
int pindex;
@ -2630,6 +2668,14 @@ static void free_frozen_page_commit(struct zone *zone,
}
if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
pcp->free_count += (1 << order);
if (unlikely(fpi_flags & FPI_TRYLOCK)) {
/*
* Do not attempt to take a zone lock. Let pcp->count get
* over high mark temporarily.
*/
return;
}
high = nr_pcp_high(pcp, zone, batch, free_high);
if (pcp->count >= high) {
free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@ -2644,7 +2690,8 @@ static void free_frozen_page_commit(struct zone *zone,
/*
* Free a pcp page
*/
void free_frozen_pages(struct page *page, unsigned int order)
static void __free_frozen_pages(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
unsigned long __maybe_unused UP_flags;
struct per_cpu_pages *pcp;
@ -2653,7 +2700,7 @@ void free_frozen_pages(struct page *page, unsigned int order)
int migratetype;
if (!pcp_allowed_order(order)) {
__free_pages_ok(page, order, FPI_NONE);
__free_pages_ok(page, order, fpi_flags);
return;
}
@ -2671,23 +2718,33 @@ void free_frozen_pages(struct page *page, unsigned int order)
migratetype = get_pfnblock_migratetype(page, pfn);
if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
if (unlikely(is_migrate_isolate(migratetype))) {
free_one_page(zone, page, pfn, order, FPI_NONE);
free_one_page(zone, page, pfn, order, fpi_flags);
return;
}
migratetype = MIGRATE_MOVABLE;
}
if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
&& (in_nmi() || in_hardirq()))) {
add_page_to_zone_llist(zone, page, order);
return;
}
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
free_frozen_page_commit(zone, pcp, page, migratetype, order);
free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
pcp_spin_unlock(pcp);
} else {
free_one_page(zone, page, pfn, order, FPI_NONE);
free_one_page(zone, page, pfn, order, fpi_flags);
}
pcp_trylock_finish(UP_flags);
}
void free_frozen_pages(struct page *page, unsigned int order)
{
__free_frozen_pages(page, order, FPI_NONE);
}
/*
* Free a batch of folios
*/
@ -2776,7 +2833,7 @@ void free_unref_folios(struct folio_batch *folios)
trace_mm_page_free_batched(&folio->page);
free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
order);
order, FPI_NONE);
}
if (pcp) {
@ -2907,7 +2964,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
spin_lock_irqsave(&zone->lock, flags);
if (!spin_trylock_irqsave(&zone->lock, flags)) {
if (unlikely(alloc_flags & ALLOC_TRYLOCK))
return NULL;
spin_lock_irqsave(&zone->lock, flags);
}
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
@ -4512,7 +4573,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
might_alloc(gfp_mask);
if (should_fail_alloc_page(gfp_mask, order))
/*
* Don't invoke should_fail logic, since it may call
* get_random_u32() and printk() which need to spin_lock.
*/
if (!(*alloc_flags & ALLOC_TRYLOCK) &&
should_fail_alloc_page(gfp_mask, order))
return false;
*alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags);
@ -4810,9 +4876,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
EXPORT_SYMBOL(get_zeroed_page_noprof);
/**
* __free_pages - Free pages allocated with alloc_pages().
* ___free_pages - Free pages allocated with alloc_pages().
* @page: The page pointer returned from alloc_pages().
* @order: The order of the allocation.
* @fpi_flags: Free Page Internal flags.
*
* This function can free multi-page allocations that are not compound
* pages. It does not check that the @order passed in matches that of
@ -4829,22 +4896,37 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
* Context: May be called in interrupt context or while holding a normal
* spinlock, but not in NMI context or while holding a raw spinlock.
*/
void __free_pages(struct page *page, unsigned int order)
static void ___free_pages(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
/* get PageHead before we drop reference */
int head = PageHead(page);
struct alloc_tag *tag = pgalloc_tag_get(page);
if (put_page_testzero(page))
free_frozen_pages(page, order);
__free_frozen_pages(page, order, fpi_flags);
else if (!head) {
pgalloc_tag_sub_pages(tag, (1 << order) - 1);
while (order-- > 0)
free_frozen_pages(page + (1 << order), order);
__free_frozen_pages(page + (1 << order), order,
fpi_flags);
}
}
void __free_pages(struct page *page, unsigned int order)
{
___free_pages(page, order, FPI_NONE);
}
EXPORT_SYMBOL(__free_pages);
/*
* Can be called while holding raw_spin_lock or from IRQ and NMI for any
* page type (not only those that came from try_alloc_pages)
*/
void free_pages_nolock(struct page *page, unsigned int order)
{
___free_pages(page, order, FPI_TRYLOCK);
}
void free_pages(unsigned long addr, unsigned int order)
{
if (addr != 0) {
@ -7081,3 +7163,94 @@ static bool __free_unaccepted(struct page *page)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
/**
* try_alloc_pages - opportunistic reentrant allocation from any context
* @nid: node to allocate from
* @order: allocation order size
*
* Allocates pages of a given order from the given node. This is safe to
* call from any context (from atomic, NMI, and also reentrant
* allocator -> tracepoint -> try_alloc_pages_noprof).
* Allocation is best effort and expected to fail easily, so nobody should
* rely on its success. Failures are not reported via warn_alloc().
* See the always-fail conditions below.
*
* Return: allocated page or NULL on failure.
*/
struct page *try_alloc_pages_noprof(int nid, unsigned int order)
{
/*
* Do not specify __GFP_DIRECT_RECLAIM, since direct reclaim is not allowed.
* Do not specify __GFP_KSWAPD_RECLAIM either, since waking up kswapd
* is not safe in an arbitrary context.
*
* These two are the conditions for gfpflags_allow_spinning() being true.
*
* Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason
* to warn. Also, warning would trigger printk(), which is unsafe from
* various contexts. We cannot use printk_deferred_enter() to mitigate,
* since the running context is unknown.
*
* Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below
* is safe in any context. Also zeroing the page is mandatory for
* BPF use cases.
*
* Though __GFP_NOMEMALLOC is not checked in the code path below,
* specify it here to highlight that try_alloc_pages()
* doesn't want to deplete reserves.
*/
gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC
| __GFP_ACCOUNT;
unsigned int alloc_flags = ALLOC_TRYLOCK;
struct alloc_context ac = { };
struct page *page;
/*
* In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is
* unsafe in NMI. If spin_trylock() is called from hard IRQ the current
* task may be waiting for one rt_spin_lock, but rt_spin_trylock() will
* mark the task as the owner of another rt_spin_lock which will
* confuse PI logic, so return immediately if called from hard IRQ or
* NMI.
*
* Note, irqs_disabled() case is ok. This function can be called
* from raw_spin_lock_irqsave region.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq()))
return NULL;
if (!pcp_allowed_order(order))
return NULL;
#ifdef CONFIG_UNACCEPTED_MEMORY
/* Bailout, since try_to_accept_memory_one() needs to take a lock */
if (has_unaccepted_memory())
return NULL;
#endif
/* Bailout, since _deferred_grow_zone() needs to take a lock */
if (deferred_pages_enabled())
return NULL;
if (nid == NUMA_NO_NODE)
nid = numa_node_id();
prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac,
&alloc_gfp, &alloc_flags);
/*
* Best effort allocation from percpu free list.
* If it's empty attempt to spin_trylock zone->lock.
*/
page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
/* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */
if (memcg_kmem_online() && page &&
unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) {
free_pages_nolock(page, order);
page = NULL;
}
trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
kmsan_alloc_page(page, order, alloc_gfp);
return page;
}

mm/page_owner.c

@ -294,7 +294,13 @@ void __reset_page_owner(struct page *page, unsigned short order)
page_owner = get_page_owner(page_ext);
alloc_handle = page_owner->handle;
handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
/*
* Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false
* to prevent issues in stack_depot_save().
* This is similar to try_alloc_pages() gfp flags, but only used
* to signal stack_depot to avoid spin_locks.
*/
handle = save_stack(__GFP_NOWARN);
__update_page_owner_free_handle(page_ext, handle, order, current->pid,
current->tgid, free_ts_nsec);
page_ext_put(page_ext);