mm: memcontrol: convert objcg to be per-memcg per-node type

Convert objcg to be per-memcg per-node type, so that when LRU folios are
reparented later, we can hold the lru lock at the node level, thus avoiding
holding too many lru locks at once.

[zhengqi.arch@bytedance.com: reset pn->orig_objcg to NULL]
  Link: https://lore.kernel.org/20260309112939.31937-1-qi.zheng@linux.dev
[akpm@linux-foundation.org: fix comment typo, per Usama. Reflow comment to 80 cols]
[devnexen@gmail.com: fix obj_cgroup leak in mem_cgroup_css_online() error path]
  Link: https://lore.kernel.org/20260322193631.45457-1-devnexen@gmail.com
[devnexen@gmail.com: add newline, per Qi Zheng]
  Link: https://lore.kernel.org/20260323063007.7783-1-devnexen@gmail.com
Link: https://lore.kernel.org/56c04b1c5d54f75ccdc12896df6c1ca35403ecc3.1772711148.git.zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: David Carlier <devnexen@gmail.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Allen Pais <apais@linux.microsoft.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Yosry Ahmed <yosry@kernel.org>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Usama Arif <usama.arif@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

master
parent
8285917d6f
commit
01b9da291c
|
|
@ -115,6 +115,16 @@ struct mem_cgroup_per_node {
|
|||
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
|
||||
struct mem_cgroup_reclaim_iter iter;
|
||||
|
||||
/*
|
||||
* objcg is wiped out as a part of the objcg reparenting process.
|
||||
* orig_objcg preserves a pointer (and a reference) to the original
|
||||
* objcg until the end of life of memcg.
|
||||
*/
|
||||
struct obj_cgroup __rcu *objcg;
|
||||
struct obj_cgroup *orig_objcg;
|
||||
/* list of inherited objcgs, protected by objcg_lock */
|
||||
struct list_head objcg_list;
|
||||
|
||||
#ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
|
||||
/* slab stats for nmi context */
|
||||
atomic_t slab_reclaimable;
|
||||
|
|
@ -179,6 +189,7 @@ struct obj_cgroup {
|
|||
struct list_head list; /* protected by objcg_lock */
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
bool is_root;
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -257,15 +268,6 @@ struct mem_cgroup {
|
|||
seqlock_t socket_pressure_seqlock;
|
||||
#endif
|
||||
int kmemcg_id;
|
||||
/*
|
||||
* memcg->objcg is wiped out as a part of the objcg repaprenting
|
||||
* process. memcg->orig_objcg preserves a pointer (and a reference)
|
||||
* to the original objcg until the end of live of memcg.
|
||||
*/
|
||||
struct obj_cgroup __rcu *objcg;
|
||||
struct obj_cgroup *orig_objcg;
|
||||
/* list of inherited objcgs, protected by objcg_lock */
|
||||
struct list_head objcg_list;
|
||||
|
||||
struct memcg_vmstats_percpu __percpu *vmstats_percpu;
|
||||
|
||||
|
|
@ -332,7 +334,6 @@ struct mem_cgroup {
|
|||
#define MEMCG_CHARGE_BATCH 64U
|
||||
|
||||
extern struct mem_cgroup *root_mem_cgroup;
|
||||
extern struct obj_cgroup *root_obj_cgroup;
|
||||
|
||||
enum page_memcg_data_flags {
|
||||
/* page->memcg_data is a pointer to an slabobj_ext vector */
|
||||
|
|
@ -551,7 +552,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
|
|||
|
||||
static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
|
||||
{
|
||||
return objcg == root_obj_cgroup;
|
||||
return objcg->is_root;
|
||||
}
|
||||
|
||||
static inline bool mem_cgroup_disabled(void)
|
||||
|
|
|
|||
|
|
@ -1533,7 +1533,7 @@ struct task_struct {
|
|||
/* Used by memcontrol for targeted memcg charge: */
|
||||
struct mem_cgroup *active_memcg;
|
||||
|
||||
/* Cache for current->cgroups->memcg->objcg lookups: */
|
||||
/* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */
|
||||
struct obj_cgroup *objcg;
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -83,8 +83,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
|
|||
struct mem_cgroup *root_mem_cgroup __read_mostly;
|
||||
EXPORT_SYMBOL(root_mem_cgroup);
|
||||
|
||||
struct obj_cgroup *root_obj_cgroup __read_mostly;
|
||||
|
||||
/* Active memory cgroup to use from an interrupt context */
|
||||
DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
|
||||
EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
|
||||
|
|
@ -209,18 +207,21 @@ static struct obj_cgroup *obj_cgroup_alloc(void)
|
|||
}
|
||||
|
||||
static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg,
|
||||
struct mem_cgroup *parent)
|
||||
struct mem_cgroup *parent,
|
||||
int nid)
|
||||
{
|
||||
struct obj_cgroup *objcg, *iter;
|
||||
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
|
||||
struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid];
|
||||
|
||||
objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
|
||||
objcg = rcu_replace_pointer(pn->objcg, NULL, true);
|
||||
/* 1) Ready to reparent active objcg. */
|
||||
list_add(&objcg->list, &memcg->objcg_list);
|
||||
list_add(&objcg->list, &pn->objcg_list);
|
||||
/* 2) Reparent active objcg and already reparented objcgs to parent. */
|
||||
list_for_each_entry(iter, &memcg->objcg_list, list)
|
||||
list_for_each_entry(iter, &pn->objcg_list, list)
|
||||
WRITE_ONCE(iter->memcg, parent);
|
||||
/* 3) Move already reparented objcgs to the parent's list */
|
||||
list_splice(&memcg->objcg_list, &parent->objcg_list);
|
||||
list_splice(&pn->objcg_list, &parent_pn->objcg_list);
|
||||
|
||||
return objcg;
|
||||
}
|
||||
|
|
@ -267,14 +268,17 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg)
|
|||
{
|
||||
struct obj_cgroup *objcg;
|
||||
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
|
||||
int nid;
|
||||
|
||||
reparent_locks(memcg, parent);
|
||||
for_each_node(nid) {
|
||||
reparent_locks(memcg, parent);
|
||||
|
||||
objcg = __memcg_reparent_objcgs(memcg, parent);
|
||||
objcg = __memcg_reparent_objcgs(memcg, parent, nid);
|
||||
|
||||
reparent_unlocks(memcg, parent);
|
||||
reparent_unlocks(memcg, parent);
|
||||
|
||||
percpu_ref_kill(&objcg->refcnt);
|
||||
percpu_ref_kill(&objcg->refcnt);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -2830,8 +2834,10 @@ struct mem_cgroup *mem_cgroup_from_virt(void *p)
|
|||
|
||||
static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
|
||||
{
|
||||
int nid = numa_node_id();
|
||||
|
||||
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
|
||||
struct obj_cgroup *objcg = rcu_dereference(memcg->objcg);
|
||||
struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg);
|
||||
|
||||
if (likely(objcg && obj_cgroup_tryget(objcg)))
|
||||
return objcg;
|
||||
|
|
@ -2895,6 +2901,7 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
|
|||
{
|
||||
struct mem_cgroup *memcg;
|
||||
struct obj_cgroup *objcg;
|
||||
int nid = numa_node_id();
|
||||
|
||||
if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
|
||||
return NULL;
|
||||
|
|
@ -2911,14 +2918,14 @@ __always_inline struct obj_cgroup *current_obj_cgroup(void)
|
|||
* Objcg reference is kept by the task, so it's safe
|
||||
* to use the objcg by the current task.
|
||||
*/
|
||||
return objcg ? : root_obj_cgroup;
|
||||
return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
|
||||
}
|
||||
|
||||
memcg = this_cpu_read(int_active_memcg);
|
||||
if (unlikely(memcg))
|
||||
goto from_memcg;
|
||||
|
||||
return root_obj_cgroup;
|
||||
return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
|
||||
|
||||
from_memcg:
|
||||
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
|
||||
|
|
@ -2928,12 +2935,12 @@ from_memcg:
|
|||
* away and can be used within the scope without any additional
|
||||
* protection.
|
||||
*/
|
||||
objcg = rcu_dereference_check(memcg->objcg, 1);
|
||||
objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1);
|
||||
if (likely(objcg))
|
||||
return objcg;
|
||||
}
|
||||
|
||||
return root_obj_cgroup;
|
||||
return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
|
||||
}
|
||||
|
||||
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
|
||||
|
|
@ -3876,6 +3883,8 @@ static bool alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
|
|||
if (!pn->lruvec_stats_percpu)
|
||||
goto fail;
|
||||
|
||||
INIT_LIST_HEAD(&pn->objcg_list);
|
||||
|
||||
lruvec_init(&pn->lruvec);
|
||||
pn->memcg = memcg;
|
||||
|
||||
|
|
@ -3890,10 +3899,14 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
|
|||
{
|
||||
int node;
|
||||
|
||||
obj_cgroup_put(memcg->orig_objcg);
|
||||
for_each_node(node) {
|
||||
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
|
||||
if (!pn)
|
||||
continue;
|
||||
|
||||
for_each_node(node)
|
||||
free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
|
||||
obj_cgroup_put(pn->orig_objcg);
|
||||
free_mem_cgroup_per_node_info(pn);
|
||||
}
|
||||
memcg1_free_events(memcg);
|
||||
kfree(memcg->vmstats);
|
||||
free_percpu(memcg->vmstats_percpu);
|
||||
|
|
@ -3964,7 +3977,6 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
|
|||
#endif
|
||||
memcg1_memcg_init(memcg);
|
||||
memcg->kmemcg_id = -1;
|
||||
INIT_LIST_HEAD(&memcg->objcg_list);
|
||||
#ifdef CONFIG_CGROUP_WRITEBACK
|
||||
INIT_LIST_HEAD(&memcg->cgwb_list);
|
||||
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
|
||||
|
|
@ -4041,6 +4053,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
|||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
struct obj_cgroup *objcg;
|
||||
int nid;
|
||||
|
||||
memcg_online_kmem(memcg);
|
||||
|
||||
|
|
@ -4052,17 +4065,19 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
|||
if (alloc_shrinker_info(memcg))
|
||||
goto offline_kmem;
|
||||
|
||||
objcg = obj_cgroup_alloc();
|
||||
if (!objcg)
|
||||
goto free_shrinker;
|
||||
for_each_node(nid) {
|
||||
objcg = obj_cgroup_alloc();
|
||||
if (!objcg)
|
||||
goto free_objcg;
|
||||
|
||||
if (unlikely(mem_cgroup_is_root(memcg)))
|
||||
root_obj_cgroup = objcg;
|
||||
if (unlikely(mem_cgroup_is_root(memcg)))
|
||||
objcg->is_root = true;
|
||||
|
||||
objcg->memcg = memcg;
|
||||
rcu_assign_pointer(memcg->objcg, objcg);
|
||||
obj_cgroup_get(objcg);
|
||||
memcg->orig_objcg = objcg;
|
||||
objcg->memcg = memcg;
|
||||
rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg);
|
||||
obj_cgroup_get(objcg);
|
||||
memcg->nodeinfo[nid]->orig_objcg = objcg;
|
||||
}
|
||||
|
||||
if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
|
||||
queue_delayed_work(system_dfl_wq, &stats_flush_dwork,
|
||||
|
|
@ -4086,7 +4101,24 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
|
|||
xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL);
|
||||
|
||||
return 0;
|
||||
free_shrinker:
|
||||
free_objcg:
|
||||
for_each_node(nid) {
|
||||
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
|
||||
|
||||
objcg = rcu_replace_pointer(pn->objcg, NULL, true);
|
||||
if (objcg)
|
||||
percpu_ref_kill(&objcg->refcnt);
|
||||
|
||||
if (pn->orig_objcg) {
|
||||
obj_cgroup_put(pn->orig_objcg);
|
||||
/*
|
||||
* Reset pn->orig_objcg to NULL to prevent
|
||||
* obj_cgroup_put() from being called again in
|
||||
* __mem_cgroup_free().
|
||||
*/
|
||||
pn->orig_objcg = NULL;
|
||||
}
|
||||
}
|
||||
free_shrinker_info(memcg);
|
||||
offline_kmem:
|
||||
memcg_offline_kmem(memcg);
|
||||
|
|
|
|||
Loading…
Reference in New Issue