LinuxLists.cc - [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

2019-10-18 22:24:23

Subject: [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

Switch to per-object accounting of non-root slab objects.

Charging is performed using subpage charging API in pre_alloc hook.
If the amount of memory has been charged successfully, we proceed
with the actual allocation. Otherwise, -ENOMEM is returned.

In post_alloc hook we do check if the actual allocation succeeded.
If so, corresponding vmstats are bumped and memcg membership
information is recorded. Otherwise, the charge is canceled.

On free path we do look for memcg membership information,
decrement stats and do uncharge. No operations are performed
with root kmem_caches.

Global per-node slab-related vmstats NR_SLAB_(UN)RECLAIMABLE_B
are still modified from (un)charge_slab_page() functions. The idea
is to keep all slab pages accounted as slab pages on system level.
Memcg and lruvec counters are now representing only memory used
by actual slab objects and do not include free space. Free space
is shared and doesn't belong to any specific cgroup.

Signed-off-by: Roman Gushchin <[email protected]>
---
mm/slab.h | 152 ++++++++++++++++++++----------------------------------
1 file changed, 57 insertions(+), 95 deletions(-)

diff --git a/mm/slab.h b/mm/slab.h
index 28feabed1e9a..0f2f712de77a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -340,72 +340,6 @@ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
return NULL;
}

-/*
- * Charge the slab page belonging to the non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline int memcg_charge_slab(struct page *page,
- gfp_t gfp, int order,
- struct kmem_cache *s)
-{
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
- int ret;
-
- rcu_read_lock();
- memcg = READ_ONCE(s->memcg_params.memcg);
- while (memcg && !css_tryget_online(&memcg->css))
- memcg = parent_mem_cgroup(memcg);
- rcu_read_unlock();
-
- if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- (PAGE_SIZE << order));
- percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
- return 0;
- }
-
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
- if (ret)
- goto out;
-
- lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
- mod_lruvec_state(lruvec, cache_vmstat_idx(s), PAGE_SIZE << order);
-
- /* transer try_charge() page references to kmem_cache */
- percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
- css_put_many(&memcg->css, 1 << order);
-out:
- css_put(&memcg->css);
- return ret;
-}
-
-/*
- * Uncharge a slab page belonging to a non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline void memcg_uncharge_slab(struct page *page, int order,
- struct kmem_cache *s)
-{
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
-
- rcu_read_lock();
- memcg = READ_ONCE(s->memcg_params.memcg);
- if (likely(!mem_cgroup_is_root(memcg))) {
- lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
- mod_lruvec_state(lruvec, cache_vmstat_idx(s),
- -(PAGE_SIZE << order));
- memcg_kmem_uncharge_memcg(page, order, memcg);
- } else {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- -(PAGE_SIZE << order));
- }
- rcu_read_unlock();
-
- percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
-}
-
static inline int memcg_alloc_page_memcg_vec(struct page *page, gfp_t gfp,
unsigned int objects)
{
@@ -423,11 +357,31 @@ static inline void memcg_free_page_memcg_vec(struct page *page)
page->mem_cgroup_vec = NULL;
}

+static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct mem_cgroup **memcgp,
+ size_t size, gfp_t flags)
+{
+ struct kmem_cache *cachep;
+
+ cachep = memcg_kmem_get_cache(s, memcgp);
+ if (is_root_cache(cachep))
+ return s;
+
+ if (__memcg_kmem_charge_subpage(*memcgp, size * s->size, flags)) {
+ mem_cgroup_put(*memcgp);
+ memcg_kmem_put_cache(cachep);
+ cachep = NULL;
+ }
+
+ return cachep;
+}
+
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
struct mem_cgroup *memcg,
size_t size, void **p)
{
struct mem_cgroup_ptr *memcg_ptr;
+ struct lruvec *lruvec;
struct page *page;
unsigned long off;
size_t i;
@@ -439,6 +393,11 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
off = obj_to_index(s, page, p[i]);
mem_cgroup_ptr_get(memcg_ptr);
page->mem_cgroup_vec[off] = memcg_ptr;
+ lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+ mod_lruvec_memcg_state(lruvec, cache_vmstat_idx(s),
+ s->size);
+ } else {
+ __memcg_kmem_uncharge_subpage(memcg, s->size);
}
}
mem_cgroup_ptr_put(memcg_ptr);
@@ -451,6 +410,8 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
void *p)
{
struct mem_cgroup_ptr *memcg_ptr;
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
unsigned int off;

if (!memcg_kmem_enabled() || is_root_cache(s))
@@ -459,6 +420,14 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
off = obj_to_index(s, page, p);
memcg_ptr = page->mem_cgroup_vec[off];
page->mem_cgroup_vec[off] = NULL;
+ rcu_read_lock();
+ memcg = memcg_ptr->memcg;
+ if (likely(!mem_cgroup_is_root(memcg))) {
+ __memcg_kmem_uncharge_subpage(memcg, s->size);
+ lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+ mod_lruvec_memcg_state(lruvec, cache_vmstat_idx(s), -s->size);
+ }
+ rcu_read_unlock();
mem_cgroup_ptr_put(memcg_ptr);
}

@@ -500,17 +469,6 @@ static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
return NULL;
}

-static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
- struct kmem_cache *s)
-{
- return 0;
-}
-
-static inline void memcg_uncharge_slab(struct page *page, int order,
- struct kmem_cache *s)
-{
-}
-
static inline int memcg_alloc_page_memcg_vec(struct page *page, gfp_t gfp,
unsigned int objects)
{
@@ -521,6 +479,13 @@ static inline void memcg_free_page_memcg_vec(struct page *page)
{
}

+static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct mem_cgroup **memcgp,
+ size_t size, gfp_t flags)
+{
+ return NULL;
+}
+
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
struct mem_cgroup *memcg,
size_t size, void **p)
@@ -561,30 +526,27 @@ static __always_inline int charge_slab_page(struct page *page,
{
int ret;

- if (is_root_cache(s)) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- PAGE_SIZE << order);
- return 0;
- }
-
- ret = memcg_alloc_page_memcg_vec(page, gfp, objects);
- if (ret)
- return ret;
+ if (!is_root_cache(s)) {
+ ret = memcg_alloc_page_memcg_vec(page, gfp, objects);
+ if (ret)
+ return ret;

- return memcg_charge_slab(page, gfp, order, s);
+ percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+ }
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ PAGE_SIZE << order);
+ return 0;
}

static __always_inline void uncharge_slab_page(struct page *page, int order,
struct kmem_cache *s)
{
- if (is_root_cache(s)) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- -(PAGE_SIZE << order));
- return;
+ if (!is_root_cache(s)) {
+ memcg_free_page_memcg_vec(page);
+ percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
}
-
- memcg_free_page_memcg_vec(page);
- memcg_uncharge_slab(page, order, s);
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ -(PAGE_SIZE << order));
}

static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
@@ -656,7 +618,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,

if (memcg_kmem_enabled() &&
((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
- return memcg_kmem_get_cache(s, memcgp);
+ return memcg_slab_pre_alloc_hook(s, memcgp, size, flags);

return s;
}
--
2.21.0

2019-10-25 20:55:56

by Johannes Weiner

[permalink] [raw]

Subject: Re: [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

On Thu, Oct 17, 2019 at 05:28:13PM -0700, Roman Gushchin wrote:
> +static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
> + struct mem_cgroup **memcgp,
> + size_t size, gfp_t flags)
> +{
> + struct kmem_cache *cachep;
> +
> + cachep = memcg_kmem_get_cache(s, memcgp);
> + if (is_root_cache(cachep))
> + return s;
> +
> + if (__memcg_kmem_charge_subpage(*memcgp, size * s->size, flags)) {
> + mem_cgroup_put(*memcgp);
> + memcg_kmem_put_cache(cachep);
> + cachep = NULL;
> + }
> +
> + return cachep;
> +}
> +
> static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> struct mem_cgroup *memcg,
> size_t size, void **p)
> {
> struct mem_cgroup_ptr *memcg_ptr;
> + struct lruvec *lruvec;
> struct page *page;
> unsigned long off;
> size_t i;
> @@ -439,6 +393,11 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> off = obj_to_index(s, page, p[i]);
> mem_cgroup_ptr_get(memcg_ptr);
> page->mem_cgroup_vec[off] = memcg_ptr;
> + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
> + mod_lruvec_memcg_state(lruvec, cache_vmstat_idx(s),
> + s->size);
> + } else {
> + __memcg_kmem_uncharge_subpage(memcg, s->size);
> }
> }
> mem_cgroup_ptr_put(memcg_ptr);

The memcg_ptr as a collection vessel for object references makes a lot
of sense. But this code showcases that it should be a first-class
memory tracking API that the allocator interacts with, rather than
having to deal with a combination of memcg_ptr and memcg.

In the two hunks here, on one hand we charge bytes to the memcg
object, and then handle all the refcounting through a different
bucketing object. To support that in the first place, we have to
overload the memcg API all the way down to try_charge() to support
bytes and pages. This is difficult to follow throughout all layers.

What would be better is for this to be an abstraction layer for a
subpage object tracker that sits on top of the memcg page tracker -
not unlike the page allocator and the slab allocators themselves.

And then the slab allocator would only interact with the subpage
object tracker, and the object tracker would deal with the memcg page
tracker under the hood.

Something like the below, just a rough sketch to convey the idea:

On allocation, we look up an obj_cgroup from the current task, charge
it with obj_cgroup_charge() and bump its refcount by the number of
objects we allocated. obj_cgroup_charge() in turn uses the memcg layer
to charge pages, and then doles them out as bytes using
objcg->nr_stocked_bytes and the byte per-cpu stock.

On freeing, we call obj_cgroup_uncharge() and drop the references. In
turn, this first refills the byte per-cpu stock and drains the
overflow to the memcg layer (which in turn refills its own page stock
and drains to the page_counter).

On cgroup deletion, we reassign the obj_cgroup to the parent, and all
remaining live objects will free their pages to the updated parental
obj_cgroup->memcg, as per usual.

---

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f36203cf75f8..efc3398aaf02 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -200,15 +200,15 @@ struct memcg_cgwb_frn {
};

/*
- * A pointer to a memory cgroup with a built-in reference counter.
- * For a use as an intermediate object to simplify reparenting of
- * objects charged to the cgroup. The memcg pointer can be switched
- * to the parent cgroup without a need to modify all objects
- * which hold the reference to the cgroup.
+ * Bucket for arbitrarily byte-sized objects charged to a memory
+ * cgroup. The bucket can be reparented in one piece when the cgroup
+ * is destroyed, without having to round up the individual references
+ * of all live memory objects in the wild.
*/
-struct mem_cgroup_ptr {
- struct percpu_ref refcnt;
+struct obj_cgroup {
+ struct percpu_ref count;
struct mem_cgroup *memcg;
+ unsigned long nr_stocked_bytes;
union {
struct list_head list;
struct rcu_head rcu;
@@ -230,7 +230,6 @@ struct mem_cgroup {
/* Accounted resources */
struct page_counter memory;
struct page_counter swap;
- atomic_t nr_stocked_bytes;

/* Legacy consumer-oriented counters */
struct page_counter memsw;
@@ -327,8 +326,8 @@ struct mem_cgroup {
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
enum memcg_kmem_state kmem_state;
- struct mem_cgroup_ptr __rcu *kmem_memcg_ptr;
- struct list_head kmem_memcg_ptr_list;
+ struct obj_cgroup __rcu *slab_objs;
+ struct list_head slab_objs_list;
#endif

int last_scanned_node;
@@ -474,21 +473,6 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
return css ? container_of(css, struct mem_cgroup, css) : NULL;
}

-static inline bool mem_cgroup_ptr_tryget(struct mem_cgroup_ptr *ptr)
-{
- return percpu_ref_tryget(&ptr->refcnt);
-}
-
-static inline void mem_cgroup_ptr_get(struct mem_cgroup_ptr *ptr)
-{
- percpu_ref_get(&ptr->refcnt);
-}
-
-static inline void mem_cgroup_ptr_put(struct mem_cgroup_ptr *ptr)
-{
- percpu_ref_put(&ptr->refcnt);
-}
-
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1444,9 +1428,9 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg);
void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
unsigned int nr_pages);
-int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
+int obj_cgroup_charge(struct obj_cgroup *objcg, size_t size,
gfp_t gfp);
-void __memcg_kmem_uncharge_subpage(struct mem_cgroup *memcg, size_t size);
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);

extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
@@ -1513,20 +1497,20 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}

-static inline struct mem_cgroup_ptr *
-mem_cgroup_get_kmem_ptr(struct mem_cgroup *memcg)
+static inline struct obj_cgroup *
+mem_cgroup_get_slab_objs(struct mem_cgroup *memcg)
{
- struct mem_cgroup_ptr *memcg_ptr;
+ struct obj_cgroup *objcg;

rcu_read_lock();
do {
- memcg_ptr = rcu_dereference(memcg->kmem_memcg_ptr);
- if (memcg_ptr && mem_cgroup_ptr_tryget(memcg_ptr))
+ objcg = rcu_dereference(memcg->slab_objs);
+ if (objcg && percpu_ref_tryget(&objcg->count))
break;
} while ((memcg = parent_mem_cgroup(memcg)));
rcu_read_unlock();

- return memcg_ptr;
+ return objcg;
}

#else
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 4d99ee5a9c53..7f663fd53c17 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -200,7 +200,7 @@ struct page {
#ifdef CONFIG_MEMCG
union {
struct mem_cgroup *mem_cgroup;
- struct mem_cgroup_ptr **mem_cgroup_vec;
+ struct obj_cgroup **obj_cgroups;
};
#endif

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b0d0c833150c..47110f4f0f4c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -260,52 +260,51 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;

-static void memcg_ptr_release(struct percpu_ref *ref)
+static void obj_cgroup_release(struct percpu_ref *ref)
{
- struct mem_cgroup_ptr *ptr = container_of(ref, struct mem_cgroup_ptr,
- refcnt);
+ struct obj_cgroup *objcg;
unsigned long flags;

+ objcg = container_of(ref, struct obj_cgroup, count);
+
spin_lock_irqsave(&css_set_lock, flags);
- list_del(&ptr->list);
+ list_del(&objcg->list);
spin_unlock_irqrestore(&css_set_lock, flags);

- mem_cgroup_put(ptr->memcg);
+ mem_cgroup_put(objcg->memcg);
percpu_ref_exit(ref);
- kfree_rcu(ptr, rcu);
+ kfree_rcu(objcg, rcu);
}

-static int memcg_init_kmem_memcg_ptr(struct mem_cgroup *memcg)
+static int obj_cgroup_alloc(struct mem_cgroup *memcg)
{
- struct mem_cgroup_ptr *kmem_memcg_ptr;
+ struct obj_cgroup *objcg;
int ret;

- kmem_memcg_ptr = kmalloc(sizeof(struct mem_cgroup_ptr), GFP_KERNEL);
- if (!kmem_memcg_ptr)
+ objcg = kmalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+ if (!objcg)
return -ENOMEM;

- ret = percpu_ref_init(&kmem_memcg_ptr->refcnt, memcg_ptr_release,
+ ret = percpu_ref_init(&objcg->count, obj_cgroup_release,
0, GFP_KERNEL);
if (ret) {
- kfree(kmem_memcg_ptr);
+ kfree(objcg);
return ret;
}

- kmem_memcg_ptr->memcg = memcg;
- INIT_LIST_HEAD(&kmem_memcg_ptr->list);
- rcu_assign_pointer(memcg->kmem_memcg_ptr, kmem_memcg_ptr);
- list_add(&kmem_memcg_ptr->list, &memcg->kmem_memcg_ptr_list);
+ INIT_LIST_HEAD(&objcg->list);
+ objcg->memcg = memcg;
return 0;
}

-static void memcg_reparent_kmem_memcg_ptr(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
+static void slab_objs_reparent(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
{
+ struct obj_cgroup *objcg = NULL;
unsigned int nr_reparented = 0;
- struct mem_cgroup_ptr *memcg_ptr = NULL;

- rcu_swap_protected(memcg->kmem_memcg_ptr, memcg_ptr, true);
- percpu_ref_kill(&memcg_ptr->refcnt);
+ rcu_swap_protected(memcg->slab_objs, objcg, true);
+ percpu_ref_kill(&objcg->count);

/*
* kmem_memcg_ptr is initialized before css refcounter, so until now
@@ -314,13 +313,13 @@ static void memcg_reparent_kmem_memcg_ptr(struct mem_cgroup *memcg,
css_get(&memcg->css);

spin_lock_irq(&css_set_lock);
- list_for_each_entry(memcg_ptr, &memcg->kmem_memcg_ptr_list, list) {
- xchg(&memcg_ptr->memcg, parent);
+ list_for_each_entry(objcg, &memcg->slab_objs_list, list) {
+ xchg(&objcg->memcg, parent);
nr_reparented++;
}
if (nr_reparented)
- list_splice(&memcg->kmem_memcg_ptr_list,
- &parent->kmem_memcg_ptr_list);
+ list_splice(&memcg->slab_objs_list,
+ &parent->slab_objs_list);
spin_unlock_irq(&css_set_lock);

if (nr_reparented) {
@@ -2217,7 +2216,7 @@ struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;

- struct mem_cgroup *subpage_cached;
+ struct obj_cgroup *obj_cached;
unsigned int nr_bytes;

struct work_struct work;
@@ -2260,29 +2259,6 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
return ret;
}

-static bool consume_subpage_stock(struct mem_cgroup *memcg,
- unsigned int nr_bytes)
-{
- struct memcg_stock_pcp *stock;
- unsigned long flags;
- bool ret = false;
-
- if (nr_bytes > (MEMCG_CHARGE_BATCH << PAGE_SHIFT))
- return ret;
-
- local_irq_save(flags);
-
- stock = this_cpu_ptr(&memcg_stock);
- if (memcg == stock->subpage_cached && stock->nr_bytes >= nr_bytes) {
- stock->nr_bytes -= nr_bytes;
- ret = true;
- }
-
- local_irq_restore(flags);
-
- return ret;
-}
-
/*
* Returns stocks cached in percpu and reset cached information.
*/
@@ -2300,59 +2276,73 @@ static void drain_stock(struct memcg_stock_pcp *stock)
stock->cached = NULL;
}

-static void drain_subpage_stock(struct memcg_stock_pcp *stock)
+/*
+ * Cache charges(val) to local per_cpu area.
+ * This will be consumed by consume_stock() function, later.
+ */
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
- struct mem_cgroup *old = stock->subpage_cached;
-
- if (stock->nr_bytes) {
- unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
- unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;

- page_counter_uncharge(&old->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&old->memsw, nr_pages);
+ local_irq_save(flags);

- atomic_add(nr_bytes, &old->nr_stocked_bytes);
- stock->nr_bytes = 0;
- }
- if (stock->subpage_cached) {
- /*
- * Unlike generic percpu stock, where ->cached pointer is
- * protected by css references of held pages, pages held
- * by subpage stocks are not holding a css reference, so
- * we need to protect ->subpage_cached pointer explicitly.
- *
- * css_put() is paired with css_get() in refill_subpage_stock()
- */
- css_put(&old->css);
- stock->subpage_cached = NULL;
+ stock = this_cpu_ptr(&memcg_stock);
+ if (stock->cached != memcg) { /* reset if necessary */
+ drain_stock(stock);
+ stock->cached = memcg;
}
+ stock->nr_pages += nr_pages;
+
+ if (stock->nr_pages > MEMCG_CHARGE_BATCH)
+ drain_stock(stock);
+
+ local_irq_restore(flags);
}

-static void drain_local_stock(struct work_struct *dummy)
+static bool consume_obj_stock(struct obj_cgroup *objcg,
+ unsigned int nr_bytes)
{
struct memcg_stock_pcp *stock;
unsigned long flags;
+ bool ret = false;
+
+ if (nr_bytes > (MEMCG_CHARGE_BATCH << PAGE_SHIFT))
+ return ret;

- /*
- * The only protection from memory hotplug vs. drain_stock races is
- * that we always operate on local CPU stock here with IRQ disabled
- */
local_irq_save(flags);

stock = this_cpu_ptr(&memcg_stock);
- drain_stock(stock);
- drain_subpage_stock(stock);
- clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+ if (objcg == stock->obj_cached && stock->nr_bytes >= nr_bytes) {
+ stock->nr_bytes -= nr_bytes;
+ ret = true;
+ }

local_irq_restore(flags);
+
+ return ret;
}

-/*
- * Cache charges(val) to local per_cpu area.
- * This will be consumed by consume_stock() function, later.
- */
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+ struct obj_cgroup *old = stock->obj_cached;
+
+ if (stock->nr_bytes) {
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+ refill_stock(old->memcg, nr_pages);
+ atomic_add(nr_bytes, &old->nr_stocked_bytes);
+ stock->nr_bytes = 0;
+ }
+ if (stock->subpage_cached) {
+ percpu_ref_put(&objcg->refcount);
+ stock->obj_cached = NULL;
+ }
+}
+
+static void refill_obj_stock(struct obj_cgroup *objcg,
+ unsigned int nr_bytes)
{
struct memcg_stock_pcp *stock;
unsigned long flags;
@@ -2360,37 +2350,35 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
local_irq_save(flags);

stock = this_cpu_ptr(&memcg_stock);
- if (stock->cached != memcg) { /* reset if necessary */
- drain_stock(stock);
- stock->cached = memcg;
+ if (stock->obj_cached != objcg) { /* reset if necessary */
+ drain_obj_stock(stock);
+ percpu_ref_get(&objcg->refcount);
+ stock->obj_cached = objcg;
+ stock->nr_bytes = atomic_xchg(&objcg->nr_stocked_bytes, 0);
}
- stock->nr_pages += nr_pages;
+ stock->nr_bytes += nr_bytes;

- if (stock->nr_pages > MEMCG_CHARGE_BATCH)
- drain_stock(stock);
+ if (stock->nr_bytes > (MEMCG_CHARGE_BATCH << PAGE_SHIFT))
+ drain_obj_stock(stock);

local_irq_restore(flags);
}

-static void refill_subpage_stock(struct mem_cgroup *memcg,
- unsigned int nr_bytes)
+static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
unsigned long flags;

+ /*
+ * The only protection from memory hotplug vs. drain_stock races is
+ * that we always operate on local CPU stock here with IRQ disabled
+ */
local_irq_save(flags);

stock = this_cpu_ptr(&memcg_stock);
- if (stock->subpage_cached != memcg) { /* reset if necessary */
- drain_subpage_stock(stock);
- css_get(&memcg->css);
- stock->subpage_cached = memcg;
- stock->nr_bytes = atomic_xchg(&memcg->nr_stocked_bytes, 0);
- }
- stock->nr_bytes += nr_bytes;
-
- if (stock->nr_bytes > (MEMCG_CHARGE_BATCH << PAGE_SHIFT))
- drain_subpage_stock(stock);
+ drain_obj_stock(stock);
+ drain_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);

local_irq_restore(flags);
}
@@ -2658,9 +2646,8 @@ void mem_cgroup_handle_over_high(void)
*/

static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int amount, bool subpage)
+ unsigned int nr_pages)
{
- unsigned int nr_pages = subpage ? ((amount >> PAGE_SHIFT) + 1) : amount;
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
@@ -2673,9 +2660,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (mem_cgroup_is_root(memcg))
return 0;
retry:
- if (subpage && consume_subpage_stock(memcg, amount))
- return 0;
- else if (!subpage && consume_stock(memcg, nr_pages))
+ if (consume_stock(memcg, nr_pages))
return 0;

if (!do_memsw_account() ||
@@ -2794,21 +2779,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);

- if (subpage)
- refill_subpage_stock(memcg, (nr_pages << PAGE_SHIFT) - amount);
- else
- css_get_many(&memcg->css, nr_pages);
+ css_get_many(&memcg->css, nr_pages);

return 0;

done_restock:
- if (subpage && (batch << PAGE_SHIFT) > amount) {
- refill_subpage_stock(memcg, (batch << PAGE_SHIFT) - amount);
- } else if (!subpage) {
- css_get_many(&memcg->css, batch);
- if (batch > nr_pages)
- refill_stock(memcg, batch - nr_pages);
- }
+ css_get_many(&memcg->css, batch);
+ if (batch > nr_pages)
+ refill_stock(memcg, batch - nr_pages);

/*
* If the hierarchy is above the normal consumption range, schedule
@@ -3117,15 +3095,24 @@ void __memcg_kmem_uncharge(struct page *page, int order)
css_put_many(&memcg->css, nr_pages);
}

-int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
- gfp_t gfp)
+int obj_cgroup_charge(struct obj_cgroup *objcg, size_t size, gfp_t gfp)
{
- return try_charge(memcg, gfp, size, true);
+ int ret;
+
+ if (consume_obj_stock(objcg, nr_bytes))
+ return 0;
+
+ ret = try_charge(objcg->memcg, gfp, 1);
+ if (ret)
+ return ret;
+
+ refill_obj_stock(objcg, PAGE_SIZE - size);
+ return 0;
}

-void __memcg_kmem_uncharge_subpage(struct mem_cgroup *memcg, size_t size)
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
{
- refill_subpage_stock(memcg, size);
+ refill_obj_stock(objcg, size);
}

#endif /* CONFIG_MEMCG_KMEM */
@@ -3562,6 +3549,7 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
+ struct obj_cgroup *objcg;
int memcg_id, ret;

if (cgroup_memory_nokmem)
@@ -3574,11 +3562,13 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
if (memcg_id < 0)
return memcg_id;

- ret = memcg_init_kmem_memcg_ptr(memcg);
+ objcg = obj_cgroup_alloc(memcg);
if (ret) {
memcg_free_cache_id(memcg_id);
return ret;
}
+ rcu_assign_pointer(memcg->slab_objs, objcg);
+ list_add(&objcg->list, &memcg->slab_objs_list);

static_branch_inc(&memcg_kmem_enabled_key);
/*
@@ -3613,7 +3603,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
* precise values at the parent and all ancestor levels. It's required
* to keep slab stats accurate after the reparenting of kmem_caches.
*/
- memcg_reparent_kmem_memcg_ptr(memcg, parent);
+ obj_cgroup_reparent(memcg, parent);
memcg_flush_percpu_vmstats(memcg, true);

kmemcg_id = memcg->kmemcg_id;
@@ -5164,7 +5154,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->kmem_memcg_ptr_list);
+ INIT_LIST_HEAD(&memcg->obj_cgroup_list);
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
diff --git a/mm/slab.c b/mm/slab.c
index 0914d7cd869f..d452e0e7aab1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3212,10 +3212,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
unsigned long save_flags;
void *ptr;
int slab_node = numa_mem_id();
- struct mem_cgroup *memcg = NULL;
+ struct obj_cgroup *objcg = NULL;

flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, &memcg, 1, flags);
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;

@@ -3251,7 +3251,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
memset(ptr, 0, cachep->object_size);

- slab_post_alloc_hook(cachep, memcg, flags, 1, &ptr);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
return ptr;
}

@@ -3292,10 +3292,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
unsigned long save_flags;
void *objp;
- struct mem_cgroup *memcg = NULL;
+ struct obj_cgroup *objcg = NULL;

flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, &memcg, 1, flags);
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;

@@ -3309,7 +3309,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
memset(objp, 0, cachep->object_size);

- slab_post_alloc_hook(cachep, memcg, flags, 1, &objp);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
return objp;
}

@@ -3497,9 +3497,9 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
size_t i;
- struct mem_cgroup *memcg = NULL;
+ struct obj_cgroup *objcg = NULL;

- s = slab_pre_alloc_hook(s, &memcg, size, flags);
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
if (!s)
return 0;

@@ -3522,13 +3522,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
for (i = 0; i < size; i++)
memset(p[i], 0, s->object_size);

- slab_post_alloc_hook(s, memcg, flags, size, p);
+ slab_post_alloc_hook(s, objcg, flags, size, p);
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
- slab_post_alloc_hook(s, memcg, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
diff --git a/mm/slab.h b/mm/slab.h
index 035c2969a2ca..9dc082c8140d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -301,29 +301,30 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
return memcg_ptr->memcg;
}

-static inline int memcg_alloc_page_memcg_vec(struct page *page, gfp_t gfp,
+static inline int memcg_alloc_page_object_cgroups(struct page *page, gfp_t gfp,
unsigned int objects)
{
- page->mem_cgroup_vec = kmalloc(sizeof(struct mem_cgroup_ptr *) *
+ page->obj_cgroups = kmalloc(sizeof(struct mem_cgroup_ptr *) *
objects, gfp | __GFP_ZERO);
- if (!page->mem_cgroup_vec)
+ if (!page->obj_cgroups)
return -ENOMEM;

return 0;
}

-static inline void memcg_free_page_memcg_vec(struct page *page)
+static inline void memcg_free_page_obj_cgroups(struct page *page)
{
- kfree(page->mem_cgroup_vec);
- page->mem_cgroup_vec = NULL;
+ kfree(page->obj_cgroups);
+ page->obj_cgroups = NULL;
}

static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
- struct mem_cgroup **memcgp,
+ struct obj_cgroup **objcgp,
size_t size, gfp_t flags)
{
struct kmem_cache *cachep;
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;

if (memcg_kmem_bypass())
return s;
@@ -338,48 +339,51 @@ static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
return s;
}

- if (__memcg_kmem_charge_subpage(memcg, size * s->size, flags)) {
+ objcg = mem_cgroup_get_slab_objs(memcg);
+ if (obj_cgroup_charge(objcg, s->size, size, flags)) {
+ percpu_ref_put(&objcg->count);
mem_cgroup_put(memcg);
return NULL;
}
-
- *memcgp = memcg;
+ *objcgp = objcg;
return cachep;
}

-static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
- struct mem_cgroup *memcg,
- size_t size, void **p)
+static inline void cgroup_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ size_t size, void **p)
{
- struct mem_cgroup_ptr *memcg_ptr;
+ struct obj_cgroup *objcg;
struct lruvec *lruvec;
struct page *page;
unsigned long off;
size_t i;

- memcg_ptr = mem_cgroup_get_kmem_ptr(memcg);
for (i = 0; i < size; i++) {
if (likely(p[i])) {
page = virt_to_head_page(p[i]);
off = obj_to_index(s, page, p[i]);
- mem_cgroup_ptr_get(memcg_ptr);
- page->mem_cgroup_vec[off] = memcg_ptr;
- lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+ page->obj_cgroups[off] = objcg;
+ lruvec = mem_cgroup_lruvec(page_pgdat(page),
+ objcg->memcg);
mod_lruvec_memcg_state(lruvec, cache_vmstat_idx(s),
s->size);
} else {
- __memcg_kmem_uncharge_subpage(memcg, s->size);
+ object_cgroup_uncharge(objcg, s->size);
+ percpu_ref_put(&objcg->count);
}
}
- mem_cgroup_ptr_put(memcg_ptr);
+
+ /* pre_hook references */
+ percpu_ref_put(&objcg->count);
mem_cgroup_put(memcg);
}

static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
void *p)
{
- struct mem_cgroup_ptr *memcg_ptr;
struct mem_cgroup *memcg;
+ struct obj_cgroup *objcg;
struct lruvec *lruvec;
unsigned int off;

@@ -387,17 +391,16 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
return;

off = obj_to_index(s, page, p);
- memcg_ptr = page->mem_cgroup_vec[off];
- page->mem_cgroup_vec[off] = NULL;
+ objcg = page->obj_cgroup[off];
+ page->obj_cgroups[off] = NULL;
+
rcu_read_lock();
- memcg = memcg_ptr->memcg;
- if (likely(!mem_cgroup_is_root(memcg))) {
- __memcg_kmem_uncharge_subpage(memcg, s->size);
- lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
- mod_lruvec_memcg_state(lruvec, cache_vmstat_idx(s), -s->size);
- }
+ lruvec = mem_cgroup_lruvec(page_pgdat(pgdat), objcg->memcg);
+ mod_lruvec_memcg_state(lruvec, cache_vmstat_idx(s), -s->size);
rcu_read_unlock();
- mem_cgroup_ptr_put(memcg_ptr);
+
+ obj_cgroup_uncharge(objcg, s->size);
+ percpu_ref_put(&objcg->count);
}

extern void slab_init_memcg_params(struct kmem_cache *);
@@ -451,14 +454,14 @@ static inline void memcg_free_page_memcg_vec(struct page *page)
}

static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
- struct mem_cgroup **memcgp,
+ struct obj_cgroup **objcgp,
size_t size, gfp_t flags)
{
return NULL;
}

static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
- struct mem_cgroup *memcg,
+ struct obj_cgroup *objcg,
size_t size, void **p)
{
}
@@ -570,7 +573,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
}

static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- struct mem_cgroup **memcgp,
+ struct obj_cgroup **objcg,
size_t size, gfp_t flags)
{
flags &= gfp_allowed_mask;
@@ -585,13 +588,13 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,

if (memcg_kmem_enabled() &&
((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
- return memcg_slab_pre_alloc_hook(s, memcgp, size, flags);
+ return memcg_slab_pre_alloc_hook(s, objcg, size, flags);

return s;
}

static inline void slab_post_alloc_hook(struct kmem_cache *s,
- struct mem_cgroup *memcg,
+ struct obj_cgroup *objcg,
gfp_t flags, size_t size, void **p)
{
size_t i;
@@ -605,7 +608,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
}

if (!is_root_cache(s))
- memcg_slab_post_alloc_hook(s, memcg, size, p);
+ memcg_slab_post_alloc_hook(s, objcg, size, p);
}

#ifndef CONFIG_SLOB
diff --git a/mm/slub.c b/mm/slub.c
index 53abbf0831b6..8f7f035d49a3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3162,10 +3162,10 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
{
struct kmem_cache_cpu *c;
int i;
- struct mem_cgroup *memcg = NULL;
+ struct obj_cgroup *objcg = NULL;

/* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, &memcg, size, flags);
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
if (unlikely(!s))
return false;
/*
@@ -3210,11 +3210,11 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
}

/* memcg and kmem_cache debug support */
- slab_post_alloc_hook(s, memcg, flags, size, p);
+ slab_post_alloc_hook(s, objcg, flags, size, p);
return i;
error:
local_irq_enable();
- slab_post_alloc_hook(s, memcg, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
}

2019-10-25 20:58:39

by Johannes Weiner

[permalink] [raw]

Subject: Re: [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

On Fri, Oct 25, 2019 at 08:00:32PM +0000, Roman Gushchin wrote:
> On Fri, Oct 25, 2019 at 03:41:18PM -0400, Johannes Weiner wrote:
> > On Thu, Oct 17, 2019 at 05:28:13PM -0700, Roman Gushchin wrote:
> > > +static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
> > > + struct mem_cgroup **memcgp,
> > > + size_t size, gfp_t flags)
> > > +{
> > > + struct kmem_cache *cachep;
> > > +
> > > + cachep = memcg_kmem_get_cache(s, memcgp);
> > > + if (is_root_cache(cachep))
> > > + return s;
> > > +
> > > + if (__memcg_kmem_charge_subpage(*memcgp, size * s->size, flags)) {
> > > + mem_cgroup_put(*memcgp);
> > > + memcg_kmem_put_cache(cachep);
> > > + cachep = NULL;
> > > + }
> > > +
> > > + return cachep;
> > > +}
> > > +
> > > static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> > > struct mem_cgroup *memcg,
> > > size_t size, void **p)
> > > {
> > > struct mem_cgroup_ptr *memcg_ptr;
> > > + struct lruvec *lruvec;
> > > struct page *page;
> > > unsigned long off;
> > > size_t i;
> > > @@ -439,6 +393,11 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> > > off = obj_to_index(s, page, p[i]);
> > > mem_cgroup_ptr_get(memcg_ptr);
> > > page->mem_cgroup_vec[off] = memcg_ptr;
> > > + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
> > > + mod_lruvec_memcg_state(lruvec, cache_vmstat_idx(s),
> > > + s->size);
> > > + } else {
> > > + __memcg_kmem_uncharge_subpage(memcg, s->size);
> > > }
> > > }
> > > mem_cgroup_ptr_put(memcg_ptr);
> >
> > The memcg_ptr as a collection vessel for object references makes a lot
> > of sense. But this code showcases that it should be a first-class
> > memory tracking API that the allocator interacts with, rather than
> > having to deal with a combination of memcg_ptr and memcg.
> >
> > In the two hunks here, on one hand we charge bytes to the memcg
> > object, and then handle all the refcounting through a different
> > bucketing object. To support that in the first place, we have to
> > overload the memcg API all the way down to try_charge() to support
> > bytes and pages. This is difficult to follow throughout all layers.
> >
> > What would be better is for this to be an abstraction layer for a
> > subpage object tracker that sits on top of the memcg page tracker -
> > not unlike the page allocator and the slab allocators themselves.
> >
> > And then the slab allocator would only interact with the subpage
> > object tracker, and the object tracker would deal with the memcg page
> > tracker under the hood.
>
> Yes, the idea makes total sense to me. I'm not sure I like the new naming
> (I have to spend some time with it first), but the idea of moving
> stocks and leftovers to the memcg_ptr/obj_cgroup level is really good.

I'm not set on the naming, it was just to illustrate the
structuring. I picked something that has cgroup in it, is not easily
confused with the memcg API, and shortens nicely to local variable
names (obj_cgroup -> objcg), but I'm all for a better name.

> I'll include something based on your proposal into the next version
> of the patchset.

Thanks, looking forward to it.

2019-10-25 20:58:41

by Roman Gushchin

[permalink] [raw]

Subject: Re: [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

2019-10-31 01:55:01

by Roman Gushchin

[permalink] [raw]

Subject: Re: [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

On Fri, Oct 25, 2019 at 03:41:18PM -0400, Johannes Weiner wrote:
> On Thu, Oct 17, 2019 at 05:28:13PM -0700, Roman Gushchin wrote:
> > +static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
> > + struct mem_cgroup **memcgp,
> > + size_t size, gfp_t flags)
> > +{
> > + struct kmem_cache *cachep;
> > +
> > + cachep = memcg_kmem_get_cache(s, memcgp);
> > + if (is_root_cache(cachep))
> > + return s;
> > +
> > + if (__memcg_kmem_charge_subpage(*memcgp, size * s->size, flags)) {
> > + mem_cgroup_put(*memcgp);
> > + memcg_kmem_put_cache(cachep);
> > + cachep = NULL;
> > + }
> > +
> > + return cachep;
> > +}
> > +
> > static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> > struct mem_cgroup *memcg,
> > size_t size, void **p)
> > {
> > struct mem_cgroup_ptr *memcg_ptr;
> > + struct lruvec *lruvec;
> > struct page *page;
> > unsigned long off;
> > size_t i;
> > @@ -439,6 +393,11 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> > off = obj_to_index(s, page, p[i]);
> > mem_cgroup_ptr_get(memcg_ptr);
> > page->mem_cgroup_vec[off] = memcg_ptr;
> > + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
> > + mod_lruvec_memcg_state(lruvec, cache_vmstat_idx(s),
> > + s->size);
> > + } else {
> > + __memcg_kmem_uncharge_subpage(memcg, s->size);
> > }
> > }
> > mem_cgroup_ptr_put(memcg_ptr);
>
> The memcg_ptr as a collection vessel for object references makes a lot
> of sense. But this code showcases that it should be a first-class
> memory tracking API that the allocator interacts with, rather than
> having to deal with a combination of memcg_ptr and memcg.
>
> In the two hunks here, on one hand we charge bytes to the memcg
> object, and then handle all the refcounting through a different
> bucketing object. To support that in the first place, we have to
> overload the memcg API all the way down to try_charge() to support
> bytes and pages. This is difficult to follow throughout all layers.
>
> What would be better is for this to be an abstraction layer for a
> subpage object tracker that sits on top of the memcg page tracker -
> not unlike the page allocator and the slab allocators themselves.
>
> And then the slab allocator would only interact with the subpage
> object tracker, and the object tracker would deal with the memcg page
> tracker under the hood.

Hello, Johannes!

After some thoughts and coding I don't think it's a right direction to go.
Under direction I mean building the new subpage/obj_cgroup API on top
of the intact existing memcg API.
Let me illustrate my point on the code you provided (please, look below).

>
> Something like the below, just a rough sketch to convey the idea:
>
> On allocation, we look up an obj_cgroup from the current task, charge
> it with obj_cgroup_charge() and bump its refcount by the number of
> objects we allocated. obj_cgroup_charge() in turn uses the memcg layer
> to charge pages, and then doles them out as bytes using
> objcg->nr_stocked_bytes and the byte per-cpu stock.
>
> On freeing, we call obj_cgroup_uncharge() and drop the references. In
> turn, this first refills the byte per-cpu stock and drains the
> overflow to the memcg layer (which in turn refills its own page stock
> and drains to the page_counter).
>
> On cgroup deletion, we reassign the obj_cgroup to the parent, and all
> remaining live objects will free their pages to the updated parental
> obj_cgroup->memcg, as per usual.
>
> ---
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index f36203cf75f8..efc3398aaf02 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -200,15 +200,15 @@ struct memcg_cgwb_frn {
> };
>
> /*
> - * A pointer to a memory cgroup with a built-in reference counter.
> - * For a use as an intermediate object to simplify reparenting of
> - * objects charged to the cgroup. The memcg pointer can be switched
> - * to the parent cgroup without a need to modify all objects
> - * which hold the reference to the cgroup.
> + * Bucket for arbitrarily byte-sized objects charged to a memory
> + * cgroup. The bucket can be reparented in one piece when the cgroup
> + * is destroyed, without having to round up the individual references
> + * of all live memory objects in the wild.
> */
> -struct mem_cgroup_ptr {
> - struct percpu_ref refcnt;
> +struct obj_cgroup {
> + struct percpu_ref count;
> struct mem_cgroup *memcg;
> + unsigned long nr_stocked_bytes;
> union {
> struct list_head list;
> struct rcu_head rcu;
> @@ -230,7 +230,6 @@ struct mem_cgroup {
> /* Accounted resources */
> struct page_counter memory;
> struct page_counter swap;
> - atomic_t nr_stocked_bytes;
>
> /* Legacy consumer-oriented counters */
> struct page_counter memsw;
> @@ -327,8 +326,8 @@ struct mem_cgroup {
> /* Index in the kmem_cache->memcg_params.memcg_caches array */
> int kmemcg_id;
> enum memcg_kmem_state kmem_state;
> - struct mem_cgroup_ptr __rcu *kmem_memcg_ptr;
> - struct list_head kmem_memcg_ptr_list;
> + struct obj_cgroup __rcu *slab_objs;
> + struct list_head slab_objs_list;
> #endif
>
> int last_scanned_node;
> @@ -474,21 +473,6 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
> return css ? container_of(css, struct mem_cgroup, css) : NULL;
> }
>
> -static inline bool mem_cgroup_ptr_tryget(struct mem_cgroup_ptr *ptr)
> -{
> - return percpu_ref_tryget(&ptr->refcnt);
> -}
> -
> -static inline void mem_cgroup_ptr_get(struct mem_cgroup_ptr *ptr)
> -{
> - percpu_ref_get(&ptr->refcnt);
> -}
> -
> -static inline void mem_cgroup_ptr_put(struct mem_cgroup_ptr *ptr)
> -{
> - percpu_ref_put(&ptr->refcnt);
> -}
> -
> static inline void mem_cgroup_put(struct mem_cgroup *memcg)
> {
> if (memcg)
> @@ -1444,9 +1428,9 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
> struct mem_cgroup *memcg);
> void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
> unsigned int nr_pages);
> -int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
> +int obj_cgroup_charge(struct obj_cgroup *objcg, size_t size,
> gfp_t gfp);
> -void __memcg_kmem_uncharge_subpage(struct mem_cgroup *memcg, size_t size);
> +void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
>
> extern struct static_key_false memcg_kmem_enabled_key;
> extern struct workqueue_struct *memcg_kmem_cache_wq;
> @@ -1513,20 +1497,20 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
> return memcg ? memcg->kmemcg_id : -1;
> }
>
> -static inline struct mem_cgroup_ptr *
> -mem_cgroup_get_kmem_ptr(struct mem_cgroup *memcg)
> +static inline struct obj_cgroup *
> +mem_cgroup_get_slab_objs(struct mem_cgroup *memcg)
> {
> - struct mem_cgroup_ptr *memcg_ptr;
> + struct obj_cgroup *objcg;
>
> rcu_read_lock();
> do {
> - memcg_ptr = rcu_dereference(memcg->kmem_memcg_ptr);
> - if (memcg_ptr && mem_cgroup_ptr_tryget(memcg_ptr))
> + objcg = rcu_dereference(memcg->slab_objs);
> + if (objcg && percpu_ref_tryget(&objcg->count))
> break;
> } while ((memcg = parent_mem_cgroup(memcg)));
> rcu_read_unlock();
>
> - return memcg_ptr;
> + return objcg;
> }
>
> #else
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 4d99ee5a9c53..7f663fd53c17 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -200,7 +200,7 @@ struct page {
> #ifdef CONFIG_MEMCG
> union {
> struct mem_cgroup *mem_cgroup;
> - struct mem_cgroup_ptr **mem_cgroup_vec;
> + struct obj_cgroup **obj_cgroups;
> };
> #endif
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index b0d0c833150c..47110f4f0f4c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -260,52 +260,51 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
> #ifdef CONFIG_MEMCG_KMEM
> extern spinlock_t css_set_lock;
>
> -static void memcg_ptr_release(struct percpu_ref *ref)
> +static void obj_cgroup_release(struct percpu_ref *ref)
> {
> - struct mem_cgroup_ptr *ptr = container_of(ref, struct mem_cgroup_ptr,
> - refcnt);
> + struct obj_cgroup *objcg;
> unsigned long flags;
>
> + objcg = container_of(ref, struct obj_cgroup, count);
> +
> spin_lock_irqsave(&css_set_lock, flags);
> - list_del(&ptr->list);
> + list_del(&objcg->list);
> spin_unlock_irqrestore(&css_set_lock, flags);
>
> - mem_cgroup_put(ptr->memcg);
> + mem_cgroup_put(objcg->memcg);
> percpu_ref_exit(ref);
> - kfree_rcu(ptr, rcu);
> + kfree_rcu(objcg, rcu);
> }

Btw, the part below is looking good, thanks for the idea!

>
> -static int memcg_init_kmem_memcg_ptr(struct mem_cgroup *memcg)
> +static int obj_cgroup_alloc(struct mem_cgroup *memcg)
> {
> - struct mem_cgroup_ptr *kmem_memcg_ptr;
> + struct obj_cgroup *objcg;
> int ret;
>
> - kmem_memcg_ptr = kmalloc(sizeof(struct mem_cgroup_ptr), GFP_KERNEL);
> - if (!kmem_memcg_ptr)
> + objcg = kmalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
> + if (!objcg)
> return -ENOMEM;
>
> - ret = percpu_ref_init(&kmem_memcg_ptr->refcnt, memcg_ptr_release,
> + ret = percpu_ref_init(&objcg->count, obj_cgroup_release,
> 0, GFP_KERNEL);
> if (ret) {
> - kfree(kmem_memcg_ptr);
> + kfree(objcg);
> return ret;
> }
>
> - kmem_memcg_ptr->memcg = memcg;
> - INIT_LIST_HEAD(&kmem_memcg_ptr->list);
> - rcu_assign_pointer(memcg->kmem_memcg_ptr, kmem_memcg_ptr);
> - list_add(&kmem_memcg_ptr->list, &memcg->kmem_memcg_ptr_list);
> + INIT_LIST_HEAD(&objcg->list);
> + objcg->memcg = memcg;
> return 0;
> }
>

< cut >

> @@ -3117,15 +3095,24 @@ void __memcg_kmem_uncharge(struct page *page, int order)
> css_put_many(&memcg->css, nr_pages);
> }
>
> -int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
> - gfp_t gfp)
> +int obj_cgroup_charge(struct obj_cgroup *objcg, size_t size, gfp_t gfp)
> {
> - return try_charge(memcg, gfp, size, true);
> + int ret;
> +
> + if (consume_obj_stock(objcg, nr_bytes))
> + return 0;
> +
> + ret = try_charge(objcg->memcg, gfp, 1);
> + if (ret)
> + return ret;

So, the first problem is here: try_charge() will bump the memcg reference.
It works for generic pages, where we set the mem_cgroup pointer, so that
the reference corresponds to the actual physical page.
But in this case, the actual page is shared between multiple cgroups.
Memcg will be basically referenced once for each stock cache miss.
I struggle to understand what does it mean and how to balance it
with unreferencing on the free path.

The second problem is also here. If a task belonging to a different memcg
is scheduled on this cpu, most likely we will need to refill both stocks,
even if we need only a small temporarily allocation.

> +
> + refill_obj_stock(objcg, PAGE_SIZE - size);

And the third problem is here. Percpu allocations (on which accounting I'm
working right now) can be larger than a page.

This is fairly small issue in comparison to the first one. But it illustrates
well the main point: we can't simple get a page from the existing API and
sublease it in parts. The problem is that we need to break the main principle
that a page belongs to a single memcg.

Please, let me know what do you think about it.

Thank you!

Roman

2019-10-31 14:24:57

by Johannes Weiner

[permalink] [raw]

Subject: Re: [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

On Thu, Oct 31, 2019 at 01:52:44AM +0000, Roman Gushchin wrote:
> On Fri, Oct 25, 2019 at 03:41:18PM -0400, Johannes Weiner wrote:
> > On Thu, Oct 17, 2019 at 05:28:13PM -0700, Roman Gushchin wrote:
> > > +static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
> > > + struct mem_cgroup **memcgp,
> > > + size_t size, gfp_t flags)
> > > +{
> > > + struct kmem_cache *cachep;
> > > +
> > > + cachep = memcg_kmem_get_cache(s, memcgp);
> > > + if (is_root_cache(cachep))
> > > + return s;
> > > +
> > > + if (__memcg_kmem_charge_subpage(*memcgp, size * s->size, flags)) {
> > > + mem_cgroup_put(*memcgp);
> > > + memcg_kmem_put_cache(cachep);
> > > + cachep = NULL;
> > > + }
> > > +
> > > + return cachep;
> > > +}
> > > +
> > > static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> > > struct mem_cgroup *memcg,
> > > size_t size, void **p)
> > > {
> > > struct mem_cgroup_ptr *memcg_ptr;
> > > + struct lruvec *lruvec;
> > > struct page *page;
> > > unsigned long off;
> > > size_t i;
> > > @@ -439,6 +393,11 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> > > off = obj_to_index(s, page, p[i]);
> > > mem_cgroup_ptr_get(memcg_ptr);
> > > page->mem_cgroup_vec[off] = memcg_ptr;
> > > + lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
> > > + mod_lruvec_memcg_state(lruvec, cache_vmstat_idx(s),
> > > + s->size);
> > > + } else {
> > > + __memcg_kmem_uncharge_subpage(memcg, s->size);
> > > }
> > > }
> > > mem_cgroup_ptr_put(memcg_ptr);
> >
> > The memcg_ptr as a collection vessel for object references makes a lot
> > of sense. But this code showcases that it should be a first-class
> > memory tracking API that the allocator interacts with, rather than
> > having to deal with a combination of memcg_ptr and memcg.
> >
> > In the two hunks here, on one hand we charge bytes to the memcg
> > object, and then handle all the refcounting through a different
> > bucketing object. To support that in the first place, we have to
> > overload the memcg API all the way down to try_charge() to support
> > bytes and pages. This is difficult to follow throughout all layers.
> >
> > What would be better is for this to be an abstraction layer for a
> > subpage object tracker that sits on top of the memcg page tracker -
> > not unlike the page allocator and the slab allocators themselves.
> >
> > And then the slab allocator would only interact with the subpage
> > object tracker, and the object tracker would deal with the memcg page
> > tracker under the hood.
>
> Hello, Johannes!
>
> After some thoughts and coding I don't think it's a right direction to go.
> Under direction I mean building the new subpage/obj_cgroup API on top
> of the intact existing memcg API.
> Let me illustrate my point on the code you provided (please, look below).
>
> >
> > Something like the below, just a rough sketch to convey the idea:
> >
> > On allocation, we look up an obj_cgroup from the current task, charge
> > it with obj_cgroup_charge() and bump its refcount by the number of
> > objects we allocated. obj_cgroup_charge() in turn uses the memcg layer
> > to charge pages, and then doles them out as bytes using
> > objcg->nr_stocked_bytes and the byte per-cpu stock.
> >
> > On freeing, we call obj_cgroup_uncharge() and drop the references. In
> > turn, this first refills the byte per-cpu stock and drains the
> > overflow to the memcg layer (which in turn refills its own page stock
> > and drains to the page_counter).
> >
> > On cgroup deletion, we reassign the obj_cgroup to the parent, and all
> > remaining live objects will free their pages to the updated parental
> > obj_cgroup->memcg, as per usual.
> >
> > ---
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index f36203cf75f8..efc3398aaf02 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -200,15 +200,15 @@ struct memcg_cgwb_frn {
> > };
> >
> > /*
> > - * A pointer to a memory cgroup with a built-in reference counter.
> > - * For a use as an intermediate object to simplify reparenting of
> > - * objects charged to the cgroup. The memcg pointer can be switched
> > - * to the parent cgroup without a need to modify all objects
> > - * which hold the reference to the cgroup.
> > + * Bucket for arbitrarily byte-sized objects charged to a memory
> > + * cgroup. The bucket can be reparented in one piece when the cgroup
> > + * is destroyed, without having to round up the individual references
> > + * of all live memory objects in the wild.
> > */
> > -struct mem_cgroup_ptr {
> > - struct percpu_ref refcnt;
> > +struct obj_cgroup {
> > + struct percpu_ref count;
> > struct mem_cgroup *memcg;
> > + unsigned long nr_stocked_bytes;
> > union {
> > struct list_head list;
> > struct rcu_head rcu;
> > @@ -230,7 +230,6 @@ struct mem_cgroup {
> > /* Accounted resources */
> > struct page_counter memory;
> > struct page_counter swap;
> > - atomic_t nr_stocked_bytes;
> >
> > /* Legacy consumer-oriented counters */
> > struct page_counter memsw;
> > @@ -327,8 +326,8 @@ struct mem_cgroup {
> > /* Index in the kmem_cache->memcg_params.memcg_caches array */
> > int kmemcg_id;
> > enum memcg_kmem_state kmem_state;
> > - struct mem_cgroup_ptr __rcu *kmem_memcg_ptr;
> > - struct list_head kmem_memcg_ptr_list;
> > + struct obj_cgroup __rcu *slab_objs;
> > + struct list_head slab_objs_list;
> > #endif
> >
> > int last_scanned_node;
> > @@ -474,21 +473,6 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){
> > return css ? container_of(css, struct mem_cgroup, css) : NULL;
> > }
> >
> > -static inline bool mem_cgroup_ptr_tryget(struct mem_cgroup_ptr *ptr)
> > -{
> > - return percpu_ref_tryget(&ptr->refcnt);
> > -}
> > -
> > -static inline void mem_cgroup_ptr_get(struct mem_cgroup_ptr *ptr)
> > -{
> > - percpu_ref_get(&ptr->refcnt);
> > -}
> > -
> > -static inline void mem_cgroup_ptr_put(struct mem_cgroup_ptr *ptr)
> > -{
> > - percpu_ref_put(&ptr->refcnt);
> > -}
> > -
> > static inline void mem_cgroup_put(struct mem_cgroup *memcg)
> > {
> > if (memcg)
> > @@ -1444,9 +1428,9 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
> > struct mem_cgroup *memcg);
> > void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
> > unsigned int nr_pages);
> > -int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
> > +int obj_cgroup_charge(struct obj_cgroup *objcg, size_t size,
> > gfp_t gfp);
> > -void __memcg_kmem_uncharge_subpage(struct mem_cgroup *memcg, size_t size);
> > +void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
> >
> > extern struct static_key_false memcg_kmem_enabled_key;
> > extern struct workqueue_struct *memcg_kmem_cache_wq;
> > @@ -1513,20 +1497,20 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
> > return memcg ? memcg->kmemcg_id : -1;
> > }
> >
> > -static inline struct mem_cgroup_ptr *
> > -mem_cgroup_get_kmem_ptr(struct mem_cgroup *memcg)
> > +static inline struct obj_cgroup *
> > +mem_cgroup_get_slab_objs(struct mem_cgroup *memcg)
> > {
> > - struct mem_cgroup_ptr *memcg_ptr;
> > + struct obj_cgroup *objcg;
> >
> > rcu_read_lock();
> > do {
> > - memcg_ptr = rcu_dereference(memcg->kmem_memcg_ptr);
> > - if (memcg_ptr && mem_cgroup_ptr_tryget(memcg_ptr))
> > + objcg = rcu_dereference(memcg->slab_objs);
> > + if (objcg && percpu_ref_tryget(&objcg->count))
> > break;
> > } while ((memcg = parent_mem_cgroup(memcg)));
> > rcu_read_unlock();
> >
> > - return memcg_ptr;
> > + return objcg;
> > }
> >
> > #else
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 4d99ee5a9c53..7f663fd53c17 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -200,7 +200,7 @@ struct page {
> > #ifdef CONFIG_MEMCG
> > union {
> > struct mem_cgroup *mem_cgroup;
> > - struct mem_cgroup_ptr **mem_cgroup_vec;
> > + struct obj_cgroup **obj_cgroups;
> > };
> > #endif
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index b0d0c833150c..47110f4f0f4c 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -260,52 +260,51 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
> > #ifdef CONFIG_MEMCG_KMEM
> > extern spinlock_t css_set_lock;
> >
> > -static void memcg_ptr_release(struct percpu_ref *ref)
> > +static void obj_cgroup_release(struct percpu_ref *ref)
> > {
> > - struct mem_cgroup_ptr *ptr = container_of(ref, struct mem_cgroup_ptr,
> > - refcnt);
> > + struct obj_cgroup *objcg;
> > unsigned long flags;
> >
> > + objcg = container_of(ref, struct obj_cgroup, count);
> > +
> > spin_lock_irqsave(&css_set_lock, flags);
> > - list_del(&ptr->list);
> > + list_del(&objcg->list);
> > spin_unlock_irqrestore(&css_set_lock, flags);
> >
> > - mem_cgroup_put(ptr->memcg);
> > + mem_cgroup_put(objcg->memcg);
> > percpu_ref_exit(ref);
> > - kfree_rcu(ptr, rcu);
> > + kfree_rcu(objcg, rcu);
> > }
>
> Btw, the part below is looking good, thanks for the idea!
>
> >
> > -static int memcg_init_kmem_memcg_ptr(struct mem_cgroup *memcg)
> > +static int obj_cgroup_alloc(struct mem_cgroup *memcg)
> > {
> > - struct mem_cgroup_ptr *kmem_memcg_ptr;
> > + struct obj_cgroup *objcg;
> > int ret;
> >
> > - kmem_memcg_ptr = kmalloc(sizeof(struct mem_cgroup_ptr), GFP_KERNEL);
> > - if (!kmem_memcg_ptr)
> > + objcg = kmalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
> > + if (!objcg)
> > return -ENOMEM;
> >
> > - ret = percpu_ref_init(&kmem_memcg_ptr->refcnt, memcg_ptr_release,
> > + ret = percpu_ref_init(&objcg->count, obj_cgroup_release,
> > 0, GFP_KERNEL);
> > if (ret) {
> > - kfree(kmem_memcg_ptr);
> > + kfree(objcg);
> > return ret;
> > }
> >
> > - kmem_memcg_ptr->memcg = memcg;
> > - INIT_LIST_HEAD(&kmem_memcg_ptr->list);
> > - rcu_assign_pointer(memcg->kmem_memcg_ptr, kmem_memcg_ptr);
> > - list_add(&kmem_memcg_ptr->list, &memcg->kmem_memcg_ptr_list);
> > + INIT_LIST_HEAD(&objcg->list);
> > + objcg->memcg = memcg;
> > return 0;
> > }
> >
>
> < cut >
>
> > @@ -3117,15 +3095,24 @@ void __memcg_kmem_uncharge(struct page *page, int order)
> > css_put_many(&memcg->css, nr_pages);
> > }
> >
> > -int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
> > - gfp_t gfp)
> > +int obj_cgroup_charge(struct obj_cgroup *objcg, size_t size, gfp_t gfp)
> > {
> > - return try_charge(memcg, gfp, size, true);
> > + int ret;
> > +
> > + if (consume_obj_stock(objcg, nr_bytes))
> > + return 0;
> > +
> > + ret = try_charge(objcg->memcg, gfp, 1);
> > + if (ret)
> > + return ret;
>
> So, the first problem is here: try_charge() will bump the memcg reference.
> It works for generic pages, where we set the mem_cgroup pointer, so that
> the reference corresponds to the actual physical page.
> But in this case, the actual page is shared between multiple cgroups.
> Memcg will be basically referenced once for each stock cache miss.
> I struggle to understand what does it mean and how to balance it
> with unreferencing on the free path.

The charge functions make refcounting assumptions they shouldn't
make. That's a sign that we need to fix the refcounting, not work
around it in the upper layers.

We need the following patch for the new slab controller. It's an
overdue cleanup, too.

---
From 10bfa8e3111c65a2fdd2156547fe645c23847fa6 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <[email protected]>
Date: Thu, 31 Oct 2019 07:04:23 -0400
Subject: [PATCH] mm: memcontrol: decouple reference counting from page
accounting

The reference counting of a memcg is currently coupled directly to how
many 4k pages are charged to it. This doesn't work well with Roman's
new slab controller, which maintains pools of objects and doesn't want
to keep an extra balance sheet for the pages backing those objects.

This unusual refcounting design (reference counts usually track
pointers to an object) is only for historical reasons: memcg used to
not take any css references and simply stalled offlining until all
charges had been reparented and the page counters had dropped to
zero. When we got rid of the reparenting requirement, the simple
mechanical translation was to take a reference for every charge.

More historical context can be found in e8ea14cc6ead ("mm: memcontrol:
take a css reference for each charged page"), 64f219938941 ("mm:
memcontrol: remove obsolete kmemcg pinning tricks") and b2052564e66d
("mm: memcontrol: continue cache reclaim from offlined groups").

The new slab controller exposes the limitations in this scheme, so
let's switch it to a more idiomatic reference counting model based on
actual kernel pointers to the memcg:

- The per-cpu stock holds a reference to the memcg its caching

- User pages hold a reference for their page->mem_cgroup. Transparent
huge pages will no longer acquire tail references in advance, we'll
get them if needed during the split.

- Kernel pages hold a reference for their page->mem_cgroup

- mem_cgroup_try_charge(), if successful, will return one reference to
be consumed by page->mem_cgroup during commit, or put during cancel

- Pages allocated in the root cgroup will acquire and release css
references for simplicity. css_get() and css_put() optimize that.

- The current memcg_charge_slab() already hacked around the per-charge
references; this change gets rid of that as well.

Signed-off-by: Johannes Weiner <[email protected]>
---
mm/memcontrol.c | 45 ++++++++++++++++++++++++++-------------------
mm/slab.h | 2 --
2 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c7e3e758c165..b88c273d6dc0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2190,13 +2190,17 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;

+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
page_counter_uncharge(&old->memsw, stock->nr_pages);
- css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}

@@ -2232,6 +2236,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
@@ -2635,12 +2640,10 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);

return 0;

done_restock:
- css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);

@@ -2677,8 +2680,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
-
- css_put_many(&memcg->css, nr_pages);
}

static void lock_page_lru(struct page *page, int *isolated)
@@ -2989,6 +2990,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ return 0;
}
}
css_put(&memcg->css);
@@ -3026,12 +3028,11 @@ void __memcg_kmem_uncharge(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
__memcg_kmem_uncharge_memcg(memcg, nr_pages);
page->mem_cgroup = NULL;
+ css_put(&memcg->css);

/* slab pages do not have PageKmemcg flag set */
if (PageKmemcg(page))
__ClearPageKmemcg(page);
-
- css_put_many(&memcg->css, nr_pages);
}
#endif /* CONFIG_MEMCG_KMEM */

@@ -3043,15 +3044,18 @@ void __memcg_kmem_uncharge(struct page *page, int order)
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
+ struct mem_cgroup *memcg = head->mem_cgroup;
int i;

if (mem_cgroup_disabled())
return;

- for (i = 1; i < HPAGE_PMD_NR; i++)
- head[i].mem_cgroup = head->mem_cgroup;
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ css_get(&memcg->css);
+ head[i].mem_cgroup = memcg;
+ }

- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
+ __mod_memcg_state(memcg, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

@@ -5485,7 +5489,9 @@ static int mem_cgroup_move_account(struct page *page,
* uncharging, charging, migration, or LRU putback.
*/

- /* caller should have done css_get */
+ css_get(&to->css);
+ css_put(&from->css);
+
page->mem_cgroup = to;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -6514,8 +6520,10 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
memcg = get_mem_cgroup_from_mm(mm);

ret = try_charge(memcg, gfp_mask, nr_pages);
-
- css_put(&memcg->css);
+ if (ret) {
+ css_put(&memcg->css);
+ memcg = NULL;
+ }
out:
*memcgp = memcg;
return ret;
@@ -6611,6 +6619,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
return;

cancel_charge(memcg, nr_pages);
+
+ css_put(&memcg->css);
}

struct uncharge_gather {
@@ -6652,9 +6662,6 @@ static void uncharge_batch(const struct uncharge_gather *ug)
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
-
- if (!mem_cgroup_is_root(ug->memcg))
- css_put_many(&ug->memcg->css, nr_pages);
}

static void uncharge_page(struct page *page, struct uncharge_gather *ug)
@@ -6702,6 +6709,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)

ug->dummy_page = page;
page->mem_cgroup = NULL;
+ css_put(&ug->memcg->css);
}

static void uncharge_list(struct list_head *page_list)
@@ -6810,8 +6818,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);

+ css_get(&memcg->css);
commit_charge(newpage, memcg, false);

local_irq_save(flags);
@@ -7059,8 +7067,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
-nr_entries);
memcg_check_events(memcg, page);

- if (!mem_cgroup_is_root(memcg))
- css_put_many(&memcg->css, nr_entries);
+ css_put(&memcg->css);
}

/**
diff --git a/mm/slab.h b/mm/slab.h
index 2bbecf28688d..9f84298db2d7 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -372,9 +372,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order);

- /* transer try_charge() page references to kmem_cache */
percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
- css_put_many(&memcg->css, 1 << order);
out:
css_put(&memcg->css);
return ret;
--
2.23.0

2019-10-31 14:42:59

by Johannes Weiner

[permalink] [raw]

Subject: Re: [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

On Thu, Oct 31, 2019 at 01:52:44AM +0000, Roman Gushchin wrote:
> On Fri, Oct 25, 2019 at 03:41:18PM -0400, Johannes Weiner wrote:
> > @@ -3117,15 +3095,24 @@ void __memcg_kmem_uncharge(struct page *page, int order)
> > css_put_many(&memcg->css, nr_pages);
> > }
> >
> > -int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
> > - gfp_t gfp)
> > +int obj_cgroup_charge(struct obj_cgroup *objcg, size_t size, gfp_t gfp)
> > {
> > - return try_charge(memcg, gfp, size, true);
> > + int ret;
> > +
> > + if (consume_obj_stock(objcg, nr_bytes))
> > + return 0;
> > +
> > + ret = try_charge(objcg->memcg, gfp, 1);
> > + if (ret)
> > + return ret;

> The second problem is also here. If a task belonging to a different memcg
> is scheduled on this cpu, most likely we will need to refill both stocks,
> even if we need only a small temporarily allocation.

Yes, that's a good thing. The reason we have the per-cpu caches in the
first place is because most likely the same cgroup will perform
several allocations. Both the slab allocator and the page allocator
have per-cpu caches for the same reason. I don't really understand
what the argument is.

> > +
> > + refill_obj_stock(objcg, PAGE_SIZE - size);
>
> And the third problem is here. Percpu allocations (on which accounting I'm
> working right now) can be larger than a page.

How about this?

nr_pages = round_up(size, PAGE_SIZE);
try_charge(objcg->memcg, nr_pages);
refill_obj_stock(objcg, size % PAGE_SIZE);

> This is fairly small issue in comparison to the first one. But it illustrates
> well the main point: we can't simple get a page from the existing API and
> sublease it in parts. The problem is that we need to break the main principle
> that a page belongs to a single memcg.

We can change the underlying assumptions of the existing API if they
are no longer correct. We don't have to invent a parallel stack.

2019-10-31 15:09:32

by Roman Gushchin

[permalink] [raw]

Subject: Re: [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

On Thu, Oct 31, 2019 at 10:41:51AM -0400, Johannes Weiner wrote:
> On Thu, Oct 31, 2019 at 01:52:44AM +0000, Roman Gushchin wrote:
> > On Fri, Oct 25, 2019 at 03:41:18PM -0400, Johannes Weiner wrote:
> > > @@ -3117,15 +3095,24 @@ void __memcg_kmem_uncharge(struct page *page, int order)
> > > css_put_many(&memcg->css, nr_pages);
> > > }
> > >
> > > -int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
> > > - gfp_t gfp)
> > > +int obj_cgroup_charge(struct obj_cgroup *objcg, size_t size, gfp_t gfp)
> > > {
> > > - return try_charge(memcg, gfp, size, true);
> > > + int ret;
> > > +
> > > + if (consume_obj_stock(objcg, nr_bytes))
> > > + return 0;
> > > +
> > > + ret = try_charge(objcg->memcg, gfp, 1);
> > > + if (ret)
> > > + return ret;
>
> > The second problem is also here. If a task belonging to a different memcg
> > is scheduled on this cpu, most likely we will need to refill both stocks,
> > even if we need only a small temporarily allocation.
>
> Yes, that's a good thing. The reason we have the per-cpu caches in the
> first place is because most likely the same cgroup will perform
> several allocations. Both the slab allocator and the page allocator
> have per-cpu caches for the same reason. I don't really understand
> what the argument is.

I mean it seems strange (and most likely will show up in perf numbers)
to move a page from one stock to another. Is there a reason why do you want
to ask try_charge() and stock only a single page?

Can we do the following instead?

1) add a boolean argument to try_charge() to bypass the consume_stock() call
at the beginning and just go slow path immediately
2) use try_charge() with this argument set to true to fill the objc/subpage
stock with MEMCG_CHARGE_BATCH pages

In this case we'll reuse try_charge() and will have all corresponding benefits,
but will avoid the double stocking, which seems as a strange idea to me.

>
> > > +
> > > + refill_obj_stock(objcg, PAGE_SIZE - size);
> >
> > And the third problem is here. Percpu allocations (on which accounting I'm
> > working right now) can be larger than a page.
>
> How about this?
>
> nr_pages = round_up(size, PAGE_SIZE);
> try_charge(objcg->memcg, nr_pages);
> refill_obj_stock(objcg, size % PAGE_SIZE);

Ok, this will work.

>
> > This is fairly small issue in comparison to the first one. But it illustrates
> > well the main point: we can't simple get a page from the existing API and
> > sublease it in parts. The problem is that we need to break the main principle
> > that a page belongs to a single memcg.
>
> We can change the underlying assumptions of the existing API if they
> are no longer correct. We don't have to invent a parallel stack.

Ok, this makes sense. And thank you for the patch, I'll take it into the set.

Thanks!

2019-10-31 18:51:52

by Johannes Weiner

[permalink] [raw]

Subject: Re: [PATCH 09/16] mm: memcg/slab: charge individual slab objects instead of pages

On Thu, Oct 31, 2019 at 03:07:02PM +0000, Roman Gushchin wrote:
> On Thu, Oct 31, 2019 at 10:41:51AM -0400, Johannes Weiner wrote:
> > On Thu, Oct 31, 2019 at 01:52:44AM +0000, Roman Gushchin wrote:
> > > On Fri, Oct 25, 2019 at 03:41:18PM -0400, Johannes Weiner wrote:
> > > > @@ -3117,15 +3095,24 @@ void __memcg_kmem_uncharge(struct page *page, int order)
> > > > css_put_many(&memcg->css, nr_pages);
> > > > }
> > > >
> > > > -int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
> > > > - gfp_t gfp)
> > > > +int obj_cgroup_charge(struct obj_cgroup *objcg, size_t size, gfp_t gfp)
> > > > {
> > > > - return try_charge(memcg, gfp, size, true);
> > > > + int ret;
> > > > +
> > > > + if (consume_obj_stock(objcg, nr_bytes))
> > > > + return 0;
> > > > +
> > > > + ret = try_charge(objcg->memcg, gfp, 1);
> > > > + if (ret)
> > > > + return ret;
> >
> > > The second problem is also here. If a task belonging to a different memcg
> > > is scheduled on this cpu, most likely we will need to refill both stocks,
> > > even if we need only a small temporarily allocation.
> >
> > Yes, that's a good thing. The reason we have the per-cpu caches in the
> > first place is because most likely the same cgroup will perform
> > several allocations. Both the slab allocator and the page allocator
> > have per-cpu caches for the same reason. I don't really understand
> > what the argument is.
>
> I mean it seems strange (and most likely will show up in perf numbers)
> to move a page from one stock to another. Is there a reason why do you want
> to ask try_charge() and stock only a single page?
>
> Can we do the following instead?
>
> 1) add a boolean argument to try_charge() to bypass the consume_stock() call
> at the beginning and just go slow path immediately
> 2) use try_charge() with this argument set to true to fill the objc/subpage
> stock with MEMCG_CHARGE_BATCH pages

No, think this through.

If you have disjunct caches for the page_counter, it means the cache
work cannot be shared. A slab allocation has to hit the page_counter,
and a subsequent page allocation has to hit it again; likewise, a slab
allocation cannot benefit from the caching of prior page allocations.

You're trading cheap, unlocked, cpu-local subtractions against costly
atomic RMW ops on shared cachelines. You also double the amount of
cached per-cpu memory and introduce a layering violation.

Hotpath (bytes cached)

stacked: disjunct:
consume_subpage_stock() try_charge()
consume_subpage_stock()

Warmpath (pages cached)

stacked: disjunct:
consume_subpage_stock() try_charge()
try_charge() consume_subpage_stock()
consume_stock() page_counter_charge()
refill_subpage_stock() refill_subpage_stock()

Coldpath (nothing cached)

stacked: disjunct
consume_subpage_stock() try_charge()
try_charge() consume_subpage_stock()
consume_stock() page_counter_charge()
page_counter_charge() refill_subpage_stock()
refill_stock()
refill_subpage_stock()