2019-09-06 08:22:16

by Roman Gushchin

Subject: [PATCH RFC 01/14] mm: memcg: subpage charging API

Introduce an API to charge subpage objects to the memory cgroup.
The API will be used by the new slab memory controller. Later it
can also be used to implement percpu memory accounting.
In both cases, a single page can be shared between multiple cgroups
(and in percpu case a single allocation is split over multiple pages),
so it's not possible to use page-based accounting.

The implementation is based on percpu stocks. Memory cgroups are still
charged in pages, and the residue is stored in percpu stock, or on the
memcg itself, when it's necessary to flush the stock.

Please note that, unlike the generic page charging API, a subpage
object does not hold a reference to the memory cgroup: a more
complicated indirection scheme is required to implement cheap
reparenting. The percpu stock does hold a single reference to the
cached cgroup, though.

Signed-off-by: Roman Gushchin <[email protected]>
---
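
A usage sketch, for illustration only: the helpers below are made up,
error handling is simplified, and the real users are the slab
controller patches later in the series.

/*
 * Illustration: charge a sub-page object of @size bytes to @memcg
 * before the allocation, and drop the charge when it's freed.
 */
static void *alloc_accounted(struct mem_cgroup *memcg, size_t size, gfp_t gfp)
{
	void *obj;

	if (memcg_kmem_enabled() &&
	    __memcg_kmem_charge_subpage(memcg, size, gfp))
		return NULL;

	obj = kmalloc(size, gfp);
	if (!obj && memcg_kmem_enabled())
		__memcg_kmem_uncharge_subpage(memcg, size);

	return obj;
}

static void free_accounted(struct mem_cgroup *memcg, void *obj, size_t size)
{
	kfree(obj);
	if (memcg_kmem_enabled())
		__memcg_kmem_uncharge_subpage(memcg, size);
}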
include/linux/memcontrol.h | 4 ++
mm/memcontrol.c | 129 +++++++++++++++++++++++++++++++++----
2 files changed, 119 insertions(+), 14 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c762e8ca6a6..120d39066148 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -214,6 +214,7 @@ struct mem_cgroup {
/* Accounted resources */
struct page_counter memory;
struct page_counter swap;
+ atomic_t nr_stocked_bytes;

/* Legacy consumer-oriented counters */
struct page_counter memsw;
@@ -1370,6 +1371,9 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg);
void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
unsigned int nr_pages);
+int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
+ gfp_t gfp);
+void __memcg_kmem_uncharge_subpage(struct mem_cgroup *memcg, size_t size);

extern struct static_key_false memcg_kmem_enabled_key;
extern struct workqueue_struct *memcg_kmem_cache_wq;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1c4c08b45e44..effefcec47b3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2149,6 +2149,10 @@ EXPORT_SYMBOL(unlock_page_memcg);
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
+
+ struct mem_cgroup *subpage_cached;
+ unsigned int nr_bytes;
+
struct work_struct work;
unsigned long flags;
#define FLUSHING_CACHED_CHARGE 0
@@ -2189,6 +2193,29 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
return ret;
}

+static bool consume_subpage_stock(struct mem_cgroup *memcg,
+ unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+ bool ret = false;
+
+ if (nr_bytes > (MEMCG_CHARGE_BATCH << PAGE_SHIFT))
+ return ret;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (memcg == stock->subpage_cached && stock->nr_bytes >= nr_bytes) {
+ stock->nr_bytes -= nr_bytes;
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
/*
* Returns stocks cached in percpu and reset cached information.
*/
@@ -2206,6 +2233,27 @@ static void drain_stock(struct memcg_stock_pcp *stock)
stock->cached = NULL;
}

+static void drain_subpage_stock(struct memcg_stock_pcp *stock)
+{
+ struct mem_cgroup *old = stock->subpage_cached;
+
+ if (stock->nr_bytes) {
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+ page_counter_uncharge(&old->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&old->memsw, nr_pages);
+
+ atomic_add(nr_bytes, &old->nr_stocked_bytes);
+ stock->nr_bytes = 0;
+ }
+ if (stock->subpage_cached) {
+ css_put(&old->css);
+ stock->subpage_cached = NULL;
+ }
+}
+
static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
@@ -2218,8 +2266,11 @@ static void drain_local_stock(struct work_struct *dummy)
local_irq_save(flags);

stock = this_cpu_ptr(&memcg_stock);
- drain_stock(stock);
- clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+ if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ drain_stock(stock);
+ drain_subpage_stock(stock);
+ clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
+ }

local_irq_restore(flags);
}
@@ -2248,6 +2299,29 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
local_irq_restore(flags);
}

+static void refill_subpage_stock(struct mem_cgroup *memcg,
+ unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (stock->subpage_cached != memcg) { /* reset if necessary */
+ drain_subpage_stock(stock);
+ css_get(&memcg->css);
+ stock->subpage_cached = memcg;
+ stock->nr_bytes = atomic_xchg(&memcg->nr_stocked_bytes, 0);
+ }
+ stock->nr_bytes += nr_bytes;
+
+ if (stock->nr_bytes > (MEMCG_CHARGE_BATCH << PAGE_SHIFT))
+ drain_subpage_stock(stock);
+
+ local_irq_restore(flags);
+}
+
/*
* Drains all per-CPU charge caches for given root_memcg resp. subtree
* of the hierarchy under it.
@@ -2276,6 +2350,9 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
+ memcg = stock->subpage_cached;
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
rcu_read_unlock();

if (flush &&
@@ -2500,8 +2577,9 @@ void mem_cgroup_handle_over_high(void)
}

static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
- unsigned int nr_pages)
+ unsigned int amount, bool subpage)
{
+ unsigned int nr_pages = subpage ? ((amount >> PAGE_SHIFT) + 1) : amount;
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
@@ -2514,7 +2592,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
if (mem_cgroup_is_root(memcg))
return 0;
retry:
- if (consume_stock(memcg, nr_pages))
+ if (subpage && consume_subpage_stock(memcg, amount))
+ return 0;
+ else if (!subpage && consume_stock(memcg, nr_pages))
return 0;

if (!do_memsw_account() ||
@@ -2632,14 +2712,22 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
+
+ if (subpage)
+ refill_subpage_stock(memcg, (nr_pages << PAGE_SHIFT) - amount);
+ else
+ css_get_many(&memcg->css, nr_pages);

return 0;

done_restock:
- css_get_many(&memcg->css, batch);
- if (batch > nr_pages)
- refill_stock(memcg, batch - nr_pages);
+ if (subpage && (batch << PAGE_SHIFT) > amount) {
+ refill_subpage_stock(memcg, (batch << PAGE_SHIFT) - amount);
+ } else if (!subpage) {
+ css_get_many(&memcg->css, batch);
+ if (batch > nr_pages)
+ refill_stock(memcg, batch - nr_pages);
+ }

/*
* If the hierarchy is above the normal consumption range, schedule
@@ -2942,7 +3030,7 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct page_counter *counter;
int ret;

- ret = try_charge(memcg, gfp, nr_pages);
+ ret = try_charge(memcg, gfp, nr_pages, false);
if (ret)
return ret;

@@ -3020,6 +3108,18 @@ void __memcg_kmem_uncharge(struct page *page, int order)

css_put_many(&memcg->css, nr_pages);
}
+
+int __memcg_kmem_charge_subpage(struct mem_cgroup *memcg, size_t size,
+ gfp_t gfp)
+{
+ return try_charge(memcg, gfp, size, true);
+}
+
+void __memcg_kmem_uncharge_subpage(struct mem_cgroup *memcg, size_t size)
+{
+ refill_subpage_stock(memcg, size);
+}
+
#endif /* CONFIG_MEMCG_KMEM */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -5267,7 +5367,8 @@ static int mem_cgroup_do_precharge(unsigned long count)
int ret;

/* Try a single bulk charge without reclaim first, kswapd may wake */
- ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
+ ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count,
+ false);
if (!ret) {
mc.precharge += count;
return ret;
@@ -5275,7 +5376,7 @@ static int mem_cgroup_do_precharge(unsigned long count)

/* Try charges one by one with reclaim, but do not retry */
while (count--) {
- ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
+ ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1, false);
if (ret)
return ret;
mc.precharge++;
@@ -6487,7 +6588,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);

- ret = try_charge(memcg, gfp_mask, nr_pages);
+ ret = try_charge(memcg, gfp_mask, nr_pages, false);

css_put(&memcg->css);
out:
@@ -6866,10 +6967,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)

mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);

- if (try_charge(memcg, gfp_mask, nr_pages) == 0)
+ if (try_charge(memcg, gfp_mask, nr_pages, false) == 0)
return true;

- try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
+ try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages, false);
return false;
}

--
2.21.0


2019-09-16 13:00:51

by Johannes Weiner

Subject: Re: [PATCH RFC 01/14] mm: memcg: subpage charging API

On Thu, Sep 05, 2019 at 02:45:45PM -0700, Roman Gushchin wrote:
> Introduce an API to charge subpage objects to the memory cgroup.
> The API will be used by the new slab memory controller. Later it
> can also be used to implement percpu memory accounting.
> In both cases, a single page can be shared between multiple cgroups
> (and in percpu case a single allocation is split over multiple pages),
> so it's not possible to use page-based accounting.
>
> The implementation is based on percpu stocks. Memory cgroups are still
> charged in pages, and the residue is stored in percpu stock, or on the
> memcg itself, when it's necessary to flush the stock.

Did you just implement a slab allocator for page_counter to track
memory consumed by the slab allocator?

> @@ -2500,8 +2577,9 @@ void mem_cgroup_handle_over_high(void)
> }
>
> static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> - unsigned int nr_pages)
> + unsigned int amount, bool subpage)
> {
> + unsigned int nr_pages = subpage ? ((amount >> PAGE_SHIFT) + 1) : amount;
> unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
> int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> struct mem_cgroup *mem_over_limit;
> @@ -2514,7 +2592,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> if (mem_cgroup_is_root(memcg))
> return 0;
> retry:
> - if (consume_stock(memcg, nr_pages))
> + if (subpage && consume_subpage_stock(memcg, amount))
> + return 0;
> + else if (!subpage && consume_stock(memcg, nr_pages))
> return 0;

The layering here isn't clean. We have an existing per-cpu cache to
batch-charge the page counter. Why does the new subpage allocator not
sit on *top* of this, instead of wedged in between?

I think what it should be is a try_charge_bytes() that simply gets one
page from try_charge() and then does its byte tracking, regardless of
how try_charge() chooses to implement its own page tracking.

That would avoid the awkward @amount + @subpage multiplexing, as well
as annotating all existing callsites of try_charge() with a
non-descript "false" parameter.

You can still reuse the stock data structures, use the lower bits of
stock->nr_bytes for a different cgroup etc., but the charge API should
really be separate.
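
To illustrate the layering I have in mind, a sketch only: it assumes
nr_bytes < PAGE_SIZE, reuses your per-cpu fields, and leaves out the
flushing of a previously cached memcg and all css/reparenting details.

/* Sketch: a byte-level charge API sitting on top of try_charge(). */
static int try_charge_bytes(struct mem_cgroup *memcg, gfp_t gfp,
			    unsigned int nr_bytes)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	bool cached = false;
	int ret;

	/* Fast path: serve the object from the per-cpu byte remainder. */
	local_irq_save(flags);
	stock = this_cpu_ptr(&memcg_stock);
	if (memcg == stock->subpage_cached && stock->nr_bytes >= nr_bytes) {
		stock->nr_bytes -= nr_bytes;
		cached = true;
	}
	local_irq_restore(flags);
	if (cached)
		return 0;

	/*
	 * Slow path: charge one whole page through the unmodified
	 * page-level path (which may reclaim, so it can't run with
	 * irqs disabled), then park the unused part of that page as
	 * the new byte remainder.
	 */
	ret = try_charge(memcg, gfp, 1);
	if (ret)
		return ret;

	local_irq_save(flags);
	stock = this_cpu_ptr(&memcg_stock);
	if (stock->subpage_cached != memcg) {
		/* Flushing the old remainder is omitted in this sketch. */
		stock->subpage_cached = memcg;
		stock->nr_bytes = 0;
	}
	stock->nr_bytes += PAGE_SIZE - nr_bytes;
	local_irq_restore(flags);

	return 0;
}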

2019-09-17 09:26:24

by Roman Gushchin

Subject: Re: [PATCH RFC 01/14] mm: memcg: subpage charging API

On Mon, Sep 16, 2019 at 02:56:11PM +0200, Johannes Weiner wrote:
> On Thu, Sep 05, 2019 at 02:45:45PM -0700, Roman Gushchin wrote:
> > Introduce an API to charge subpage objects to the memory cgroup.
> > The API will be used by the new slab memory controller. Later it
> > can also be used to implement percpu memory accounting.
> > In both cases, a single page can be shared between multiple cgroups
> > (and in percpu case a single allocation is split over multiple pages),
> > so it's not possible to use page-based accounting.
> >
> > The implementation is based on percpu stocks. Memory cgroups are still
> > charged in pages, and the residue is stored in percpu stock, or on the
> > memcg itself, when it's necessary to flush the stock.
>
> Did you just implement a slab allocator for page_counter to track
> memory consumed by the slab allocator?

:)

>
> > @@ -2500,8 +2577,9 @@ void mem_cgroup_handle_over_high(void)
> > }
> >
> > static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> > - unsigned int nr_pages)
> > + unsigned int amount, bool subpage)
> > {
> > + unsigned int nr_pages = subpage ? ((amount >> PAGE_SHIFT) + 1) : amount;
> > unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
> > int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> > struct mem_cgroup *mem_over_limit;
> > @@ -2514,7 +2592,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> > if (mem_cgroup_is_root(memcg))
> > return 0;
> > retry:
> > - if (consume_stock(memcg, nr_pages))
> > + if (subpage && consume_subpage_stock(memcg, amount))
> > + return 0;
> > + else if (!subpage && consume_stock(memcg, nr_pages))
> > return 0;
>
> The layering here isn't clean. We have an existing per-cpu cache to
> batch-charge the page counter. Why does the new subpage allocator not
> sit on *top* of this, instead of wedged in between?
>
> I think what it should be is a try_charge_bytes() that simply gets one
> page from try_charge() and then does its byte tracking, regardless of
> how try_charge() chooses to implement its own page tracking.
>
> That would avoid the awkward @amount + @subpage multiplexing, as well
> as annotating all existing callsites of try_charge() with a
> non-descript "false" parameter.
>
> You can still reuse the stock data structures, use the lower bits of
> stock->nr_bytes for a different cgroup etc., but the charge API should
> really be separate.

Hm, I kinda like the idea, however there is a complication: for the subpage
accounting the css reference management is done in a different way, so that
all existing code should avoid changing the css refcounter. So I'd need
to pass a boolean argument anyway.

But let me try to write this down, hopefully v2 will be cleaner.

Thank you!

2019-09-17 09:54:28

by Johannes Weiner

Subject: Re: [PATCH RFC 01/14] mm: memcg: subpage charging API

On Tue, Sep 17, 2019 at 02:27:19AM +0000, Roman Gushchin wrote:
> On Mon, Sep 16, 2019 at 02:56:11PM +0200, Johannes Weiner wrote:
> > On Thu, Sep 05, 2019 at 02:45:45PM -0700, Roman Gushchin wrote:
> > > Introduce an API to charge subpage objects to the memory cgroup.
> > > The API will be used by the new slab memory controller. Later it
> > > can also be used to implement percpu memory accounting.
> > > In both cases, a single page can be shared between multiple cgroups
> > > (and in percpu case a single allocation is split over multiple pages),
> > > so it's not possible to use page-based accounting.
> > >
> > > The implementation is based on percpu stocks. Memory cgroups are still
> > > charged in pages, and the residue is stored in percpu stock, or on the
> > > memcg itself, when it's necessary to flush the stock.
> >
> > Did you just implement a slab allocator for page_counter to track
> > memory consumed by the slab allocator?
>
> :)
>
> >
> > > @@ -2500,8 +2577,9 @@ void mem_cgroup_handle_over_high(void)
> > > }
> > >
> > > static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> > > - unsigned int nr_pages)
> > > + unsigned int amount, bool subpage)
> > > {
> > > + unsigned int nr_pages = subpage ? ((amount >> PAGE_SHIFT) + 1) : amount;
> > > unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
> > > int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> > > struct mem_cgroup *mem_over_limit;
> > > @@ -2514,7 +2592,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> > > if (mem_cgroup_is_root(memcg))
> > > return 0;
> > > retry:
> > > - if (consume_stock(memcg, nr_pages))
> > > + if (subpage && consume_subpage_stock(memcg, amount))
> > > + return 0;
> > > + else if (!subpage && consume_stock(memcg, nr_pages))
> > > return 0;
> >
> > The layering here isn't clean. We have an existing per-cpu cache to
> > batch-charge the page counter. Why does the new subpage allocator not
> > sit on *top* of this, instead of wedged in between?
> >
> > I think what it should be is a try_charge_bytes() that simply gets one
> > page from try_charge() and then does its byte tracking, regardless of
> > how try_charge() chooses to implement its own page tracking.
> >
> > That would avoid the awkward @amount + @subpage multiplexing, as well
> > as annotating all existing callsites of try_charge() with a
> > non-descript "false" parameter.
> >
> > You can still reuse the stock data structures, use the lower bits of
> > stock->nr_bytes for a different cgroup etc., but the charge API should
> > really be separate.
>
> Hm, I kinda like the idea, however there is a complication: for the subpage
> accounting the css reference management is done in a different way, so that
> all existing code should avoid changing the css refcounter. So I'd need
> to pass a boolean argument anyway.

Can you elaborate on the refcounting scheme? I don't quite understand
how there would be complications with that.

Generally, references are held for each page that is allocated in the
page_counter. try_charge() allocates a batch of css references,
returns one and keeps the rest in stock.

So couldn't the following work?

When somebody allocates a subpage, the css reference returned by
try_charge() is shared by the allocated subpage object and the
remainder that is kept via stock->subpage_cache and stock->nr_bytes
(or memcg->nr_stocked_bytes when the percpu cache is reset).

When the subpage objects are freed, you'll eventually have a full page
again in stock->nr_bytes, at which point you page_counter_uncharge()
paired with css_put(_many) as per usual.

A remainder left in old->nr_stocked_bytes would continue to hold on to
one css reference. (I don't quite understand who is protecting this
remainder in your current version, actually. A bug?)

Instead of doing your own batched page_counter uncharging in
refill_subpage_stock() -> drain_subpage_stock(), you should be able to
call refill_stock() when stock->nr_bytes adds up to a whole page again.

Again, IMO this would be much cleaner architecture if there was a
try_charge_bytes() byte allocator that would sit on top of a cleanly
abstracted try_charge() page allocator, just like the slab allocator
is sitting on top of the page allocator - instead of breaking through
the abstraction layer of the underlying page allocator.
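
For illustration, the uncharge side could then be something like this;
again only a sketch, assuming the same per-cpu byte remainder, and the
case where a different memcg is currently cached is left out.

/*
 * Sketch: give @nr_bytes back to the per-cpu remainder and return
 * whole pages through the regular page-level stock, which also takes
 * care of the css references held by those pages.
 */
static void uncharge_bytes(struct mem_cgroup *memcg, unsigned int nr_bytes)
{
	struct memcg_stock_pcp *stock;
	unsigned long flags;
	unsigned int nr_pages = 0;

	local_irq_save(flags);
	stock = this_cpu_ptr(&memcg_stock);
	if (stock->subpage_cached == memcg) {
		stock->nr_bytes += nr_bytes;
		/* Whole pages go back via the page-level API. */
		nr_pages = stock->nr_bytes >> PAGE_SHIFT;
		stock->nr_bytes &= PAGE_SIZE - 1;
	}
	local_irq_restore(flags);

	if (nr_pages)
		refill_stock(memcg, nr_pages);
}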

2019-09-17 19:49:21

by Roman Gushchin

Subject: Re: [PATCH RFC 01/14] mm: memcg: subpage charging API

On Tue, Sep 17, 2019 at 10:50:04AM +0200, Johannes Weiner wrote:
> On Tue, Sep 17, 2019 at 02:27:19AM +0000, Roman Gushchin wrote:
> > On Mon, Sep 16, 2019 at 02:56:11PM +0200, Johannes Weiner wrote:
> > > On Thu, Sep 05, 2019 at 02:45:45PM -0700, Roman Gushchin wrote:
> > > > Introduce an API to charge subpage objects to the memory cgroup.
> > > > The API will be used by the new slab memory controller. Later it
> > > > can also be used to implement percpu memory accounting.
> > > > In both cases, a single page can be shared between multiple cgroups
> > > > (and in percpu case a single allocation is split over multiple pages),
> > > > so it's not possible to use page-based accounting.
> > > >
> > > > The implementation is based on percpu stocks. Memory cgroups are still
> > > > charged in pages, and the residue is stored in percpu stock, or on the
> > > > memcg itself, when it's necessary to flush the stock.
> > >
> > > Did you just implement a slab allocator for page_counter to track
> > > memory consumed by the slab allocator?
> >
> > :)
> >
> > >
> > > > @@ -2500,8 +2577,9 @@ void mem_cgroup_handle_over_high(void)
> > > > }
> > > >
> > > > static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> > > > - unsigned int nr_pages)
> > > > + unsigned int amount, bool subpage)
> > > > {
> > > > + unsigned int nr_pages = subpage ? ((amount >> PAGE_SHIFT) + 1) : amount;
> > > > unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
> > > > int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> > > > struct mem_cgroup *mem_over_limit;
> > > > @@ -2514,7 +2592,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
> > > > if (mem_cgroup_is_root(memcg))
> > > > return 0;
> > > > retry:
> > > > - if (consume_stock(memcg, nr_pages))
> > > > + if (subpage && consume_subpage_stock(memcg, amount))
> > > > + return 0;
> > > > + else if (!subpage && consume_stock(memcg, nr_pages))
> > > > return 0;
> > >
> > > The layering here isn't clean. We have an existing per-cpu cache to
> > > batch-charge the page counter. Why does the new subpage allocator not
> > > sit on *top* of this, instead of wedged in between?
> > >
> > > I think what it should be is a try_charge_bytes() that simply gets one
> > > page from try_charge() and then does its byte tracking, regardless of
> > > how try_charge() chooses to implement its own page tracking.
> > >
> > > That would avoid the awkward @amount + @subpage multiplexing, as well
> > > as annotating all existing callsites of try_charge() with a
> > > non-descript "false" parameter.
> > >
> > > You can still reuse the stock data structures, use the lower bits of
> > > stock->nr_bytes for a different cgroup etc., but the charge API should
> > > really be separate.
> >
> > Hm, I kinda like the idea, however there is a complication: for the subpage
> > accounting the css reference management is done in a different way, so that
> > all existing code should avoid changing the css refcounter. So I'd need
> > to pass a boolean argument anyway.
>
> Can you elaborate on the refcounting scheme? I don't quite understand
> how there would be complications with that.
>
> Generally, references are held for each page that is allocated in the
> page_counter. try_charge() allocates a batch of css references,
> returns one and keeps the rest in stock.
>
> So couldn't the following work?
>
> When somebody allocates a subpage, the css reference returned by
> try_charge() is shared by the allocated subpage object and the
> remainder that is kept via stock->subpage_cache and stock->nr_bytes
> (or memcg->nr_stocked_bytes when the percpu cache is reset).

Because individual objects are subject to reparenting and can outlive
the origin memory cgroup, they shouldn't hold a direct reference to the
memory cgroup. Instead they hold a reference to a mem_cgroup_ptr object,
and this object holds a single reference to the memory cgroup.
The underlying pages shouldn't hold a reference either.

Btw, this is already the case: kmem_cache plays the role of such an
intermediate object, and we do an explicit transfer of the charge (see
memcg_charge_slab()). So we initially associate a page with the memcg,
and almost immediately afterwards break this association and insert the
kmem_cache in between.

But with subpage accounting it's not possible, as a page is shared between
multiple cgroups, and it can't be attributed to any specific cgroup at
any time.
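
Very roughly, and only as an illustration (the real definition comes in
a later patch of this series and differs in details), the indirection
looks like:

struct mem_cgroup_ptr {
	struct percpu_ref refcnt;	/* taken by individual objects */
	struct mem_cgroup *memcg;	/* holds a single css reference */
};

On reparenting only ->memcg is switched to the parent cgroup (and the
css reference moved along with it), without touching the objects
themselves.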

>
> When the subpage objects are freed, you'll eventually have a full page
> again in stock->nr_bytes, at which point you page_counter_uncharge()
> paired with css_put(_many) as per usual.
>
> A remainder left in old->nr_stocked_bytes would continue to hold on to
> one css reference. (I don't quite understand who is protecting this
> remainder in your current version, actually. A bug?)
>
> Instead of doing your own batched page_counter uncharging in
> refill_subpage_stock() -> drain_subpage_stock(), you should be able to
> call refill_stock() when stock->nr_bytes adds up to a whole page again.
>
> Again, IMO this would be much cleaner architecture if there was a
> try_charge_bytes() byte allocator that would sit on top of a cleanly
> abstracted try_charge() page allocator, just like the slab allocator
> is sitting on top of the page allocator - instead of breaking through
> the abstraction layer of the underlying page allocator.
>

As I said, I like the idea of putting it on top, but I don't see how it
can be done without changes to the css refcounting. I don't know how to
mix stocks that hold css references with stocks that don't, so I might
end up with two separate stocks, as in the current implementation. Then
the idea of having another layer of caching on top looks slightly less
appealing, but it may still be worth a try.

Thanks!