2020-03-04 02:21:47

by Shakeel Butt

[permalink] [raw]
Subject: [PATCH] memcg: optimize memory.numa_stat like memory.stat

Currently reading memory.numa_stat traverses the underlying memcg tree
multiple times to accumulate the stats to present the hierarchical view
of the memcg tree. However the kernel already maintains the hierarchical
view of the stats and use it in memory.stat. Just use the same mechanism
in memory.numa_stat as well.

I ran a simple benchmark which reads root_mem_cgroup's memory.numa_stat
file in the presense of 10000 memcgs. The results are:

Without the patch:
$ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null

real 0m0.700s
user 0m0.001s
sys 0m0.697s

With the patch:
$ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null

real 0m0.001s
user 0m0.001s
sys 0m0.000s

Signed-off-by: Shakeel Butt <[email protected]>
---
mm/memcontrol.c | 52 +++++++++++++++++++++++++------------------------
1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 63bb6a2aab81..d5485fa8a345 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3614,32 +3614,40 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)

static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
+ int nid, unsigned int lru_mask, bool tree)
{
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
unsigned long nr = 0;
enum lru_list lru;
+ unsigned long (*page_state)(struct lruvec *lruvec,
+ enum node_stat_item idx);

VM_BUG_ON((unsigned)nid >= nr_node_ids);

+ page_state = tree ? lruvec_page_state : lruvec_page_state_local;
+
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ nr += page_state(lruvec, NR_LRU_BASE + lru);
}
return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
+ unsigned int lru_mask,
+ bool tree)
{
unsigned long nr = 0;
enum lru_list lru;
+ unsigned long (*page_state)(struct mem_cgroup *memcg, int idx);
+
+ page_state = tree ? memcg_page_state : memcg_page_state_local;

for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ nr += page_state(memcg, NR_LRU_BASE + lru);
}
return nr;
}
@@ -3659,34 +3667,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
};
const struct numa_stat *stat;
int nid;
- unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
- seq_printf(m, "%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+ seq_printf(m, "%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ false));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, false));
seq_putc(m, '\n');
}

for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- struct mem_cgroup *iter;
-
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_node_nr_lru_pages(
- iter, nid, stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ true));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, true));
seq_putc(m, '\n');
}

--
2.25.0.265.gbab2e86ba0-goog


2020-03-06 04:42:13

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] memcg: optimize memory.numa_stat like memory.stat

On Tue, 3 Mar 2020 18:20:58 -0800 Shakeel Butt <[email protected]> wrote:

> Currently reading memory.numa_stat traverses the underlying memcg tree
> multiple times to accumulate the stats to present the hierarchical view
> of the memcg tree. However the kernel already maintains the hierarchical
> view of the stats and use it in memory.stat. Just use the same mechanism
> in memory.numa_stat as well.
>
> I ran a simple benchmark which reads root_mem_cgroup's memory.numa_stat
> file in the presense of 10000 memcgs. The results are:
>
> Without the patch:
> $ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null
>
> real 0m0.700s
> user 0m0.001s
> sys 0m0.697s
>
> With the patch:
> $ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null
>
> real 0m0.001s
> user 0m0.001s
> sys 0m0.000s
>

Can't you do better than that ;)

>
> + page_state = tree ? lruvec_page_state : lruvec_page_state_local;
> ...
>
> + page_state = tree ? memcg_page_state : memcg_page_state_local;
>

All four of these functions are inlined. Taking their address in this
fashion will force the compiler to generate out-of-line copies.

If we do it the uglier-and-maybe-a-bit-slower way:

--- a/mm/memcontrol.c~memcg-optimize-memorynuma_stat-like-memorystat-fix
+++ a/mm/memcontrol.c
@@ -3658,17 +3658,16 @@ static unsigned long mem_cgroup_node_nr_
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
unsigned long nr = 0;
enum lru_list lru;
- unsigned long (*page_state)(struct lruvec *lruvec,
- enum node_stat_item idx);

VM_BUG_ON((unsigned)nid >= nr_node_ids);

- page_state = tree ? lruvec_page_state : lruvec_page_state_local;
-
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += page_state(lruvec, NR_LRU_BASE + lru);
+ if (tree)
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ else
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
}
return nr;
}
@@ -3679,14 +3678,14 @@ static unsigned long mem_cgroup_nr_lru_p
{
unsigned long nr = 0;
enum lru_list lru;
- unsigned long (*page_state)(struct mem_cgroup *memcg, int idx);
-
- page_state = tree ? memcg_page_state : memcg_page_state_local;

for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += page_state(memcg, NR_LRU_BASE + lru);
+ if (tree)
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ else
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
}
return nr;
}

Then we get:

text data bss dec hex filename
now: 106705 35641 1024 143370 2300a mm/memcontrol.o
shakeel: 107111 35657 1024 143792 231b0 mm/memcontrol.o
shakeel+the-above: 106805 35657 1024 143486 2307e mm/memcontrol.o

Which do we prefer? The 100-byte patch or the 406-byte patch?

2020-03-06 04:56:27

by Shakeel Butt

[permalink] [raw]
Subject: Re: [PATCH] memcg: optimize memory.numa_stat like memory.stat

On Thu, Mar 5, 2020 at 8:41 PM Andrew Morton <[email protected]> wrote:
>
> On Tue, 3 Mar 2020 18:20:58 -0800 Shakeel Butt <[email protected]> wrote:
>
> > Currently reading memory.numa_stat traverses the underlying memcg tree
> > multiple times to accumulate the stats to present the hierarchical view
> > of the memcg tree. However the kernel already maintains the hierarchical
> > view of the stats and use it in memory.stat. Just use the same mechanism
> > in memory.numa_stat as well.
> >
> > I ran a simple benchmark which reads root_mem_cgroup's memory.numa_stat
> > file in the presense of 10000 memcgs. The results are:
> >
> > Without the patch:
> > $ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null
> >
> > real 0m0.700s
> > user 0m0.001s
> > sys 0m0.697s
> >
> > With the patch:
> > $ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null
> >
> > real 0m0.001s
> > user 0m0.001s
> > sys 0m0.000s
> >
>
> Can't you do better than that ;)
>
> >
> > + page_state = tree ? lruvec_page_state : lruvec_page_state_local;
> > ...
> >
> > + page_state = tree ? memcg_page_state : memcg_page_state_local;
> >
>
> All four of these functions are inlined. Taking their address in this
> fashion will force the compiler to generate out-of-line copies.
>
> If we do it the uglier-and-maybe-a-bit-slower way:
>
> --- a/mm/memcontrol.c~memcg-optimize-memorynuma_stat-like-memorystat-fix
> +++ a/mm/memcontrol.c
> @@ -3658,17 +3658,16 @@ static unsigned long mem_cgroup_node_nr_
> struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> unsigned long nr = 0;
> enum lru_list lru;
> - unsigned long (*page_state)(struct lruvec *lruvec,
> - enum node_stat_item idx);
>
> VM_BUG_ON((unsigned)nid >= nr_node_ids);
>
> - page_state = tree ? lruvec_page_state : lruvec_page_state_local;
> -
> for_each_lru(lru) {
> if (!(BIT(lru) & lru_mask))
> continue;
> - nr += page_state(lruvec, NR_LRU_BASE + lru);
> + if (tree)
> + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
> + else
> + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
> }
> return nr;
> }
> @@ -3679,14 +3678,14 @@ static unsigned long mem_cgroup_nr_lru_p
> {
> unsigned long nr = 0;
> enum lru_list lru;
> - unsigned long (*page_state)(struct mem_cgroup *memcg, int idx);
> -
> - page_state = tree ? memcg_page_state : memcg_page_state_local;
>
> for_each_lru(lru) {
> if (!(BIT(lru) & lru_mask))
> continue;
> - nr += page_state(memcg, NR_LRU_BASE + lru);
> + if (tree)
> + nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
> + else
> + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
> }
> return nr;
> }
>
> Then we get:
>
> text data bss dec hex filename
> now: 106705 35641 1024 143370 2300a mm/memcontrol.o
> shakeel: 107111 35657 1024 143792 231b0 mm/memcontrol.o
> shakeel+the-above: 106805 35657 1024 143486 2307e mm/memcontrol.o
>
> Which do we prefer? The 100-byte patch or the 406-byte patch?

I would go with the 100-byte one. The for-loop is just 5 iteration, so
doing a check in each iteration should not be an issue.

Shakeel

2020-04-23 23:04:04

by Shakeel Butt

[permalink] [raw]
Subject: Re: [PATCH] memcg: optimize memory.numa_stat like memory.stat

On Thu, Mar 5, 2020 at 8:54 PM Shakeel Butt <[email protected]> wrote:
>
> On Thu, Mar 5, 2020 at 8:41 PM Andrew Morton <[email protected]> wrote:
> >
> > On Tue, 3 Mar 2020 18:20:58 -0800 Shakeel Butt <[email protected]> wrote:
> >
> > > Currently reading memory.numa_stat traverses the underlying memcg tree
> > > multiple times to accumulate the stats to present the hierarchical view
> > > of the memcg tree. However the kernel already maintains the hierarchical
> > > view of the stats and use it in memory.stat. Just use the same mechanism
> > > in memory.numa_stat as well.
> > >
> > > I ran a simple benchmark which reads root_mem_cgroup's memory.numa_stat
> > > file in the presense of 10000 memcgs. The results are:
> > >
> > > Without the patch:
> > > $ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null
> > >
> > > real 0m0.700s
> > > user 0m0.001s
> > > sys 0m0.697s
> > >
> > > With the patch:
> > > $ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null
> > >
> > > real 0m0.001s
> > > user 0m0.001s
> > > sys 0m0.000s
> > >
> >
> > Can't you do better than that ;)
> >
> > >
> > > + page_state = tree ? lruvec_page_state : lruvec_page_state_local;
> > > ...
> > >
> > > + page_state = tree ? memcg_page_state : memcg_page_state_local;
> > >
> >
> > All four of these functions are inlined. Taking their address in this
> > fashion will force the compiler to generate out-of-line copies.
> >
> > If we do it the uglier-and-maybe-a-bit-slower way:
> >
> > --- a/mm/memcontrol.c~memcg-optimize-memorynuma_stat-like-memorystat-fix
> > +++ a/mm/memcontrol.c
> > @@ -3658,17 +3658,16 @@ static unsigned long mem_cgroup_node_nr_
> > struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> > unsigned long nr = 0;
> > enum lru_list lru;
> > - unsigned long (*page_state)(struct lruvec *lruvec,
> > - enum node_stat_item idx);
> >
> > VM_BUG_ON((unsigned)nid >= nr_node_ids);
> >
> > - page_state = tree ? lruvec_page_state : lruvec_page_state_local;
> > -
> > for_each_lru(lru) {
> > if (!(BIT(lru) & lru_mask))
> > continue;
> > - nr += page_state(lruvec, NR_LRU_BASE + lru);
> > + if (tree)
> > + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
> > + else
> > + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
> > }
> > return nr;
> > }
> > @@ -3679,14 +3678,14 @@ static unsigned long mem_cgroup_nr_lru_p
> > {
> > unsigned long nr = 0;
> > enum lru_list lru;
> > - unsigned long (*page_state)(struct mem_cgroup *memcg, int idx);
> > -
> > - page_state = tree ? memcg_page_state : memcg_page_state_local;
> >
> > for_each_lru(lru) {
> > if (!(BIT(lru) & lru_mask))
> > continue;
> > - nr += page_state(memcg, NR_LRU_BASE + lru);
> > + if (tree)
> > + nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
> > + else
> > + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
> > }
> > return nr;
> > }
> >
> > Then we get:
> >
> > text data bss dec hex filename
> > now: 106705 35641 1024 143370 2300a mm/memcontrol.o
> > shakeel: 107111 35657 1024 143792 231b0 mm/memcontrol.o
> > shakeel+the-above: 106805 35657 1024 143486 2307e mm/memcontrol.o
> >
> > Which do we prefer? The 100-byte patch or the 406-byte patch?
>
> I would go with the 100-byte one. The for-loop is just 5 iteration, so
> doing a check in each iteration should not be an issue.
>

Andrew, anything more needed for this patch to be merged?

Shakeel

2020-04-23 23:13:09

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH] memcg: optimize memory.numa_stat like memory.stat

On Thu, 23 Apr 2020 15:59:41 -0700 Shakeel Butt <[email protected]> wrote:

> > > text data bss dec hex filename
> > > now: 106705 35641 1024 143370 2300a mm/memcontrol.o
> > > shakeel: 107111 35657 1024 143792 231b0 mm/memcontrol.o
> > > shakeel+the-above: 106805 35657 1024 143486 2307e mm/memcontrol.o
> > >
> > > Which do we prefer? The 100-byte patch or the 406-byte patch?
> >
> > I would go with the 100-byte one. The for-loop is just 5 iteration, so
> > doing a check in each iteration should not be an issue.
> >
>
> Andrew, anything more needed for this patch to be merged?

Some feedback from hannes & mhocko would be appreciated?


From: Shakeel Butt <[email protected]>
Subject: mm/memcg: optimize memory.numa_stat like memory.stat

Currently reading memory.numa_stat traverses the underlying memcg tree
multiple times to accumulate the stats to present the hierarchical view of
the memcg tree. However the kernel already maintains the hierarchical
view of the stats and use it in memory.stat. Just use the same mechanism
in memory.numa_stat as well.

I ran a simple benchmark which reads root_mem_cgroup's memory.numa_stat
file in the presense of 10000 memcgs. The results are:

Without the patch:
$ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null

real 0m0.700s
user 0m0.001s
sys 0m0.697s

With the patch:
$ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null

real 0m0.001s
user 0m0.001s
sys 0m0.000s

[[email protected]: avoid forcing out-of-line code generation]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Shakeel Butt <[email protected]>
Reviewed-by: Andrew Morton <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Michal Hocko <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
---

mm/memcontrol.c | 49 +++++++++++++++++++++++-----------------------
1 file changed, 25 insertions(+), 24 deletions(-)

--- a/mm/memcontrol.c~memcg-optimize-memorynuma_stat-like-memorystat
+++ a/mm/memcontrol.c
@@ -3688,7 +3688,7 @@ static int mem_cgroup_move_charge_write(
#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)

static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
+ int nid, unsigned int lru_mask, bool tree)
{
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
unsigned long nr = 0;
@@ -3699,13 +3699,17 @@ static unsigned long mem_cgroup_node_nr_
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ if (tree)
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ else
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
}
return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
+ unsigned int lru_mask,
+ bool tree)
{
unsigned long nr = 0;
enum lru_list lru;
@@ -3713,7 +3717,10 @@ static unsigned long mem_cgroup_nr_lru_p
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ if (tree)
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ else
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
}
return nr;
}
@@ -3733,34 +3740,28 @@ static int memcg_numa_stat_show(struct s
};
const struct numa_stat *stat;
int nid;
- unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);

for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
- seq_printf(m, "%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+ seq_printf(m, "%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ false));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, false));
seq_putc(m, '\n');
}

for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- struct mem_cgroup *iter;

- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_node_nr_lru_pages(
- iter, nid, stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ true));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, true));
seq_putc(m, '\n');
}

_

2020-04-24 02:41:21

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH] memcg: optimize memory.numa_stat like memory.stat

On Thu, Apr 23, 2020 at 04:10:09PM -0700, Andrew Morton wrote:
> From: Shakeel Butt <[email protected]>
> Subject: mm/memcg: optimize memory.numa_stat like memory.stat
>
> Currently reading memory.numa_stat traverses the underlying memcg tree
> multiple times to accumulate the stats to present the hierarchical view of
> the memcg tree. However the kernel already maintains the hierarchical
> view of the stats and use it in memory.stat. Just use the same mechanism
> in memory.numa_stat as well.
>
> I ran a simple benchmark which reads root_mem_cgroup's memory.numa_stat
> file in the presense of 10000 memcgs. The results are:
>
> Without the patch:
> $ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null
>
> real 0m0.700s
> user 0m0.001s
> sys 0m0.697s
>
> With the patch:
> $ time cat /dev/cgroup/memory/memory.numa_stat > /dev/null
>
> real 0m0.001s
> user 0m0.001s
> sys 0m0.000s
>
> [[email protected]: avoid forcing out-of-line code generation]
> Link: http://lkml.kernel.org/r/[email protected]
> Signed-off-by: Shakeel Butt <[email protected]>
> Reviewed-by: Andrew Morton <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Roman Gushchin <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>

Ouch, yes. That makes sense.

Acked-by: Johannes Weiner <[email protected]>