2023-07-26 15:49:16

by Yosry Ahmed

[permalink] [raw]
Subject: [PATCH] mm: memcg: use rstat for non-hierarchical stats

Currently, memcg uses rstat to maintain hierarchical stats. The rstat
framework keeps track of which cgroups have updates on which cpus.

For non-hierarchical stats, as memcg moved to rstat, they are no longer
readily available as counters. Instead, the percpu counters for a given
stat need to be summed to get the non-hierarchical stat value. This
causes a performance regression when reading non-hierarchical stats on
kernels where memcg moved to using rstat. This is especially visible
when reading memory.stat on cgroup v1. There are also some code paths
internal to the kernel that read such non-hierarchical stats.

It is inefficient to iterate and sum counters in all cpus when the rstat
framework knows exactly when a percpu counter has an update. Instead,
maintain cpu-aggregated non-hierarchical counters for each stat. During
an rstat flush, keep those updated as well. When reading
non-hierarchical stats, we no longer need to iterate cpus, we just need
to read the maintained counters, similar to hierarchical stats.

A caveat is that we now need a stats flush before reading
local/non-hierarchical stats through {memcg/lruvec}_page_state_local()
or memcg_events_local(), where we previously only needed a flush to
read hierarchical stats. Most contexts reading non-hierarchical stats
are already doing a flush, add a flush to the only missing context in
count_shadow_nodes().

With this patch, reading memory.stat from 1000 memcgs is 3x faster on a
machine with 256 cpus on cgroup v1:
# for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done
# time cat /sys/fs/cgroup/memory/cg*/memory.stat > /dev/null
real 0m0.125s
user 0m0.005s
sys 0m0.120s

After:
real 0m0.032s
user 0m0.005s
sys 0m0.027s

Signed-off-by: Yosry Ahmed <[email protected]>
---
include/linux/memcontrol.h | 7 ++++---
mm/memcontrol.c | 32 +++++++++++++++++++-------------
mm/workingset.c | 1 +
3 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5818af8eca5a..a9f2861a57a5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -112,6 +112,9 @@ struct lruvec_stats {
/* Aggregated (CPU and subtree) state */
long state[NR_VM_NODE_STAT_ITEMS];

+ /* Non-hierarchical (CPU aggregated) state */
+ long state_local[NR_VM_NODE_STAT_ITEMS];
+
/* Pending child counts during tree propagation */
long state_pending[NR_VM_NODE_STAT_ITEMS];
};
@@ -1020,14 +1023,12 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
{
struct mem_cgroup_per_node *pn;
long x = 0;
- int cpu;

if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);

pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- for_each_possible_cpu(cpu)
- x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
+ x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8ca4bdcb03c..90a22637818e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -742,6 +742,10 @@ struct memcg_vmstats {
long state[MEMCG_NR_STAT];
unsigned long events[NR_MEMCG_EVENTS];

+ /* Non-hierarchical (CPU aggregated) page state & events */
+ long state_local[MEMCG_NR_STAT];
+ unsigned long events_local[NR_MEMCG_EVENTS];
+
/* Pending child counts during tree propagation */
long state_pending[MEMCG_NR_STAT];
unsigned long events_pending[NR_MEMCG_EVENTS];
@@ -775,11 +779,8 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
- long x = 0;
- int cpu;
+ long x = READ_ONCE(memcg->vmstats->state_local[idx]);

- for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -926,16 +927,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
- long x = 0;
- int cpu;
int index = memcg_events_index(event);

if (index < 0)
return 0;

- for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
- return x;
+ return READ_ONCE(memcg->vmstats->events_local[index]);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -5526,7 +5523,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct memcg_vmstats_percpu *statc;
- long delta, v;
+ long delta, delta_cpu, v;
int i, nid;

statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
@@ -5542,9 +5539,11 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
memcg->vmstats->state_pending[i] = 0;

/* Add CPU changes on this level since the last flush */
+ delta_cpu = 0;
v = READ_ONCE(statc->state[i]);
if (v != statc->state_prev[i]) {
- delta += v - statc->state_prev[i];
+ delta_cpu = v - statc->state_prev[i];
+ delta += delta_cpu;
statc->state_prev[i] = v;
}

@@ -5553,6 +5552,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)

/* Aggregate counts on this level and propagate upwards */
memcg->vmstats->state[i] += delta;
+ memcg->vmstats->state_local[i] += delta_cpu;
if (parent)
parent->vmstats->state_pending[i] += delta;
}
@@ -5562,9 +5562,11 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (delta)
memcg->vmstats->events_pending[i] = 0;

+ delta_cpu = 0;
v = READ_ONCE(statc->events[i]);
if (v != statc->events_prev[i]) {
- delta += v - statc->events_prev[i];
+ delta_cpu = v - statc->events_prev[i];
+ delta += delta_cpu;
statc->events_prev[i] = v;
}

@@ -5572,6 +5574,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
continue;

memcg->vmstats->events[i] += delta;
+ memcg->vmstats->events_local[i] += delta_cpu;
if (parent)
parent->vmstats->events_pending[i] += delta;
}
@@ -5591,9 +5594,11 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (delta)
pn->lruvec_stats.state_pending[i] = 0;

+ delta_cpu = 0;
v = READ_ONCE(lstatc->state[i]);
if (v != lstatc->state_prev[i]) {
- delta += v - lstatc->state_prev[i];
+ delta_cpu = v - lstatc->state_prev[i];
+ delta += delta_cpu;
lstatc->state_prev[i] = v;
}

@@ -5601,6 +5606,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
continue;

pn->lruvec_stats.state[i] += delta;
+ pn->lruvec_stats.state_local[i] += delta_cpu;
if (ppn)
ppn->lruvec_stats.state_pending[i] += delta;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 4686ae363000..da58a26d0d4d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -664,6 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct lruvec *lruvec;
int i;

+ mem_cgroup_flush_stats();
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
--
2.41.0.255.g8b1d071c50-goog



2023-07-26 15:58:23

by Yosry Ahmed

[permalink] [raw]
Subject: [PATCH v3] mm: memcg: use rstat for non-hierarchical stats

Currently, memcg uses rstat to maintain aggregated hierarchical stats.
Counters are maintained for hierarchical stats at each memcg. Rstat
tracks which cgroups have updates on which cpus to keep those counters
fresh on the read-side.

Non-hierarchical stats are currently not covered by rstat. Their
per-cpu counters are summed up on every read, which is expensive.
The original implementation did the same. At some point before rstat,
non-hierarchical aggregated counters were introduced by
commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
memory.stat reporting"). However, those counters were updated on the
performance critical write-side, which caused regressions, so they were
later removed by commit 815744d75152 ("mm: memcontrol: don't batch
updates of local VM stats and events"). See [1] for more detailed
history.

Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a
half) enjoyed cheap reads of non-hierarchical stats, specifically on
cgroup v1. When moving to more recent kernels, a performance regression
for reading non-hierarchical stats is observed.

Now that we have rstat, we know exactly which percpu counters have
updates for each stat. We can maintain non-hierarchical counters again,
making reads much more efficient, without affecting the performance
critical write-side. Hence, add non-hierarchical (i.e. local) counters
for the stats, and extend rstat flushing to keep those up-to-date.

A caveat is that we now need a stats flush before reading
local/non-hierarchical stats through {memcg/lruvec}_page_state_local()
or memcg_events_local(), where we previously only needed a flush to
read hierarchical stats. Most contexts reading non-hierarchical stats
are already doing a flush, add a flush to the only missing context in
count_shadow_nodes().

With this patch, reading memory.stat from 1000 memcgs is 3x faster on a
machine with 256 cpus on cgroup v1:
# for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done
# time cat /sys/fs/cgroup/memory/cg*/memory.stat > /dev/null
real 0m0.125s
user 0m0.005s
sys 0m0.120s

After:
real 0m0.032s
user 0m0.005s
sys 0m0.027s

[1]https://lore.kernel.org/lkml/[email protected]/

Signed-off-by: Yosry Ahmed <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Acked-by: Roman Gushchin <[email protected]>
---

v2 -> v3:
- Commit log fixes (thanks Roman & Johannes).

v2: https://lore.kernel.org/lkml/[email protected]/

---
include/linux/memcontrol.h | 7 ++--
mm/memcontrol.c | 67 +++++++++++++++++++++-----------------
mm/workingset.c | 1 +
3 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 5818af8eca5a..a9f2861a57a5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -112,6 +112,9 @@ struct lruvec_stats {
/* Aggregated (CPU and subtree) state */
long state[NR_VM_NODE_STAT_ITEMS];

+ /* Non-hierarchical (CPU aggregated) state */
+ long state_local[NR_VM_NODE_STAT_ITEMS];
+
/* Pending child counts during tree propagation */
long state_pending[NR_VM_NODE_STAT_ITEMS];
};
@@ -1020,14 +1023,12 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
{
struct mem_cgroup_per_node *pn;
long x = 0;
- int cpu;

if (mem_cgroup_disabled())
return node_page_state(lruvec_pgdat(lruvec), idx);

pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
- for_each_possible_cpu(cpu)
- x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
+ x = READ_ONCE(pn->lruvec_stats.state_local[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8ca4bdcb03c..50f8035e998a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -742,6 +742,10 @@ struct memcg_vmstats {
long state[MEMCG_NR_STAT];
unsigned long events[NR_MEMCG_EVENTS];

+ /* Non-hierarchical (CPU aggregated) page state & events */
+ long state_local[MEMCG_NR_STAT];
+ unsigned long events_local[NR_MEMCG_EVENTS];
+
/* Pending child counts during tree propagation */
long state_pending[MEMCG_NR_STAT];
unsigned long events_pending[NR_MEMCG_EVENTS];
@@ -775,11 +779,8 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
/* idx can be of type enum memcg_stat_item or node_stat_item. */
static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
{
- long x = 0;
- int cpu;
+ long x = READ_ONCE(memcg->vmstats->state_local[idx]);

- for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -926,16 +927,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
- long x = 0;
- int cpu;
int index = memcg_events_index(event);

if (index < 0)
return 0;

- for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_percpu->events[index], cpu);
- return x;
+ return READ_ONCE(memcg->vmstats->events_local[index]);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -5526,7 +5523,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
struct memcg_vmstats_percpu *statc;
- long delta, v;
+ long delta, delta_cpu, v;
int i, nid;

statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
@@ -5542,19 +5539,23 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
memcg->vmstats->state_pending[i] = 0;

/* Add CPU changes on this level since the last flush */
+ delta_cpu = 0;
v = READ_ONCE(statc->state[i]);
if (v != statc->state_prev[i]) {
- delta += v - statc->state_prev[i];
+ delta_cpu = v - statc->state_prev[i];
+ delta += delta_cpu;
statc->state_prev[i] = v;
}

- if (!delta)
- continue;
-
/* Aggregate counts on this level and propagate upwards */
- memcg->vmstats->state[i] += delta;
- if (parent)
- parent->vmstats->state_pending[i] += delta;
+ if (delta_cpu)
+ memcg->vmstats->state_local[i] += delta_cpu;
+
+ if (delta) {
+ memcg->vmstats->state[i] += delta;
+ if (parent)
+ parent->vmstats->state_pending[i] += delta;
+ }
}

for (i = 0; i < NR_MEMCG_EVENTS; i++) {
@@ -5562,18 +5563,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (delta)
memcg->vmstats->events_pending[i] = 0;

+ delta_cpu = 0;
v = READ_ONCE(statc->events[i]);
if (v != statc->events_prev[i]) {
- delta += v - statc->events_prev[i];
+ delta_cpu = v - statc->events_prev[i];
+ delta += delta_cpu;
statc->events_prev[i] = v;
}

- if (!delta)
- continue;
+ if (delta_cpu)
+ memcg->vmstats->events_local[i] += delta_cpu;

- memcg->vmstats->events[i] += delta;
- if (parent)
- parent->vmstats->events_pending[i] += delta;
+ if (delta) {
+ memcg->vmstats->events[i] += delta;
+ if (parent)
+ parent->vmstats->events_pending[i] += delta;
+ }
}

for_each_node_state(nid, N_MEMORY) {
@@ -5591,18 +5596,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (delta)
pn->lruvec_stats.state_pending[i] = 0;

+ delta_cpu = 0;
v = READ_ONCE(lstatc->state[i]);
if (v != lstatc->state_prev[i]) {
- delta += v - lstatc->state_prev[i];
+ delta_cpu = v - lstatc->state_prev[i];
+ delta += delta_cpu;
lstatc->state_prev[i] = v;
}

- if (!delta)
- continue;
+ if (delta_cpu)
+ pn->lruvec_stats.state_local[i] += delta_cpu;

- pn->lruvec_stats.state[i] += delta;
- if (ppn)
- ppn->lruvec_stats.state_pending[i] += delta;
+ if (delta) {
+ pn->lruvec_stats.state[i] += delta;
+ if (ppn)
+ ppn->lruvec_stats.state_pending[i] += delta;
+ }
}
}
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 4686ae363000..da58a26d0d4d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -664,6 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct lruvec *lruvec;
int i;

+ mem_cgroup_flush_stats();
lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
--
2.41.0.487.g6d72f3e995-goog


2023-08-01 16:30:27

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH v3] mm: memcg: use rstat for non-hierarchical stats

On Wed 26-07-23 15:32:23, Yosry Ahmed wrote:
> Currently, memcg uses rstat to maintain aggregated hierarchical stats.
> Counters are maintained for hierarchical stats at each memcg. Rstat
> tracks which cgroups have updates on which cpus to keep those counters
> fresh on the read-side.
>
> Non-hierarchical stats are currently not covered by rstat. Their
> per-cpu counters are summed up on every read, which is expensive.
> The original implementation did the same. At some point before rstat,
> non-hierarchical aggregated counters were introduced by
> commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
> memory.stat reporting"). However, those counters were updated on the
> performance critical write-side, which caused regressions, so they were
> later removed by commit 815744d75152 ("mm: memcontrol: don't batch
> updates of local VM stats and events"). See [1] for more detailed
> history.
>
> Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a
> half) enjoyed cheap reads of non-hierarchical stats, specifically on
> cgroup v1. When moving to more recent kernels, a performance regression
> for reading non-hierarchical stats is observed.
>
> Now that we have rstat, we know exactly which percpu counters have
> updates for each stat. We can maintain non-hierarchical counters again,
> making reads much more efficient, without affecting the performance
> critical write-side. Hence, add non-hierarchical (i.e local) counters
> for the stats, and extend rstat flushing to keep those up-to-date.
>
> A caveat is that we now need a stats flush before reading
> local/non-hierarchical stats through {memcg/lruvec}_page_state_local()
> or memcg_events_local(), where we previously only needed a flush to
> read hierarchical stats. Most contexts reading non-hierarchical stats
> are already doing a flush, add a flush to the only missing context in
> count_shadow_nodes().
>
> With this patch, reading memory.stat from 1000 memcgs is 3x faster on a
> machine with 256 cpus on cgroup v1:
> # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done
> # time cat /dev/cgroup/memory/cg*/memory.stat > /dev/null
> real 0m0.125s
> user 0m0.005s
> sys 0m0.120s
>
> After:
> real 0m0.032s
> user 0m0.005s
> sys 0m0.027s

Have you measured any potential regression for cgroup v2 which collects
all this data without ever using it (AFAICS)?
--
Michal Hocko
SUSE Labs

2023-08-01 17:22:52

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3] mm: memcg: use rstat for non-hierarchical stats

On Tue, Aug 1, 2023 at 7:30 AM Michal Hocko <[email protected]> wrote:
>
> On Wed 26-07-23 15:32:23, Yosry Ahmed wrote:
> > Currently, memcg uses rstat to maintain aggregated hierarchical stats.
> > Counters are maintained for hierarchical stats at each memcg. Rstat
> > tracks which cgroups have updates on which cpus to keep those counters
> > fresh on the read-side.
> >
> > Non-hierarchical stats are currently not covered by rstat. Their
> > per-cpu counters are summed up on every read, which is expensive.
> > The original implementation did the same. At some point before rstat,
> > non-hierarchical aggregated counters were introduced by
> > commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
> > memory.stat reporting"). However, those counters were updated on the
> > performance critical write-side, which caused regressions, so they were
> > later removed by commit 815744d75152 ("mm: memcontrol: don't batch
> > updates of local VM stats and events"). See [1] for more detailed
> > history.
> >
> > Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a
> > half) enjoyed cheap reads of non-hierarchical stats, specifically on
> > cgroup v1. When moving to more recent kernels, a performance regression
> > for reading non-hierarchical stats is observed.
> >
> > Now that we have rstat, we know exactly which percpu counters have
> > updates for each stat. We can maintain non-hierarchical counters again,
> > making reads much more efficient, without affecting the performance
> > critical write-side. Hence, add non-hierarchical (i.e local) counters
> > for the stats, and extend rstat flushing to keep those up-to-date.
> >
> > A caveat is that we now need a stats flush before reading
> > local/non-hierarchical stats through {memcg/lruvec}_page_state_local()
> > or memcg_events_local(), where we previously only needed a flush to
> > read hierarchical stats. Most contexts reading non-hierarchical stats
> > are already doing a flush, add a flush to the only missing context in
> > count_shadow_nodes().
> >
> > With this patch, reading memory.stat from 1000 memcgs is 3x faster on a
> > machine with 256 cpus on cgroup v1:
> > # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done
> > # time cat /dev/cgroup/memory/cg*/memory.stat > /dev/null
> > real 0m0.125s
> > user 0m0.005s
> > sys 0m0.120s
> >
> > After:
> > real 0m0.032s
> > user 0m0.005s
> > sys 0m0.027s
>
> Have you measured any potential regression for cgroup v2 which collects
> all this data without ever using it (AFAICS)?

I did not. I did not expect noticeable regressions given that all the
extra work is done during flushing, which should mostly be done by the
asynchronous worker, but can also happen in the stats reading context.
Let me run the same script on cgroup v2 just in case and report back.

> --
> Michal Hocko
> SUSE Labs

2023-08-01 19:15:38

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3] mm: memcg: use rstat for non-hierarchical stats

On Tue, Aug 1, 2023 at 9:39 AM Yosry Ahmed <[email protected]> wrote:
>
> On Tue, Aug 1, 2023 at 7:30 AM Michal Hocko <[email protected]> wrote:
> >
> > On Wed 26-07-23 15:32:23, Yosry Ahmed wrote:
> > > Currently, memcg uses rstat to maintain aggregated hierarchical stats.
> > > Counters are maintained for hierarchical stats at each memcg. Rstat
> > > tracks which cgroups have updates on which cpus to keep those counters
> > > fresh on the read-side.
> > >
> > > Non-hierarchical stats are currently not covered by rstat. Their
> > > per-cpu counters are summed up on every read, which is expensive.
> > > The original implementation did the same. At some point before rstat,
> > > non-hierarchical aggregated counters were introduced by
> > > commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in
> > > memory.stat reporting"). However, those counters were updated on the
> > > performance critical write-side, which caused regressions, so they were
> > > later removed by commit 815744d75152 ("mm: memcontrol: don't batch
> > > updates of local VM stats and events"). See [1] for more detailed
> > > history.
> > >
> > > Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a
> > > half) enjoyed cheap reads of non-hierarchical stats, specifically on
> > > cgroup v1. When moving to more recent kernels, a performance regression
> > > for reading non-hierarchical stats is observed.
> > >
> > > Now that we have rstat, we know exactly which percpu counters have
> > > updates for each stat. We can maintain non-hierarchical counters again,
> > > making reads much more efficient, without affecting the performance
> > > critical write-side. Hence, add non-hierarchical (i.e local) counters
> > > for the stats, and extend rstat flushing to keep those up-to-date.
> > >
> > > A caveat is that we now need a stats flush before reading
> > > local/non-hierarchical stats through {memcg/lruvec}_page_state_local()
> > > or memcg_events_local(), where we previously only needed a flush to
> > > read hierarchical stats. Most contexts reading non-hierarchical stats
> > > are already doing a flush, add a flush to the only missing context in
> > > count_shadow_nodes().
> > >
> > > With this patch, reading memory.stat from 1000 memcgs is 3x faster on a
> > > machine with 256 cpus on cgroup v1:
> > > # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done
> > > # time cat /dev/cgroup/memory/cg*/memory.stat > /dev/null
> > > real 0m0.125s
> > > user 0m0.005s
> > > sys 0m0.120s
> > >
> > > After:
> > > real 0m0.032s
> > > user 0m0.005s
> > > sys 0m0.027s
> >
> > Have you measured any potential regression for cgroup v2 which collects
> > all this data without ever using it (AFAICS)?
>
> I did not. I did not expect noticeable regressions given that all the
> extra work is done during flushing, which should mostly be done by the
> asynchronous worker, but can also happen in the stats reading context.
> Let me run the same script on cgroup v2 just in case and report back.

A few runs on mm-unstable with this patch:

# time cat /sys/fs/cgroup/cg*/memory.stat > /dev/null
real 0m0.020s
user 0m0.005s
sys 0m0.015s

# time cat /sys/fs/cgroup/cg*/memory.stat > /dev/null
real 0m0.017s
user 0m0.005s
sys 0m0.012s

# time cat /sys/fs/cgroup/cg*/memory.stat > /dev/null
real 0m0.016s
user 0m0.004s
sys 0m0.012s

A few runs on mm-unstable with the patch reverted:

# time cat /sys/fs/cgroup/cg*/memory.stat > /dev/null
real 0m0.020s
user 0m0.005s
sys 0m0.015s

# time cat /sys/fs/cgroup/cg*/memory.stat > /dev/null
real 0m0.016s
user 0m0.004s
sys 0m0.012s

# time cat /sys/fs/cgroup/cg*/memory.stat > /dev/null
real 0m0.017s
user 0m0.005s
sys 0m0.012s

It looks like there are no regressions on cgroup v2 when reading the
stats. Please let me know if you want me to send a new version with
the cgroup v2 results as well in the commit log -- or I can just send
a new commit log. Whatever is easier for Andrew.

>
> > --
> > Michal Hocko
> > SUSE Labs

2023-08-02 08:06:00

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH v3] mm: memcg: use rstat for non-hierarchical stats

On Tue 01-08-23 10:29:39, Yosry Ahmed wrote:
> On Tue, Aug 1, 2023 at 9:39 AM Yosry Ahmed <[email protected]> wrote:
[...]
> > > Have you measured any potential regression for cgroup v2 which collects
> > > all this data without ever using it (AFAICS)?
> >
> > I did not. I did not expect noticeable regressions given that all the
> > extra work is done during flushing, which should mostly be done by the
> > asynchronous worker, but can also happen in the stats reading context.
> > Let me run the same script on cgroup v2 just in case and report back.
>
> A few runs on mm-unstable with this patch:
>
> # time cat /sys/fs/cgroup/cg*/memory.stat > /dev/null

Is this really a representative test to make? I would have expected the
overhead to be mostly in mem_cgroup_css_rstat_flush (if it is visible
at all, of course). This would more likely be visible in an
all-cpus-busy situation (you can try a heavy parallel kernel build from
tmpfs, for example).
[...]

> It looks like there are no regressions on cgroup v2 when reading the
> stats. Please let me know if you want me to send a new version with
> the cgroup v2 results as well in the commit log -- or I can just send
> a new commit log. Whatever is easier for Andrew.

Updating the changelog should be good enough.
--
Michal Hocko
SUSE Labs

2023-08-02 08:45:14

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3] mm: memcg: use rstat for non-hierarchical stats

On Wed, Aug 2, 2023 at 12:40 AM Michal Hocko <[email protected]> wrote:
>
> On Tue 01-08-23 10:29:39, Yosry Ahmed wrote:
> > On Tue, Aug 1, 2023 at 9:39 AM Yosry Ahmed <[email protected]> wrote:
> [...]
> > > > Have you measured any potential regression for cgroup v2 which collects
> > > > all this data without ever using it (AFAICS)?
> > >
> > > I did not. I did not expect noticeable regressions given that all the
> > > extra work is done during flushing, which should mostly be done by the
> > > asynchronous worker, but can also happen in the stats reading context.
> > > Let me run the same script on cgroup v2 just in case and report back.
> >
> > A few runs on mm-unstable with this patch:
> >
> > # time cat /sys/fs/cgroup/cg*/memory.stat > /dev/null
>
> Is this really representative test to make? I would have expected the
> overhead would be mostly in mem_cgroup_css_rstat_flush (if it is visible
> at all of course). This would be more likely visible in all cpus busy
> situation (you can try heavy parallel kernel build from tmpfs for
> example).


I see. You are more worried about asynchronous flushing eating cpu
time rather than the synchronous flushing being slower. In fact, my
test is actually not representative at all because probably most of
the cgroups either do not have updates or the asynchronous flusher got
to them first.

Let me try a workload that is more parallel & cpu intensive and report
back. I am thinking of parallel reclaim/refault loops since both
reclaim and refault paths invoke stat updates and stat flushing.

>
> [...]
>
> > It looks like there are no regressions on cgroup v2 when reading the
> > stats. Please let me know if you want me to send a new version with
> > the cgroup v2 results as well in the commit log -- or I can just send
> > a new commit log. Whatever is easier for Andrew.
>
> Updating the changelog should be good enough.
> --
> Michal Hocko
> SUSE Labs

2023-08-02 22:59:56

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3] mm: memcg: use rstat for non-hierarchical stats

On Wed, Aug 2, 2023 at 1:11 AM Yosry Ahmed <[email protected]> wrote:
>
> On Wed, Aug 2, 2023 at 12:40 AM Michal Hocko <[email protected]> wrote:
> >
> > On Tue 01-08-23 10:29:39, Yosry Ahmed wrote:
> > > On Tue, Aug 1, 2023 at 9:39 AM Yosry Ahmed <[email protected]> wrote:
> > [...]
> > > > > Have you measured any potential regression for cgroup v2 which collects
> > > > > all this data without ever using it (AFAICS)?
> > > >
> > > > I did not. I did not expect noticeable regressions given that all the
> > > > extra work is done during flushing, which should mostly be done by the
> > > > asynchronous worker, but can also happen in the stats reading context.
> > > > Let me run the same script on cgroup v2 just in case and report back.
> > >
> > > A few runs on mm-unstable with this patch:
> > >
> > > # time cat /sys/fs/cgroup/cg*/memory.stat > /dev/null
> >
> > Is this really representative test to make? I would have expected the
> > overhead would be mostly in mem_cgroup_css_rstat_flush (if it is visible
> > at all of course). This would be more likely visible in all cpus busy
> > situation (you can try heavy parallel kernel build from tmpfs for
> > example).
>
>
> I see. You are more worried about asynchronous flushing eating cpu
> time rather than the synchronous flushing being slower. In fact, my
> test is actually not representative at all because probably most of
> the cgroups either do not have updates or the asynchronous flusher got
> to them first.
>
> Let me try a workload that is more parallel & cpu intensive and report
> back. I am thinking of parallel reclaim/refault loops since both
> reclaim and refault paths invoke stat updates and stat flushing.
>

I am back with more data.

So I wrote a small reclaim/refault stress test that creates (NR_CPUS *
2) cgroups, assigns them limits, runs a worker process in each cgroup
that allocates tmpfs memory equal to quadruple the limit (to invoke
reclaim) continuously, and then reads back the entire file (to invoke
refaults). All workers are run in parallel, and zram is used as a
swapping backend. Both reclaim and refault have conditional stats
flushing. I ran this on a machine with 112 cpus, once on mm-unstable,
and once on mm-unstable with this patch reverted. The script is
attached.

(1) A few runs without this patch:

# time ./stress_reclaim_refault.sh
real 0m9.949s
user 0m0.496s
sys 14m44.974s

# time ./stress_reclaim_refault.sh
real 0m10.049s
user 0m0.486s
sys 14m55.791s

# time ./stress_reclaim_refault.sh
real 0m9.984s
user 0m0.481s
sys 14m53.841s

(2) A few runs with this patch:

# time ./stress_reclaim_refault.sh
real 0m9.885s
user 0m0.486s
sys 14m48.753s

# time ./stress_reclaim_refault.sh
real 0m9.903s
user 0m0.495s
sys 14m48.339s

# time ./stress_reclaim_refault.sh
real 0m9.861s
user 0m0.507s
sys 14m49.317s

I do not see any regressions from this patch. There is actually a very
slight improvement. If I have to guess, maybe it's because we avoid
the percpu loop in count_shadow_nodes() when calling
lruvec_page_state_local(), but I could not prove this using perf, it's
probably in the noise.

Let me know if the testing is satisfactory for you. I can send an
updated commit log accordingly with a summary of this conversation.

> > --
> > Michal Hocko
> > SUSE Labs


Attachments:
stress_reclaim_refault.sh (1.04 kB)

2023-08-03 16:50:06

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH v3] mm: memcg: use rstat for non-hierarchical stats

On Wed 02-08-23 15:02:55, Yosry Ahmed wrote:
[...]
> Let me know if the testing is satisfactory for you. I can send an
> updated commit log accordingly with a summary of this conversation.

Yes this should be sufficient as it exercises all the CPUs so the
overhead in flushing should be visible if this was a real deal. I would
have gone with kernel build test as that has a broader code coverage but
this artificial test should give some red flags as well. So good enough.
Amending the changelog with this would be helpful as well so that future
us and others will know what kind of testing has been done.

Acked-by: Michal Hocko <[email protected]>

>
> > > --
> > > Michal Hocko
> > > SUSE Labs

> #!/bin/bash
>
> NR_CPUS=$(getconf _NPROCESSORS_ONLN)
> NR_CGROUPS=$(( NR_CPUS * 2 ))
> TEST_MB=50
> TOTAL_MB=$((TEST_MB * NR_CGROUPS))
> TMPFS=$(mktemp -d)
> ROOT="/sys/fs/cgroup/"
> ZRAM_DEV="/mnt/devtmpfs/zram0"
>
> cleanup() {
> umount $TMPFS
> rm -rf $TMPFS
> for i in $(seq $NR_CGROUPS); do
> cgroup="$ROOT/cg$i"
> rmdir $cgroup
> done
> swapoff $ZRAM_DEV
> echo 1 > "/sys/block/zram0/reset"
> }
> trap cleanup INT QUIT EXIT
>
> # Setup zram
> echo $((TOTAL_MB << 20)) > "/sys/block/zram0/disksize"
> mkswap $ZRAM_DEV
> swapon $ZRAM_DEV
> echo "Setup zram done"
>
> # Create cgroups, set limits
> echo "+memory" > "$ROOT/cgroup.subtree_control"
> for i in $(seq $NR_CGROUPS); do
> cgroup="$ROOT/cg$i"
> mkdir $cgroup
> echo $(( (TEST_MB << 20) / 4)) > "$cgroup/memory.max"
> done
> echo "Setup cgroups done"
>
> # Start workers to allocate tmpfs memory
> mount -t tmpfs none $TMPFS
> for i in $(seq $NR_CGROUPS); do
> cgroup="$ROOT/cg$i"
> f="$TMPFS/tmp$i"
> (echo 0 > "$cgroup/cgroup.procs" &&
> dd if=/dev/zero of=$f bs=1M count=$TEST_MB status=none &&
> cat $f > /dev/null)&
> done
>
> # Wait for workers
> wait


--
Michal Hocko
SUSE Labs

2023-08-03 19:17:59

by Yosry Ahmed

[permalink] [raw]
Subject: Re: [PATCH v3] mm: memcg: use rstat for non-hierarchical stats

On Thu, Aug 3, 2023 at 7:55 AM Michal Hocko <[email protected]> wrote:
>
> On Wed 02-08-23 15:02:55, Yosry Ahmed wrote:
> [...]
> > Let me know if the testing is satisfactory for you. I can send an
> > updated commit log accordingly with a summary of this conversation.
>
> Yes this should be sufficient as it exercises all the CPUs so the
> overhead in flushing should be visible if this was a real deal. I would
> have gone with kernel build test as that has a broader code coverage but
> this artificial test should give some red flags as well. So good enough.
> Amending the changelog with this would be helpful as well so that future
> us and others will know what kind of testing has been done.
>
> Acked-by: Michal Hocko <[email protected]>

Thanks! I sent a v4 with your Ack and an amended changelog that
describes the testing done and points to the script attached here.

>
> >
> > > > --
> > > > Michal Hocko
> > > > SUSE Labs
>
> > #!/bin/bash
> >
> > NR_CPUS=$(getconf _NPROCESSORS_ONLN)
> > NR_CGROUPS=$(( NR_CPUS * 2 ))
> > TEST_MB=50
> > TOTAL_MB=$((TEST_MB * NR_CGROUPS))
> > TMPFS=$(mktemp -d)
> > ROOT="/sys/fs/cgroup/"
> > ZRAM_DEV="/mnt/devtmpfs/zram0"
> >
> > cleanup() {
> > umount $TMPFS
> > rm -rf $TMPFS
> > for i in $(seq $NR_CGROUPS); do
> > cgroup="$ROOT/cg$i"
> > rmdir $cgroup
> > done
> > swapoff $ZRAM_DEV
> > echo 1 > "/sys/block/zram0/reset"
> > }
> > trap cleanup INT QUIT EXIT
> >
> > # Setup zram
> > echo $((TOTAL_MB << 20)) > "/sys/block/zram0/disksize"
> > mkswap $ZRAM_DEV
> > swapon $ZRAM_DEV
> > echo "Setup zram done"
> >
> > # Create cgroups, set limits
> > echo "+memory" > "$ROOT/cgroup.subtree_control"
> > for i in $(seq $NR_CGROUPS); do
> > cgroup="$ROOT/cg$i"
> > mkdir $cgroup
> > echo $(( (TEST_MB << 20) / 4)) > "$cgroup/memory.max"
> > done
> > echo "Setup cgroups done"
> >
> > # Start workers to allocate tmpfs memory
> > mount -t tmpfs none $TMPFS
> > for i in $(seq $NR_CGROUPS); do
> > cgroup="$ROOT/cg$i"
> > f="$TMPFS/tmp$i"
> > (echo 0 > "$cgroup/cgroup.procs" &&
> > dd if=/dev/zero of=$f bs=1M count=$TEST_MB status=none &&
> > cat $f > /dev/null)&
> > done
> >
> > # Wait for workers
> > wait
>
>
> --
> Michal Hocko
> SUSE Labs