2019-07-17 12:30:28

by Konstantin Khlebnikov

Subject: [PATCH 2/2] mm/memcontrol: split local and nested atomic vmstats/vmevents counters

This is an alternative solution to the problem addressed in commit 815744d75152
("mm: memcontrol: don't batch updates of local VM stats and events").

Instead of adding a second set of percpu counters, which wastes memory and
slows down reading statistics in cgroup-v1, this patch uses two arrays of
atomic counters: local and nested statistics.

An update then performs the same number of atomic operations: one local
update plus one nested update for each parent cgroup. Readers of
hierarchical statistics have to sum two atomics, which isn't a big deal.

All updates are still batched using one set of percpu counters.
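
For illustration, here is a minimal userspace model of the scheme described
above (the names and the fixed array size are placeholders, and the percpu
fast path that does the actual batching is left out -- this is not the kernel
code itself):

/* Simplified model: each cgroup keeps two atomic arrays. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_STAT 4

struct cgroup_stats {
	struct cgroup_stats *parent;
	atomic_long local[NR_STAT];	/* this cgroup's own counts */
	atomic_long nested[NR_STAT];	/* counts flushed up from descendants */
};

/* Slow path, hit only when a percpu batch overflows: one atomic per level. */
static void flush_batch(struct cgroup_stats *cg, int idx, long val)
{
	atomic_fetch_add(&cg->local[idx], val);
	for (cg = cg->parent; cg; cg = cg->parent)
		atomic_fetch_add(&cg->nested[idx], val);
}

/* Hierarchical reader sums two atomics instead of one. */
static long read_hierarchical(struct cgroup_stats *cg, int idx)
{
	return atomic_load(&cg->local[idx]) + atomic_load(&cg->nested[idx]);
}

/* Non-hierarchical ("local") reader needs only the first array. */
static long read_local(struct cgroup_stats *cg, int idx)
{
	return atomic_load(&cg->local[idx]);
}

int main(void)
{
	struct cgroup_stats root = { 0 };
	struct cgroup_stats child = { .parent = &root };

	flush_batch(&child, 0, 32);	/* e.g. one MEMCG_CHARGE_BATCH worth */
	printf("child: local=%ld hierarchical=%ld, root: hierarchical=%ld\n",
	       read_local(&child, 0), read_hierarchical(&child, 0),
	       read_hierarchical(&root, 0));
	return 0;
}

The split is visible in the output: root's hierarchical count includes the
child's flush via the nested array, while root's own local count stays zero.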

Signed-off-by: Konstantin Khlebnikov <[email protected]>
---
include/linux/memcontrol.h | 19 +++++++----------
mm/memcontrol.c | 48 +++++++++++++++++++-------------------------
2 files changed, 29 insertions(+), 38 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 44c41462be33..4dd75d50c200 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -269,16 +269,16 @@ struct mem_cgroup {
atomic_t moving_account;
struct task_struct *move_lock_task;

- /* Legacy local VM stats and events */
- struct memcg_vmstats_percpu __percpu *vmstats_local;
-
/* Subtree VM stats and events (batched updates) */
struct memcg_vmstats_percpu __percpu *vmstats_percpu;

MEMCG_PADDING(_pad2_);

- atomic_long_t vmstats[MEMCG_NR_STAT];
- atomic_long_t vmevents[NR_VM_EVENT_ITEMS];
+ atomic_long_t vmstats_local[MEMCG_NR_STAT];
+ atomic_long_t vmstats_nested[MEMCG_NR_STAT];
+
+ atomic_long_t vmevents_local[NR_VM_EVENT_ITEMS];
+ atomic_long_t vmevents_nested[NR_VM_EVENT_ITEMS];

/* memory.events */
atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
@@ -557,7 +557,8 @@ void unlock_page_memcg(struct page *page);
*/
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
- long x = atomic_long_read(&memcg->vmstats[idx]);
+ long x = atomic_long_read(&memcg->vmstats_local[idx]) +
+ atomic_long_read(&memcg->vmstats_nested[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
@@ -572,11 +573,7 @@ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg,
int idx)
{
- long x = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_local->stat[idx], cpu);
+ long x = atomic_long_read(&memcg->vmstats_local[idx]);
#ifdef CONFIG_SMP
if (x < 0)
x = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06d33dfc4ec4..97debc8e4120 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -695,14 +695,13 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
if (mem_cgroup_disabled())
return;

- __this_cpu_add(memcg->vmstats_local->stat[idx], val);
-
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup *mi;

- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmstats[idx]);
+ atomic_long_add(x, &memcg->vmstats_local[idx]);
+ for (mi = memcg; (mi = parent_mem_cgroup(mi)); )
+ atomic_long_add(x, &mi->vmstats_nested[idx]);
x = 0;
}
__this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
@@ -777,14 +776,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (mem_cgroup_disabled())
return;

- __this_cpu_add(memcg->vmstats_local->events[idx], count);
-
x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
if (unlikely(x > MEMCG_CHARGE_BATCH)) {
struct mem_cgroup *mi;

- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmevents[idx]);
+ atomic_long_add(x, &memcg->vmevents_local[idx]);
+ for (mi = memcg; (mi = parent_mem_cgroup(mi)); )
+ atomic_long_add(x, &mi->vmevents_nested[idx]);
x = 0;
}
__this_cpu_write(memcg->vmstats_percpu->events[idx], x);
@@ -792,17 +790,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,

static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
- return atomic_long_read(&memcg->vmevents[event]);
+ return atomic_long_read(&memcg->vmevents_local[event]) +
+ atomic_long_read(&memcg->vmevents_nested[event]);
}

static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
- long x = 0;
- int cpu;
-
- for_each_possible_cpu(cpu)
- x += per_cpu(memcg->vmstats_local->events[event], cpu);
- return x;
+ return atomic_long_read(&memcg->vmevents_local[event]);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -2257,9 +2251,11 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
long x;

x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmstats[i]);
+ if (x) {
+ atomic_long_add(x, &memcg->vmstats_local[i]);
+ for (mi = memcg; (mi = parent_mem_cgroup(mi)); )
+ atomic_long_add(x, &mi->vmstats_nested[i]);
+ }

if (i >= NR_VM_NODE_STAT_ITEMS)
continue;
@@ -2280,9 +2276,11 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
long x;

x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
- if (x)
- for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- atomic_long_add(x, &mi->vmevents[i]);
+ if (x) {
+ atomic_long_add(x, &memcg->vmevents_local[i]);
+ for (mi = memcg; (mi = parent_mem_cgroup(mi)); )
+ atomic_long_add(x, &mi->vmevents_nested[i]);
+ }
}
}

@@ -4085,7 +4083,8 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
*/
static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
{
- long x = atomic_long_read(&memcg->vmstats[idx]);
+ long x = atomic_long_read(&memcg->vmstats_local[idx]) +
+ atomic_long_read(&memcg->vmstats_nested[idx]);
int cpu;

for_each_online_cpu(cpu)
@@ -4638,7 +4637,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->vmstats_percpu);
- free_percpu(memcg->vmstats_local);
kfree(memcg);
}

@@ -4667,10 +4665,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
if (memcg->id.id < 0)
goto fail;

- memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
- if (!memcg->vmstats_local)
- goto fail;
-
memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
if (!memcg->vmstats_percpu)
goto fail;


2019-07-17 17:55:38

by Johannes Weiner

Subject: Re: [PATCH 2/2] mm/memcontrol: split local and nested atomic vmstats/vmevents counters

On Wed, Jul 17, 2019 at 03:29:19PM +0300, Konstantin Khlebnikov wrote:
> This is an alternative solution to the problem addressed in commit 815744d75152
> ("mm: memcontrol: don't batch updates of local VM stats and events").
>
> Instead of adding a second set of percpu counters, which wastes memory and
> slows down reading statistics in cgroup-v1, this patch uses two arrays of
> atomic counters: local and nested statistics.
>
> An update then performs the same number of atomic operations: one local
> update plus one nested update for each parent cgroup. Readers of
> hierarchical statistics have to sum two atomics, which isn't a big deal.
>
> All updates are still batched using one set of percpu counters.
>
> Signed-off-by: Konstantin Khlebnikov <[email protected]>

Yeah that looks better. Note that it was never about the atomics,
though, but rather the number of cachelines dirtied. Your patch should
solve this problem as well, but it might be a good idea to run
will-it-scale on it to make sure the struct layout is still fine.
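
One cheap way to sanity-check the layout part, independent of the benchmark,
is to dump the field offsets (pahole on vmlinux does this for the real
struct). Below is a toy standalone version with placeholder names and sizes,
only to show the idea of keeping the hot percpu pointer on a different cache
line from the enlarged atomic arrays:

#include <stdalign.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define CACHE_LINE 64
#define NR_STAT 40			/* placeholder for MEMCG_NR_STAT */

struct toy_memcg {
	void *vmstats_percpu;		/* dirtied on every batched update */
	alignas(CACHE_LINE)
	atomic_long vmstats_local[NR_STAT];	/* written on overflow only */
	atomic_long vmstats_nested[NR_STAT];
};

int main(void)
{
	printf("vmstats_percpu @ %zu\n", offsetof(struct toy_memcg, vmstats_percpu));
	printf("vmstats_local  @ %zu\n", offsetof(struct toy_memcg, vmstats_local));
	printf("vmstats_nested @ %zu\n", offsetof(struct toy_memcg, vmstats_nested));
	printf("sizeof = %zu bytes (%zu cache lines)\n",
	       sizeof(struct toy_memcg),
	       (sizeof(struct toy_memcg) + CACHE_LINE - 1) / CACHE_LINE);
	return 0;
}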

2019-07-18 15:08:59

by Konstantin Khlebnikov

Subject: Re: [PATCH 2/2] mm/memcontrol: split local and nested atomic vmstats/vmevents counters

On 17.07.2019 20:53, Johannes Weiner wrote:
> On Wed, Jul 17, 2019 at 03:29:19PM +0300, Konstantin Khlebnikov wrote:
>> This is an alternative solution to the problem addressed in commit 815744d75152
>> ("mm: memcontrol: don't batch updates of local VM stats and events").
>>
>> Instead of adding a second set of percpu counters, which wastes memory and
>> slows down reading statistics in cgroup-v1, this patch uses two arrays of
>> atomic counters: local and nested statistics.
>>
>> An update then performs the same number of atomic operations: one local
>> update plus one nested update for each parent cgroup. Readers of
>> hierarchical statistics have to sum two atomics, which isn't a big deal.
>>
>> All updates are still batched using one set of percpu counters.
>>
>> Signed-off-by: Konstantin Khlebnikov <[email protected]>
>
> Yeah that looks better. Note that it was never about the atomics,
> though, but rather the number of cachelines dirtied. Your patch should
> solve this problem as well, but it might be a good idea to run
> will-it-scale on it to make sure the struct layout is still fine.
>

Looks like this patch shows a 2% regression on the 24-core, 2-NUMA-node
machine I have. Completely removing these counters gives a 2% boost.
Also, I cannot reproduce the regression fixed by commit 815744d75152 -
reverting it has no effect.

So, feel free to ignore the second patch. I'll play with this a little more.

Maybe atomic per-NUMA counters could give a nice balance between scalability and overhead.
Ideally this memory could be mapped in a per-cpu manner to give atomic access via fs/gs.
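
To make that idea concrete, a rough userspace model of per-NUMA-node atomic
counters (node count, node-id plumbing and per-node padding are all
placeholders; the fs/gs mapping trick is not modeled):

#include <stdatomic.h>
#include <stdio.h>

#define NR_NODES 2	/* placeholder for the machine's NUMA node count */
#define NR_STAT  4

/*
 * One atomic counter array per NUMA node.  A real implementation would pad
 * each node's row to cache-line size to avoid false sharing across nodes.
 */
static atomic_long vmstats[NR_NODES][NR_STAT];

/* Writer: contends only with CPUs on the same node. */
static void mod_stat(int node, int idx, long val)
{
	atomic_fetch_add(&vmstats[node][idx], val);
}

/* Reader: sums across nodes, cheap as long as the node count stays small. */
static long read_stat(int idx)
{
	long sum = 0;

	for (int node = 0; node < NR_NODES; node++)
		sum += atomic_load(&vmstats[node][idx]);
	return sum;
}

int main(void)
{
	mod_stat(0, 0, 5);
	mod_stat(1, 0, 7);
	printf("stat[0] = %ld\n", read_stat(0));
	return 0;
}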