2009-11-06 08:55:18

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: [PATCH 0/2] memcg make use of new percpu implementations

Hi,

Recent updates on dynamic percpu allocation looks good and I tries to rewrite
memcg's poor implementation of percpu status counter.
(It's not NUMA-aware ...)
Thanks for great works.

For this time. I added Christoph to CC because I'm not fully sure my usage of
__this_cpu_xxx is correct...I'm glad if you check the usage when you have time.


Patch 1/2 is just clean up (prepare for patch 2/2)
Patch 2/2 is for percpu.

Tested on my 8cpu box and works well.
Pathcesa are against the latest mmotm.

Thanks,
-Kame


2009-11-06 08:56:41

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: [PATCH 1/2] memcg : rename index to short name

From: KAMEZAWA Hiroyuki <[email protected]>

Before touching percpu counters on memcg, rename status to be short name.
Current too long name is my mistake, sorry.

By this, a change around percpu statistics can be in a line.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 70 ++++++++++++++++++++++++++------------------------------
1 file changed, 33 insertions(+), 37 deletions(-)

Index: mmotm-2.6.32-Nov2/mm/memcontrol.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memcontrol.c
+++ mmotm-2.6.32-Nov2/mm/memcontrol.c
@@ -65,19 +65,19 @@ enum mem_cgroup_stat_index {
/*
* For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
*/
- MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
- MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
- MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
- MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
- MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
- MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
- MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
+ MEMCG_NR_CACHE, /* # of pages charged as cache */
+ MEMCG_NR_RSS, /* # of pages charged as anon rss */
+ MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */
+ MEMCG_PGPGIN, /* # of pages paged in */
+ MEMCG_PGPGOUT, /* # of pages paged out */
+ MEMCG_EVENTS, /* sum of pagein + pageout for internal use */
+ MEMCG_NR_SWAP, /* # of pages, swapped out */

- MEM_CGROUP_STAT_NSTATS,
+ MEMCG_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
- s64 count[MEM_CGROUP_STAT_NSTATS];
+ s64 count[MEMCG_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
@@ -121,8 +121,8 @@ static s64 mem_cgroup_local_usage(struct
{
s64 ret;

- ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
- ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
+ ret = mem_cgroup_read_stat(stat, MEMCG_NR_CACHE);
+ ret += mem_cgroup_read_stat(stat, MEMCG_NR_RSS);
return ret;
}

@@ -376,9 +376,9 @@ static bool mem_cgroup_soft_limit_check(

cpu = get_cpu();
cpustat = &mem->stat.cpustat[cpu];
- val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
+ val = __mem_cgroup_stat_read_local(cpustat, MEMCG_EVENTS);
if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
- __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
+ __mem_cgroup_stat_reset_safe(cpustat, MEMCG_EVENTS);
ret = true;
}
put_cpu();
@@ -486,7 +486,7 @@ static void mem_cgroup_swap_statistics(s
int cpu = get_cpu();

cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
+ __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_SWAP, val);
put_cpu();
}

@@ -501,17 +501,15 @@ static void mem_cgroup_charge_statistics

cpustat = &stat->cpustat[cpu];
if (PageCgroupCache(pc))
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
+ __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_CACHE, val);
else
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
+ __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_RSS, val);

if (charge)
- __mem_cgroup_stat_add_safe(cpustat,
- MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
+ __mem_cgroup_stat_add_safe(cpustat, MEMCG_PGPGIN, 1);
else
- __mem_cgroup_stat_add_safe(cpustat,
- MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
+ __mem_cgroup_stat_add_safe(cpustat, MEMCG_PGPGOUT, 1);
+ __mem_cgroup_stat_add_safe(cpustat, MEMCG_EVENTS, 1);
put_cpu();
}

@@ -1254,7 +1252,7 @@ void mem_cgroup_update_file_mapped(struc
stat = &mem->stat;
cpustat = &stat->cpustat[cpu];

- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
+ __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, val);
done:
unlock_page_cgroup(pc);
}
@@ -1656,14 +1654,12 @@ static int mem_cgroup_move_account(struc
/* Update mapped_file data for mem_cgroup "from" */
stat = &from->stat;
cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
- -1);
+ __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, -1);

/* Update mapped_file data for mem_cgroup "to" */
stat = &to->stat;
cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
- 1);
+ __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, 1);
}

if (do_swap_account && !mem_cgroup_is_root(from))
@@ -2746,10 +2742,10 @@ static u64 mem_cgroup_read(struct cgroup
case _MEM:
if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_CACHE, &idx_val);
+ MEMCG_NR_CACHE, &idx_val);
val = idx_val;
mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_RSS, &idx_val);
+ MEMCG_NR_RSS, &idx_val);
val += idx_val;
val <<= PAGE_SHIFT;
} else
@@ -2758,13 +2754,13 @@ static u64 mem_cgroup_read(struct cgroup
case _MEMSWAP:
if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_CACHE, &idx_val);
+ MEMCG_NR_CACHE, &idx_val);
val = idx_val;
mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_RSS, &idx_val);
+ MEMCG_NR_RSS, &idx_val);
val += idx_val;
mem_cgroup_get_recursive_idx_stat(mem,
- MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+ MEMCG_NR_SWAP, &idx_val);
val <<= PAGE_SHIFT;
} else
val = res_counter_read_u64(&mem->memsw, name);
@@ -2924,18 +2920,18 @@ static int mem_cgroup_get_local_stat(str
s64 val;

/* per cpu stat */
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
+ val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_CACHE);
s->stat[MCS_CACHE] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
+ val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+ val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
+ val = mem_cgroup_read_stat(&mem->stat, MEMCG_PGPGIN);
s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
+ val = mem_cgroup_read_stat(&mem->stat, MEMCG_PGPGOUT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
- val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
+ val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_SWAP);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}

2009-11-06 08:58:18

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: [PATCH 2/2] memcg : rewrite percpu countings with new interfaces

From: KAMEZAWA Hiroyuki <[email protected]>

Now, alloc_percpu() alloc good dynamic allocations and
Recent updates on percpu.h gives us following kind of ops
- __this_cpu_add() etc...
This is designed to be a help for reduce code size in hot-path
and very useful to handle percpu area. Thanks for great works.

This patch rewrite memcg's (not-good) percpu status with new
percpu support macros. This decreases code size and instruction
size. By this, this area is now NUMA-aware and may have performance
benefit.

I got good result in parallel pagefault test. (my host is 8cpu/2socket)

before==
Performance counter stats for './runpause.sh' (5 runs):

474070.055912 task-clock-msecs # 7.881 CPUs ( +- 0.013% )
35829310 page-faults # 0.076 M/sec ( +- 0.217% )
3803016722 cache-references # 8.022 M/sec ( +- 0.215% ) (scaled from 100.00%)
1104083123 cache-misses # 2.329 M/sec ( +- 0.961% ) (scaled from 100.00%)

60.154982314 seconds time elapsed ( +- 0.018% )

after==
Performance counter stats for './runpause.sh' (5 runs):

474919.429670 task-clock-msecs # 7.896 CPUs ( +- 0.013% )
36520440 page-faults # 0.077 M/sec ( +- 1.854% )
3109834751 cache-references # 6.548 M/sec ( +- 0.276% )
1053275160 cache-misses # 2.218 M/sec ( +- 0.036% )

60.146585280 seconds time elapsed ( +- 0.019% )

This test is affected by cpu-utilization but I think more improvements
will be found in bigger system.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 168 +++++++++++++++++++-------------------------------------
1 file changed, 58 insertions(+), 110 deletions(-)

Index: mmotm-2.6.32-Nov2/mm/memcontrol.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memcontrol.c
+++ mmotm-2.6.32-Nov2/mm/memcontrol.c
@@ -39,6 +39,7 @@
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
+#include <linux/percpu.h>
#include "internal.h"

#include <asm/uaccess.h>
@@ -78,54 +79,8 @@ enum mem_cgroup_stat_index {

struct mem_cgroup_stat_cpu {
s64 count[MEMCG_STAT_NSTATS];
-} ____cacheline_aligned_in_smp;
-
-struct mem_cgroup_stat {
- struct mem_cgroup_stat_cpu cpustat[0];
};

-static inline void
-__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
- enum mem_cgroup_stat_index idx)
-{
- stat->count[idx] = 0;
-}
-
-static inline s64
-__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
- enum mem_cgroup_stat_index idx)
-{
- return stat->count[idx];
-}
-
-/*
- * For accounting under irq disable, no need for increment preempt count.
- */
-static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
- enum mem_cgroup_stat_index idx, int val)
-{
- stat->count[idx] += val;
-}
-
-static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
- enum mem_cgroup_stat_index idx)
-{
- int cpu;
- s64 ret = 0;
- for_each_possible_cpu(cpu)
- ret += stat->cpustat[cpu].count[idx];
- return ret;
-}
-
-static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
-{
- s64 ret;
-
- ret = mem_cgroup_read_stat(stat, MEMCG_NR_CACHE);
- ret += mem_cgroup_read_stat(stat, MEMCG_NR_RSS);
- return ret;
-}
-
/*
* per-zone information in memory controller.
*/
@@ -226,10 +181,7 @@ struct mem_cgroup {
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;

- /*
- * statistics. This must be placed at the end of memcg.
- */
- struct mem_cgroup_stat stat;
+ struct mem_cgroup_stat_cpu *cpustat;
};

/*
@@ -370,18 +322,13 @@ mem_cgroup_remove_exceeded(struct mem_cg
static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
{
bool ret = false;
- int cpu;
s64 val;
- struct mem_cgroup_stat_cpu *cpustat;

- cpu = get_cpu();
- cpustat = &mem->stat.cpustat[cpu];
- val = __mem_cgroup_stat_read_local(cpustat, MEMCG_EVENTS);
+ val = __this_cpu_read(mem->cpustat->count[MEMCG_EVENTS]);
if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
- __mem_cgroup_stat_reset_safe(cpustat, MEMCG_EVENTS);
+ __this_cpu_write(mem->cpustat->count[MEMCG_EVENTS], 0);
ret = true;
}
- put_cpu();
return ret;
}

@@ -477,17 +424,35 @@ mem_cgroup_largest_soft_limit_node(struc
return mz;
}

+static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
+{
+ struct mem_cgroup_stat_cpu *cstat;
+ int cpu;
+ s64 ret = 0;
+
+ for_each_possible_cpu(cpu) {
+ cstat = per_cpu_ptr(mem->cpustat, cpu);
+ ret += cstat->count[idx];
+ }
+ return ret;
+}
+
+static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+{
+ s64 ret;
+
+ ret = mem_cgroup_read_stat(mem, MEMCG_NR_CACHE);
+ ret += mem_cgroup_read_stat(mem, MEMCG_NR_RSS);
+ return ret;
+}
+
static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
bool charge)
{
int val = (charge) ? 1 : -1;
- struct mem_cgroup_stat *stat = &mem->stat;
- struct mem_cgroup_stat_cpu *cpustat;
- int cpu = get_cpu();

- cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_SWAP, val);
- put_cpu();
+ __this_cpu_add(mem->cpustat->count[MEMCG_NR_SWAP], val);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -495,22 +460,17 @@ static void mem_cgroup_charge_statistics
bool charge)
{
int val = (charge) ? 1 : -1;
- struct mem_cgroup_stat *stat = &mem->stat;
- struct mem_cgroup_stat_cpu *cpustat;
- int cpu = get_cpu();

- cpustat = &stat->cpustat[cpu];
if (PageCgroupCache(pc))
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_CACHE, val);
+ __this_cpu_add(mem->cpustat->count[MEMCG_NR_CACHE], val);
else
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_RSS, val);
+ __this_cpu_add(mem->cpustat->count[MEMCG_NR_RSS], val);

if (charge)
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_PGPGIN, 1);
+ __this_cpu_inc(mem->cpustat->count[MEMCG_PGPGIN]);
else
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_PGPGOUT, 1);
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_EVENTS, 1);
- put_cpu();
+ __this_cpu_inc(mem->cpustat->count[MEMCG_PGPGOUT]);
+ __this_cpu_inc(mem->cpustat->count[MEMCG_EVENTS]);
}

static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -1162,7 +1122,7 @@ static int mem_cgroup_hierarchical_recla
}
}
}
- if (!mem_cgroup_local_usage(&victim->stat)) {
+ if (!mem_cgroup_local_usage(victim)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
continue;
@@ -1228,9 +1188,6 @@ static void record_last_oom(struct mem_c
void mem_cgroup_update_file_mapped(struct page *page, int val)
{
struct mem_cgroup *mem;
- struct mem_cgroup_stat *stat;
- struct mem_cgroup_stat_cpu *cpustat;
- int cpu;
struct page_cgroup *pc;

pc = lookup_page_cgroup(page);
@@ -1245,14 +1202,7 @@ void mem_cgroup_update_file_mapped(struc
if (!PageCgroupUsed(pc))
goto done;

- /*
- * Preemption is already disabled, we don't need get_cpu()
- */
- cpu = smp_processor_id();
- stat = &mem->stat;
- cpustat = &stat->cpustat[cpu];
-
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, val);
+ __this_cpu_add(mem->cpustat->count[MEMCG_NR_FILE_MAPPED], val);
done:
unlock_page_cgroup(pc);
}
@@ -1623,9 +1573,6 @@ static int mem_cgroup_move_account(struc
int nid, zid;
int ret = -EBUSY;
struct page *page;
- int cpu;
- struct mem_cgroup_stat *stat;
- struct mem_cgroup_stat_cpu *cpustat;

VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
@@ -1650,16 +1597,11 @@ static int mem_cgroup_move_account(struc

page = pc->page;
if (page_mapped(page) && !PageAnon(page)) {
- cpu = smp_processor_id();
/* Update mapped_file data for mem_cgroup "from" */
- stat = &from->stat;
- cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, -1);
+ __this_cpu_dec(from->cpustat->count[MEMCG_NR_FILE_MAPPED]);

/* Update mapped_file data for mem_cgroup "to" */
- stat = &to->stat;
- cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, 1);
+ __this_cpu_inc(to->cpustat->count[MEMCG_NR_FILE_MAPPED]);
}

if (do_swap_account && !mem_cgroup_is_root(from))
@@ -2715,7 +2657,7 @@ static int
mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
{
struct mem_cgroup_idx_data *d = data;
- d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
+ d->val += mem_cgroup_read_stat(mem, d->idx);
return 0;
}

@@ -2920,18 +2862,18 @@ static int mem_cgroup_get_local_stat(str
s64 val;

/* per cpu stat */
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_CACHE);
+ val = mem_cgroup_read_stat(mem, MEMCG_NR_CACHE);
s->stat[MCS_CACHE] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_RSS);
+ val = mem_cgroup_read_stat(mem, MEMCG_NR_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_FILE_MAPPED);
+ val = mem_cgroup_read_stat(mem, MEMCG_NR_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_PGPGIN);
+ val = mem_cgroup_read_stat(mem, MEMCG_PGPGIN);
s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_PGPGOUT);
+ val = mem_cgroup_read_stat(mem, MEMCG_PGPGOUT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_SWAP);
+ val = mem_cgroup_read_stat(mem, MEMCG_NR_SWAP);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}

@@ -3190,17 +3132,12 @@ static void free_mem_cgroup_per_zone_inf
kfree(mem->info.nodeinfo[node]);
}

-static int mem_cgroup_size(void)
-{
- int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
- return sizeof(struct mem_cgroup) + cpustat_size;
-}
-
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *mem;
- int size = mem_cgroup_size();
+ int size = sizeof(struct mem_cgroup);

+ /* Can be very big if MAX_NUMNODES is big */
if (size < PAGE_SIZE)
mem = kmalloc(size, GFP_KERNEL);
else
@@ -3208,6 +3145,15 @@ static struct mem_cgroup *mem_cgroup_all

if (mem)
memset(mem, 0, size);
+ mem->cpustat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!mem->cpustat) {
+ if (size < PAGE_SIZE)
+ kfree(mem);
+ else
+ vfree(mem);
+ mem = NULL;
+ }
+
return mem;
}

@@ -3232,7 +3178,9 @@ static void __mem_cgroup_free(struct mem
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);

- if (mem_cgroup_size() < PAGE_SIZE)
+ free_percpu(mem->cpustat);
+
+ if (sizeof(struct mem_cgroup) < PAGE_SIZE)
kfree(mem);
else
vfree(mem);

2009-11-06 17:29:11

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH 2/2] memcg : rewrite percpu countings with new interfaces

On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:

> @@ -370,18 +322,13 @@ mem_cgroup_remove_exceeded(struct mem_cg
> static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
> {
> bool ret = false;
> - int cpu;
> s64 val;
> - struct mem_cgroup_stat_cpu *cpustat;
>
> - cpu = get_cpu();
> - cpustat = &mem->stat.cpustat[cpu];
> - val = __mem_cgroup_stat_read_local(cpustat, MEMCG_EVENTS);
> + val = __this_cpu_read(mem->cpustat->count[MEMCG_EVENTS]);
> if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
> - __mem_cgroup_stat_reset_safe(cpustat, MEMCG_EVENTS);
> + __this_cpu_write(mem->cpustat->count[MEMCG_EVENTS], 0);
> ret = true;
> }
> - put_cpu();
> return ret;

If you want to use the __this_cpu_xx versions then you need to manage
preempt on your own.

You need to keep preempt_disable/enable here because otherwise the per
cpu variable zeroed may be on a different cpu than the per cpu variable
where you got the value from.

> +static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
> + enum mem_cgroup_stat_index idx)
> +{
> + struct mem_cgroup_stat_cpu *cstat;
> + int cpu;
> + s64 ret = 0;
> +
> + for_each_possible_cpu(cpu) {
> + cstat = per_cpu_ptr(mem->cpustat, cpu);
> + ret += cstat->count[idx];
> + }

== ret += per_cpu(mem->cpustat->cstat->count[idx], cpu)

> static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
> bool charge)
> {
> int val = (charge) ? 1 : -1;
> - struct mem_cgroup_stat *stat = &mem->stat;
> - struct mem_cgroup_stat_cpu *cpustat;
> - int cpu = get_cpu();
>
> - cpustat = &stat->cpustat[cpu];
> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_SWAP, val);
> - put_cpu();
> + __this_cpu_add(mem->cpustat->count[MEMCG_NR_SWAP], val);
> }

You do not disable preempt on your own so you have to use

this_cpu_add()

There is no difference between __this_cpu_add and this_cpu_add on x86 but
they will differ on platforms that do not have atomic per cpu
instructions. The fallback for this_cpu_add is to protect the add with
preempt_disable()/enable. The fallback fro __this_cpu_add is just to rely
on the caller to ensure that preempt is disabled somehow.


> @@ -495,22 +460,17 @@ static void mem_cgroup_charge_statistics
> bool charge)
> {
> int val = (charge) ? 1 : -1;
> - struct mem_cgroup_stat *stat = &mem->stat;
> - struct mem_cgroup_stat_cpu *cpustat;
> - int cpu = get_cpu();
>
> - cpustat = &stat->cpustat[cpu];
> if (PageCgroupCache(pc))
> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_CACHE, val);
> + __this_cpu_add(mem->cpustat->count[MEMCG_NR_CACHE], val);

Remove __
> else
> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_RSS, val);
> + __this_cpu_add(mem->cpustat->count[MEMCG_NR_RSS], val);

Remove __
>
> if (charge)
> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_PGPGIN, 1);
> + __this_cpu_inc(mem->cpustat->count[MEMCG_PGPGIN]);

Remove __
> else
> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_PGPGOUT, 1);
> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_EVENTS, 1);
> - put_cpu();
> + __this_cpu_inc(mem->cpustat->count[MEMCG_PGPGOUT]);
> + __this_cpu_inc(mem->cpustat->count[MEMCG_EVENTS]);

Remove __

> - /*
> - * Preemption is already disabled, we don't need get_cpu()
> - */
> - cpu = smp_processor_id();
> - stat = &mem->stat;
> - cpustat = &stat->cpustat[cpu];
> -
> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, val);
> + __this_cpu_add(mem->cpustat->count[MEMCG_NR_FILE_MAPPED], val);

Remove __


> @@ -1650,16 +1597,11 @@ static int mem_cgroup_move_account(struc
>
> page = pc->page;
> if (page_mapped(page) && !PageAnon(page)) {
> - cpu = smp_processor_id();
> /* Update mapped_file data for mem_cgroup "from" */
> - stat = &from->stat;
> - cpustat = &stat->cpustat[cpu];
> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, -1);
> + __this_cpu_dec(from->cpustat->count[MEMCG_NR_FILE_MAPPED]);

You can keep it here since the context already has preempt disabled it
seems.

2009-11-06 18:44:34

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 2/2] memcg : rewrite percpu countings with new interfaces

Christoph Lameter wrote:
> On Fri, 6 Nov 2009, KAMEZAWA Hiroyuki wrote:
>> - __mem_cgroup_stat_reset_safe(cpustat, MEMCG_EVENTS);
>> + __this_cpu_write(mem->cpustat->count[MEMCG_EVENTS], 0);
>> ret = true;
>> }
>> - put_cpu();
>> return ret;
>
> If you want to use the __this_cpu_xx versions then you need to manage
> preempt on your own.
>
Ah, I see. I understand I haven't understood.

> You need to keep preempt_disable/enable here because otherwise the per
> cpu variable zeroed may be on a different cpu than the per cpu variable
> where you got the value from.
>
Thank you. I think I can do well in the next version.


>> +static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
>> + enum mem_cgroup_stat_index idx)
>> +{
>> + struct mem_cgroup_stat_cpu *cstat;
>> + int cpu;
>> + s64 ret = 0;
>> +
>> + for_each_possible_cpu(cpu) {
>> + cstat = per_cpu_ptr(mem->cpustat, cpu);
>> + ret += cstat->count[idx];
>> + }
>
> == ret += per_cpu(mem->cpustat->cstat->count[idx], cpu)
>
Hmm, Hmm. Will use that.

>> static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
>> bool charge)
>> {
>> int val = (charge) ? 1 : -1;
>> - struct mem_cgroup_stat *stat = &mem->stat;
>> - struct mem_cgroup_stat_cpu *cpustat;
>> - int cpu = get_cpu();
>>
>> - cpustat = &stat->cpustat[cpu];
>> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_SWAP, val);
>> - put_cpu();
>> + __this_cpu_add(mem->cpustat->count[MEMCG_NR_SWAP], val);
>> }
>
> You do not disable preempt on your own so you have to use
>
> this_cpu_add()
>
> There is no difference between __this_cpu_add and this_cpu_add on x86 but
> they will differ on platforms that do not have atomic per cpu
> instructions. The fallback for this_cpu_add is to protect the add with
> preempt_disable()/enable. The fallback fro __this_cpu_add is just to rely
> on the caller to ensure that preempt is disabled somehow.
>
Ok.


>> - /*
>> - * Preemption is already disabled, we don't need get_cpu()
>> - */
>> - cpu = smp_processor_id();
>> - stat = &mem->stat;
>> - cpustat = &stat->cpustat[cpu];
>> -
>> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, val);
>> + __this_cpu_add(mem->cpustat->count[MEMCG_NR_FILE_MAPPED], val);
>
> Remove __
>
>
>> @@ -1650,16 +1597,11 @@ static int mem_cgroup_move_account(struc
>>
>> page = pc->page;
>> if (page_mapped(page) && !PageAnon(page)) {
>> - cpu = smp_processor_id();
>> /* Update mapped_file data for mem_cgroup "from" */
>> - stat = &from->stat;
>> - cpustat = &stat->cpustat[cpu];
>> - __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, -1);
>> + __this_cpu_dec(from->cpustat->count[MEMCG_NR_FILE_MAPPED]);
>
> You can keep it here since the context already has preempt disabled it
> seems.
>
Thank you for kindly review.

Regards,
-Kame


2009-11-09 06:47:22

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: [PATCH 2/2] memcg : rewrite percpu countings with new interfaces v2

Thank you Chirsotph for review.

I think this version works well.
==
From: KAMEZAWA Hiroyuki <[email protected]>

Now, alloc_percpu() alloc good dynamic allocations and
Recent updates on percpu.h gives us following kind of ops
- this_cpu_add() etc...
This is designed to be a help for reduce code size in hot-path
and very useful to handle percpu area. Thanks for great works.

This patch rewrite memcg's (not-good) percpu status with new
percpu support macros. This decreases code size and instruction
size. By this, this area is now NUMA-aware and may have performance
benefit.

Changelog: 2009/11/09
- fixed misusage of __this_cpu_xxx

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 172 +++++++++++++++++++-------------------------------------
1 file changed, 61 insertions(+), 111 deletions(-)

Index: mmotm-2.6.32-Nov2/mm/memcontrol.c
===================================================================
--- mmotm-2.6.32-Nov2.orig/mm/memcontrol.c
+++ mmotm-2.6.32-Nov2/mm/memcontrol.c
@@ -39,6 +39,7 @@
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
+#include <linux/percpu.h>
#include "internal.h"

#include <asm/uaccess.h>
@@ -78,54 +79,8 @@ enum mem_cgroup_stat_index {

struct mem_cgroup_stat_cpu {
s64 count[MEMCG_STAT_NSTATS];
-} ____cacheline_aligned_in_smp;
-
-struct mem_cgroup_stat {
- struct mem_cgroup_stat_cpu cpustat[0];
};

-static inline void
-__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
- enum mem_cgroup_stat_index idx)
-{
- stat->count[idx] = 0;
-}
-
-static inline s64
-__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
- enum mem_cgroup_stat_index idx)
-{
- return stat->count[idx];
-}
-
-/*
- * For accounting under irq disable, no need for increment preempt count.
- */
-static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
- enum mem_cgroup_stat_index idx, int val)
-{
- stat->count[idx] += val;
-}
-
-static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
- enum mem_cgroup_stat_index idx)
-{
- int cpu;
- s64 ret = 0;
- for_each_possible_cpu(cpu)
- ret += stat->cpustat[cpu].count[idx];
- return ret;
-}
-
-static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
-{
- s64 ret;
-
- ret = mem_cgroup_read_stat(stat, MEMCG_NR_CACHE);
- ret += mem_cgroup_read_stat(stat, MEMCG_NR_RSS);
- return ret;
-}
-
/*
* per-zone information in memory controller.
*/
@@ -226,10 +181,7 @@ struct mem_cgroup {
/* set when res.limit == memsw.limit */
bool memsw_is_minimum;

- /*
- * statistics. This must be placed at the end of memcg.
- */
- struct mem_cgroup_stat stat;
+ struct mem_cgroup_stat_cpu *cpustat;
};

/*
@@ -370,18 +322,13 @@ mem_cgroup_remove_exceeded(struct mem_cg
static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
{
bool ret = false;
- int cpu;
s64 val;
- struct mem_cgroup_stat_cpu *cpustat;

- cpu = get_cpu();
- cpustat = &mem->stat.cpustat[cpu];
- val = __mem_cgroup_stat_read_local(cpustat, MEMCG_EVENTS);
+ val = this_cpu_read(mem->cpustat->count[MEMCG_EVENTS]);
if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
- __mem_cgroup_stat_reset_safe(cpustat, MEMCG_EVENTS);
+ this_cpu_write(mem->cpustat->count[MEMCG_EVENTS], 0);
ret = true;
}
- put_cpu();
return ret;
}

@@ -477,17 +424,34 @@ mem_cgroup_largest_soft_limit_node(struc
return mz;
}

+static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
+ enum mem_cgroup_stat_index idx)
+{
+ struct mem_cgroup_stat_cpu *cstat;
+ int cpu;
+ s64 ret = 0;
+
+ for_each_possible_cpu(cpu)
+ ret += per_cpu_ptr(mem->cpustat->count[idx], cpu);
+
+ return ret;
+}
+
+static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+{
+ s64 ret;
+
+ ret = mem_cgroup_read_stat(mem, MEMCG_NR_CACHE);
+ ret += mem_cgroup_read_stat(mem, MEMCG_NR_RSS);
+ return ret;
+}
+
static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
bool charge)
{
int val = (charge) ? 1 : -1;
- struct mem_cgroup_stat *stat = &mem->stat;
- struct mem_cgroup_stat_cpu *cpustat;
- int cpu = get_cpu();

- cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_SWAP, val);
- put_cpu();
+ this_cpu_add(mem->cpustat->count[MEMCG_NR_SWAP], val);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -495,22 +459,19 @@ static void mem_cgroup_charge_statistics
bool charge)
{
int val = (charge) ? 1 : -1;
- struct mem_cgroup_stat *stat = &mem->stat;
- struct mem_cgroup_stat_cpu *cpustat;
- int cpu = get_cpu();

- cpustat = &stat->cpustat[cpu];
+ preempt_disable();
if (PageCgroupCache(pc))
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_CACHE, val);
+ __this_cpu_add(mem->cpustat->count[MEMCG_NR_CACHE], val);
else
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_RSS, val);
+ __this_cpu_add(mem->cpustat->count[MEMCG_NR_RSS], val);

if (charge)
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_PGPGIN, 1);
+ __this_cpu_inc(mem->cpustat->count[MEMCG_PGPGIN]);
else
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_PGPGOUT, 1);
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_EVENTS, 1);
- put_cpu();
+ __this_cpu_inc(mem->cpustat->count[MEMCG_PGPGOUT]);
+ __this_cpu_inc(mem->cpustat->count[MEMCG_EVENTS]);
+ preempt_enable();
}

static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -1162,7 +1123,7 @@ static int mem_cgroup_hierarchical_recla
}
}
}
- if (!mem_cgroup_local_usage(&victim->stat)) {
+ if (!mem_cgroup_local_usage(victim)) {
/* this cgroup's local usage == 0 */
css_put(&victim->css);
continue;
@@ -1228,9 +1189,6 @@ static void record_last_oom(struct mem_c
void mem_cgroup_update_file_mapped(struct page *page, int val)
{
struct mem_cgroup *mem;
- struct mem_cgroup_stat *stat;
- struct mem_cgroup_stat_cpu *cpustat;
- int cpu;
struct page_cgroup *pc;

pc = lookup_page_cgroup(page);
@@ -1245,14 +1203,7 @@ void mem_cgroup_update_file_mapped(struc
if (!PageCgroupUsed(pc))
goto done;

- /*
- * Preemption is already disabled, we don't need get_cpu()
- */
- cpu = smp_processor_id();
- stat = &mem->stat;
- cpustat = &stat->cpustat[cpu];
-
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, val);
+ this_cpu_add(mem->cpustat->count[MEMCG_NR_FILE_MAPPED], val);
done:
unlock_page_cgroup(pc);
}
@@ -1623,9 +1574,6 @@ static int mem_cgroup_move_account(struc
int nid, zid;
int ret = -EBUSY;
struct page *page;
- int cpu;
- struct mem_cgroup_stat *stat;
- struct mem_cgroup_stat_cpu *cpustat;

VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(pc->page));
@@ -1650,16 +1598,12 @@ static int mem_cgroup_move_account(struc

page = pc->page;
if (page_mapped(page) && !PageAnon(page)) {
- cpu = smp_processor_id();
+ preempt_disable();
/* Update mapped_file data for mem_cgroup "from" */
- stat = &from->stat;
- cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, -1);
-
+ __this_cpu_dec(from->cpustat->count[MEMCG_NR_FILE_MAPPED]);
/* Update mapped_file data for mem_cgroup "to" */
- stat = &to->stat;
- cpustat = &stat->cpustat[cpu];
- __mem_cgroup_stat_add_safe(cpustat, MEMCG_NR_FILE_MAPPED, 1);
+ __this_cpu_inc(to->cpustat->count[MEMCG_NR_FILE_MAPPED]);
+ preempt_enable();
}

if (do_swap_account && !mem_cgroup_is_root(from))
@@ -2715,7 +2659,7 @@ static int
mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
{
struct mem_cgroup_idx_data *d = data;
- d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
+ d->val += mem_cgroup_read_stat(mem, d->idx);
return 0;
}

@@ -2920,18 +2864,18 @@ static int mem_cgroup_get_local_stat(str
s64 val;

/* per cpu stat */
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_CACHE);
+ val = mem_cgroup_read_stat(mem, MEMCG_NR_CACHE);
s->stat[MCS_CACHE] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_RSS);
+ val = mem_cgroup_read_stat(mem, MEMCG_NR_RSS);
s->stat[MCS_RSS] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_FILE_MAPPED);
+ val = mem_cgroup_read_stat(mem, MEMCG_NR_FILE_MAPPED);
s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_PGPGIN);
+ val = mem_cgroup_read_stat(mem, MEMCG_PGPGIN);
s->stat[MCS_PGPGIN] += val;
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_PGPGOUT);
+ val = mem_cgroup_read_stat(mem, MEMCG_PGPGOUT);
s->stat[MCS_PGPGOUT] += val;
if (do_swap_account) {
- val = mem_cgroup_read_stat(&mem->stat, MEMCG_NR_SWAP);
+ val = mem_cgroup_read_stat(mem, MEMCG_NR_SWAP);
s->stat[MCS_SWAP] += val * PAGE_SIZE;
}

@@ -3190,17 +3134,12 @@ static void free_mem_cgroup_per_zone_inf
kfree(mem->info.nodeinfo[node]);
}

-static int mem_cgroup_size(void)
-{
- int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
- return sizeof(struct mem_cgroup) + cpustat_size;
-}
-
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *mem;
- int size = mem_cgroup_size();
+ int size = sizeof(struct mem_cgroup);

+ /* Can be very big if MAX_NUMNODES is big */
if (size < PAGE_SIZE)
mem = kmalloc(size, GFP_KERNEL);
else
@@ -3208,6 +3147,15 @@ static struct mem_cgroup *mem_cgroup_all

if (mem)
memset(mem, 0, size);
+ mem->cpustat = alloc_percpu(struct mem_cgroup_stat_cpu);
+ if (!mem->cpustat) {
+ if (size < PAGE_SIZE)
+ kfree(mem);
+ else
+ vfree(mem);
+ mem = NULL;
+ }
+
return mem;
}

@@ -3232,7 +3180,9 @@ static void __mem_cgroup_free(struct mem
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);

- if (mem_cgroup_size() < PAGE_SIZE)
+ free_percpu(mem->cpustat);
+
+ if (sizeof(struct mem_cgroup) < PAGE_SIZE)
kfree(mem);
else
vfree(mem);

2009-11-09 07:04:23

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH 0/2] memcg make use of new percpu implementations

* KAMEZAWA Hiroyuki <[email protected]> [2009-11-06 17:52:42]:

> Hi,
>
> Recent updates on dynamic percpu allocation looks good and I tries to rewrite
> memcg's poor implementation of percpu status counter.
> (It's not NUMA-aware ...)
> Thanks for great works.
>
> For this time. I added Christoph to CC because I'm not fully sure my usage of
> __this_cpu_xxx is correct...I'm glad if you check the usage when you have time.
>
>
> Patch 1/2 is just clean up (prepare for patch 2/2)
> Patch 2/2 is for percpu.
>
> Tested on my 8cpu box and works well.
> Pathcesa are against the latest mmotm.

How do the test results look? DO you see a significant boost? BTW,
I've been experimenting a bit with the earlier percpu counter patches,
I might post an iteration once I have some good results.

--
Balbir

2009-11-09 07:07:41

by Balbir Singh

[permalink] [raw]
Subject: Re: [PATCH 2/2] memcg : rewrite percpu countings with new interfaces

* KAMEZAWA Hiroyuki <[email protected]> [2009-11-06 17:55:45]:

> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Now, alloc_percpu() alloc good dynamic allocations and
> Recent updates on percpu.h gives us following kind of ops
> - __this_cpu_add() etc...
> This is designed to be a help for reduce code size in hot-path
> and very useful to handle percpu area. Thanks for great works.
>
> This patch rewrite memcg's (not-good) percpu status with new
> percpu support macros. This decreases code size and instruction
> size. By this, this area is now NUMA-aware and may have performance
> benefit.
>
> I got good result in parallel pagefault test. (my host is 8cpu/2socket)
>
> before==
> Performance counter stats for './runpause.sh' (5 runs):
>
> 474070.055912 task-clock-msecs # 7.881 CPUs ( +- 0.013% )
> 35829310 page-faults # 0.076 M/sec ( +- 0.217% )
> 3803016722 cache-references # 8.022 M/sec ( +- 0.215% ) (scaled from 100.00%)
> 1104083123 cache-misses # 2.329 M/sec ( +- 0.961% ) (scaled from 100.00%)
>
> 60.154982314 seconds time elapsed ( +- 0.018% )
>
> after==
> Performance counter stats for './runpause.sh' (5 runs):
>
> 474919.429670 task-clock-msecs # 7.896 CPUs ( +- 0.013% )
> 36520440 page-faults # 0.077 M/sec ( +- 1.854% )
> 3109834751 cache-references # 6.548 M/sec ( +- 0.276% )
> 1053275160 cache-misses # 2.218 M/sec ( +- 0.036% )
>
> 60.146585280 seconds time elapsed ( +- 0.019% )
>
> This test is affected by cpu-utilization but I think more improvements
> will be found in bigger system.
>

Hi, Kamezawa-San,

Could you please post the IPC results as well?

--
Balbir

2009-11-09 07:36:53

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 0/2] memcg make use of new percpu implementations

On Mon, 9 Nov 2009 12:34:21 +0530
Balbir Singh <[email protected]> wrote:

> * KAMEZAWA Hiroyuki <[email protected]> [2009-11-06 17:52:42]:
>
> > Hi,
> >
> > Recent updates on dynamic percpu allocation looks good and I tries to rewrite
> > memcg's poor implementation of percpu status counter.
> > (It's not NUMA-aware ...)
> > Thanks for great works.
> >
> > For this time. I added Christoph to CC because I'm not fully sure my usage of
> > __this_cpu_xxx is correct...I'm glad if you check the usage when you have time.
> >
> >
> > Patch 1/2 is just clean up (prepare for patch 2/2)
> > Patch 2/2 is for percpu.
> >
> > Tested on my 8cpu box and works well.
> > Pathcesa are against the latest mmotm.
>
> How do the test results look? DO you see a significant boost?

Because my test enviroment is just an SMP (not NUMA), improvement will
not be siginificant (visible), I think.

But It's good to make use of recent updates of per_cpu implementation. We
can make use of offset calculation methods of them and one-instruction-access
macros.

This is code size (PREEMPT=y)

[Before patch]
[kamezawa@bluextal mmotm-2.6.32-Nov2]$ size mm/memcontrol.o
text data bss dec hex filename
22403 3420 4132 29955 7503 mm/memcontrol.o

[After patch]
[kamezawa@bluextal mmotm-2.6.32-Nov2]$ size mm/memcontrol.o
text data bss dec hex filename
22188 3420 4132 29740 742c mm/memcontrol.o

Then, text size is surely reduced.

One example is mem_cgroup_swap_statistics(). This function is inlined
after this patch.

this code: mem_cgroup_uncharge_swap(), modifies percpu counter.
==
memcg = mem_cgroup_lookup(id);
if (memcg) {
/*
* We uncharge this because swap is freed.
* This memcg can be obsolete one. We avoid calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
==


[before patch]
mem_cgroup_swap_statistics() is not inlined and uses 0x69 bytes.

0000000000001d30 <mem_cgroup_swap_statistics>:
1d30: push %rbp
1d31: mov %rsp,%rbp
1d34: push %r12
<snip>
1d88: callq 1d8d <mem_cgroup_swap_statistics+0x5d>
1d8d: nopl (%rax)
1d90: jmp 1d83 <mem_cgroup_swap_statistics+0x53>
1d92: nopw %cs:0x0(%rax,%rax,1)
1d99:

[After patch]
mem_cgroup_uncharge_swap()'s inlined code.

3b67: cmp 0x0(%rip),%rax # 3b6e <mem_cgroup_uncharge_swap+0
xbe>
3b6e: je 3b81 <mem_cgroup_uncharge_swap+0xd1> <=== check mem_cgroup_is_root
3b70: lea 0x90(%rax),%rdi
3b77: mov $0x1000,%esi
3b7c: callq 3b81 <mem_cgroup_uncharge_swap+0xd1> <=== calling res_counter_uncahrge()
3b81: 48 89 df mov %rbx,%rdi
3b84: 48 8b 83 70 01 00 00 mov 0x170(%rbx),%rax <=== get offset of mem->cpustat
3b8b: 65 48 83 40 30 ff addq $0xffffffffffffffff,%gs:0x30(%rax) mem->cpustat.count[index]--;

3b91: e8 6a e0 ff ff callq 1c00 <mem_cgroup_put>

This uses 2 instruction.

Then, code size reduction is enough large, I think.

>BTW, I've been experimenting a bit with the earlier percpu counter patches,
> I might post an iteration once I have some good results.
>
Thank you, it's helpful.

Regards,
-Kame

2009-11-09 08:38:47

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 2/2] memcg : rewrite percpu countings with new interfaces

On Mon, 9 Nov 2009 12:37:37 +0530
Balbir Singh <[email protected]> wrote:

> > after==
> > Performance counter stats for './runpause.sh' (5 runs):
> >
> > 474919.429670 task-clock-msecs # 7.896 CPUs ( +- 0.013% )
> > 36520440 page-faults # 0.077 M/sec ( +- 1.854% )
> > 3109834751 cache-references # 6.548 M/sec ( +- 0.276% )
> > 1053275160 cache-misses # 2.218 M/sec ( +- 0.036% )
> >
> > 60.146585280 seconds time elapsed ( +- 0.019% )
> >
> > This test is affected by cpu-utilization but I think more improvements
> > will be found in bigger system.
> >
>
> Hi, Kamezawa-San,
>
> Could you please post the IPC results as well?
>
Because PREEMPT=n, no differnce between v1/v2, basically.

Here.
==
Performance counter stats for './runpause.sh' (5 runs):

475884.969949 task-clock-msecs # 7.913 CPUs ( +- 0.005% )
36592060 page-faults # 0.077 M/sec ( +- 0.301% )
3037784893 cache-references # 6.383 M/sec ( +- 0.361% ) (scaled from 99.71%)
1130761297 cache-misses # 2.376 M/sec ( +- 0.244% ) (scaled from 98.24%)

60.136803969 seconds time elapsed ( +- 0.006% )
==

But this program is highly affected by cpu utilization etc...

Thanks,
-Kame