When increasing NR_CPUS to 4096, the size of struct mem_cgroup grows to
507904 bytes per instance on x86_64. This patch changes the allocation of
struct mem_cgroup_stat_cpu to be based on the number of CPUs configured at
boot time. init_mem_cgroup is still that large, since it stays statically
allocated and therefore uses the compile-time maximum.
Signed-off-by: Jan Blunck <[email protected]>
---
mm/memcontrol.c | 52 +++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 43 insertions(+), 9 deletions(-)
Index: b/mm/memcontrol.c
===================================================================
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,7 +59,7 @@ struct mem_cgroup_stat_cpu {
} ____cacheline_aligned_in_smp;
struct mem_cgroup_stat {
- struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+ struct mem_cgroup_stat_cpu *cpustat;
};
/*
@@ -142,7 +142,10 @@ struct mem_cgroup {
*/
struct mem_cgroup_stat stat;
};
-static struct mem_cgroup init_mem_cgroup;
+static struct mem_cgroup_stat_cpu init_mem_cgroup_stat_cpu[NR_CPUS];
+static struct mem_cgroup init_mem_cgroup = {
+ .stat = { .cpustat = init_mem_cgroup_stat_cpu },
+};
/*
* We use the lower bit of the page->page_cgroup pointer as a bit spin
@@ -1097,23 +1100,54 @@ static void free_mem_cgroup_per_zone_inf
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *mem;
+ struct mem_cgroup_stat_cpu *cpustat;
+ size_t statsize = nr_cpu_ids * sizeof(*cpustat);
- if (sizeof(*mem) < PAGE_SIZE)
- mem = kmalloc(sizeof(*mem), GFP_KERNEL);
- else
+ if (sizeof(*mem) > PAGE_SIZE) {
mem = vmalloc(sizeof(*mem));
-
- if (mem)
+ if (!mem)
+ goto out;
memset(mem, 0, sizeof(*mem));
+ } else
+ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+
+ if (!mem)
+ goto out;
+
+ if (statsize > PAGE_SIZE) {
+ cpustat = vmalloc(statsize);
+ if (!cpustat)
+ goto out_mem;
+ memset(cpustat, 0, statsize);
+ } else
+ cpustat = kzalloc(statsize, GFP_KERNEL);
+
+ if (!cpustat)
+ goto out_mem;
+
+ mem->stat.cpustat = cpustat;
return mem;
+
+out_mem:
+ if (is_vmalloc_addr(mem))
+ vfree(mem);
+ else
+ kfree(mem);
+out:
+ return NULL;
}
static void mem_cgroup_free(struct mem_cgroup *mem)
{
- if (sizeof(*mem) < PAGE_SIZE)
- kfree(mem);
+ if (is_vmalloc_addr(mem->stat.cpustat))
+ vfree(mem->stat.cpustat);
else
+ kfree(mem->stat.cpustat);
+
+ if (is_vmalloc_addr(mem))
vfree(mem);
+ else
+ kfree(mem);
}
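For readers skimming the diff, the allocation pattern introduced above generalizes as follows — a minimal sketch in kernel C, assuming a kernel that has is_vmalloc_addr() (merged in 2.6.25); the helper names are illustrative and not part of the patch:

#include <linux/mm.h>      /* is_vmalloc_addr(), PAGE_SIZE */
#include <linux/slab.h>    /* kzalloc(), kfree() */
#include <linux/string.h>  /* memset() */
#include <linux/vmalloc.h> /* vmalloc(), vfree() */

/* Zeroed allocation that falls back to vmalloc() for requests larger
 * than a page; kzalloc() zeroes for us, vmalloc() does not. */
static void *alloc_zeroed(size_t size)
{
	void *p;

	if (size > PAGE_SIZE) {
		p = vmalloc(size);
		if (p)
			memset(p, 0, size);
	} else
		p = kzalloc(size, GFP_KERNEL);
	return p;
}

/* Matching free: is_vmalloc_addr() tells the two cases apart, so
 * callers need not remember which allocator was used. */
static void free_zeroed(void *p)
{
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}

The upside of this pattern is that callers get one pair of entry points regardless of object size; the cost is that vmalloc() memory is not physically contiguous and goes through extra page-table mappings.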
(cc [email protected])
On Thu, 13 Nov 2008 17:42:01 +0100 Jan Blunck <[email protected]> wrote:
> When increasing NR_CPUS to 4096, the size of struct mem_cgroup grows to
> 507904 bytes per instance on x86_64. This patch changes the allocation of
> struct mem_cgroup_stat_cpu to be based on the number of CPUs configured at
> boot time. init_mem_cgroup is still that large, since it stays statically
> allocated and therefore uses the compile-time maximum.
>
> [...]
CC: KAMEZAWA Hiroyuki <[email protected]>
CC: Balbir Singh <[email protected]>
Andrew Morton wrote:
> (cc [email protected])
>
> On Thu, 13 Nov 2008 17:42:01 +0100 Jan Blunck <[email protected]> wrote:
>
>> When increasing NR_CPUS to 4096, the size of struct mem_cgroup grows to
>> 507904 bytes per instance on x86_64. This patch changes the allocation of
>> struct mem_cgroup_stat_cpu to be based on the number of CPUs configured at
>> boot time. init_mem_cgroup is still that large, since it stays statically
>> allocated and therefore uses the compile-time maximum.
>>
I think you can just remove init_mem_cgroup, because memcg doesn't require
early initialization (when kmalloc is not available), and I found that
init_mem_cgroup is not treated specially after grepping for
'init_mem_cgroup' in the code.
On Fri, 14 Nov 2008 11:52:41 +0800
Li Zefan <[email protected]> wrote:
> CC: KAMEZAWA Hiroyuki <[email protected]>
> CC: Balbir Singh <[email protected]>
>
> Andrew Morton wrote:
> > (cc [email protected])
> >
> > On Thu, 13 Nov 2008 17:42:01 +0100 Jan Blunck <[email protected]> wrote:
> >
> >> When increasing NR_CPUS to 4096, the size of struct mem_cgroup grows to
> >> 507904 bytes per instance on x86_64. [...]
> >>
>
> I think you can just remove init_mem_cgroup, because memcg doesn't require
> early initialization (when kmalloc is not available), and I found that
> init_mem_cgroup is not treated specially after grepping for
> 'init_mem_cgroup' in the code.
>
Yes, but if you want to keep the changes minimal, just leave
init_mem_cgroup.stat.cpustat = NULL and initialize it later; probably not so
difficult.
Thanks,
-Kame
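A rough sketch of the minimal-change variant Kame describes above — it was never posted as a patch in this thread, it assumes Jan's pointer-based struct mem_cgroup_stat, and the helper name is hypothetical:

/* Hypothetical helper: keep the static init_mem_cgroup, leave its
 * stat.cpustat NULL at compile time, and allocate the per-cpu array
 * once the slab allocator is up (e.g. from the cgroup subsystem's
 * create callback for the root group). */
static int mem_cgroup_alloc_stat(struct mem_cgroup *mem)
{
	size_t statsize = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);

	mem->stat.cpustat = kzalloc(statsize, GFP_KERNEL);
	return mem->stat.cpustat ? 0 : -ENOMEM;
}

The thread ultimately went the other way: Kame's own patch below drops init_mem_cgroup entirely instead.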
How about this one ?
Tested on x86-64 + mmotm-Nov10; works well.
(Testing on other arches is welcome.)
-Kame
==
As Jan Blunck <[email protected]> pointed out, allocating the
per-cpu stats for memcg at NR_CPUS size is wasteful.
This patch changes mem_cgroup's cpustat allocation to be based not on
NR_CPUS but on nr_cpu_ids.
From: Jan Blunck <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 34 ++++++++++++++++++----------------
1 file changed, 18 insertions(+), 16 deletions(-)
Index: mmotm-2.6.28-Nov10/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Nov10.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Nov10/mm/memcontrol.c
@@ -60,7 +60,7 @@ struct mem_cgroup_stat_cpu {
} ____cacheline_aligned_in_smp;
struct mem_cgroup_stat {
- struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+ struct mem_cgroup_stat_cpu cpustat[0];
};
/*
@@ -129,11 +129,10 @@ struct mem_cgroup {
int prev_priority; /* for recording reclaim priority */
/*
- * statistics.
+ * statistics. This must be placed at the end of memcg.
*/
struct mem_cgroup_stat stat;
};
-static struct mem_cgroup init_mem_cgroup;
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -1292,42 +1291,45 @@ static void free_mem_cgroup_per_zone_inf
kfree(mem->info.nodeinfo[node]);
}
+static int mem_cgroup_size(void)
+{
+ int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
+ return sizeof(struct mem_cgroup) + cpustat_size;
+}
+
+
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *mem;
+ int size = mem_cgroup_size();
- if (sizeof(*mem) < PAGE_SIZE)
- mem = kmalloc(sizeof(*mem), GFP_KERNEL);
+ if (size < PAGE_SIZE)
+ mem = kmalloc(size, GFP_KERNEL);
else
- mem = vmalloc(sizeof(*mem));
+ mem = vmalloc(size);
if (mem)
- memset(mem, 0, sizeof(*mem));
+ memset(mem, 0, size);
return mem;
}
static void mem_cgroup_free(struct mem_cgroup *mem)
{
- if (sizeof(*mem) < PAGE_SIZE)
+ if (mem_cgroup_size() < PAGE_SIZE)
kfree(mem);
else
vfree(mem);
}
-
static struct cgroup_subsys_state *
mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
{
struct mem_cgroup *mem;
int node;
- if (unlikely((cont->parent) == NULL)) {
- mem = &init_mem_cgroup;
- } else {
- mem = mem_cgroup_alloc();
- if (!mem)
- return ERR_PTR(-ENOMEM);
- }
+ mem = mem_cgroup_alloc();
+ if (!mem)
+ return ERR_PTR(-ENOMEM);
res_counter_init(&mem->res);
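The cpustat[0] change above is the classic trailing-array idiom: a zero-length (or, in C99, flexible) array member contributes nothing to sizeof(), so the header and the per-CPU slots come from a single allocation sized by mem_cgroup_size(). That is also why the patch adds the comment that stat must stay at the end of the struct. A standalone userspace illustration of the same trick, with made-up names:

#include <stdio.h>
#include <stdlib.h>

/* Mirrors the mem_cgroup/mem_cgroup_stat layout: the flexible array
 * member adds nothing to sizeof(struct stats), so one calloc() covers
 * the header plus n trailing slots. */
struct stats {
	int nr_slots;
	long slot[];	/* C99 spelling; the patch uses the GNU cpustat[0] form */
};

int main(void)
{
	int n = 8;	/* stand-in for nr_cpu_ids */
	struct stats *s = calloc(1, sizeof(*s) + n * sizeof(s->slot[0]));

	if (!s)
		return 1;
	s->nr_slots = n;
	printf("header: %zu bytes, total: %zu bytes\n",
	       sizeof(*s), sizeof(*s) + n * sizeof(s->slot[0]));
	free(s);
	return 0;
}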
KAMEZAWA Hiroyuki wrote:
> How about this one ?
> tested on x86-64 + mmotm-Nov10, works well.
> (test on other arch is welcome.)
>
> -Kame
> ==
> As Jan Blunck <[email protected]> pointed out, allocating the
> per-cpu stats for memcg at NR_CPUS size is wasteful.
>
> This patch changes mem_cgroup's cpustat allocation to be based not on
> NR_CPUS but on nr_cpu_ids.
>
> From: Jan Blunck <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
Looks good :). It's OK to use nr_cpu_ids, since it is already set up by the
time cgroup_init() is called at boot.
Reviewed-by: Li Zefan <[email protected]>
except... see my comments on mem_cgroup_create() below.
> +static int mem_cgroup_size(void)
> +{
> + int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
> + return sizeof(struct mem_cgroup) + cpustat_size;
> +}
> +
> +
minor comment: one empty line suffices.
...
> static struct cgroup_subsys_state *
> mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> {
> struct mem_cgroup *mem;
> int node;
>
> - if (unlikely((cont->parent) == NULL)) {
> - mem = &init_mem_cgroup;
> - } else {
> - mem = mem_cgroup_alloc();
> - if (!mem)
> - return ERR_PTR(-ENOMEM);
> - }
> + mem = mem_cgroup_alloc();
> + if (!mem)
> + return ERR_PTR(-ENOMEM);
>
> res_counter_init(&mem->res);
>
free_out:
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
if (cont->parent != NULL) <---- this check should be removed
mem_cgroup_free(mem);
On Fri, 14 Nov 2008 14:26:31 +0800
Li Zefan <[email protected]> wrote:
> free_out:
> for_each_node_state(node, N_POSSIBLE)
> free_mem_cgroup_per_zone_info(mem, node);
> if (cont->parent != NULL) <---- this check should be removed
> mem_cgroup_free(mem);
>
>
Exactly. The fixed one is here.
-Kame
==
As Jan Blunck <[email protected]> pointed out, allocating the
per-cpu stats for memcg at NR_CPUS size is wasteful.
This patch changes mem_cgroup's cpustat allocation to be based not on
NR_CPUS but on nr_cpu_ids.
Changelog:
- fixed the error path to always call mem_cgroup_free().
From: Jan Blunck <[email protected]>
Reviewed-by: Li Zefan <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 35 ++++++++++++++++++-----------------
1 file changed, 18 insertions(+), 17 deletions(-)
Index: mmotm-2.6.28-Nov10/mm/memcontrol.c
===================================================================
--- mmotm-2.6.28-Nov10.orig/mm/memcontrol.c
+++ mmotm-2.6.28-Nov10/mm/memcontrol.c
@@ -60,7 +60,7 @@ struct mem_cgroup_stat_cpu {
} ____cacheline_aligned_in_smp;
struct mem_cgroup_stat {
- struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+ struct mem_cgroup_stat_cpu cpustat[0];
};
/*
@@ -129,11 +129,10 @@ struct mem_cgroup {
int prev_priority; /* for recording reclaim priority */
/*
- * statistics.
+ * statistics. This must be placed at the end of memcg.
*/
struct mem_cgroup_stat stat;
};
-static struct mem_cgroup init_mem_cgroup;
enum charge_type {
MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -1292,23 +1291,30 @@ static void free_mem_cgroup_per_zone_inf
kfree(mem->info.nodeinfo[node]);
}
+static int mem_cgroup_size(void)
+{
+ int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
+ return sizeof(struct mem_cgroup) + cpustat_size;
+}
+
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *mem;
+ int size = mem_cgroup_size();
- if (sizeof(*mem) < PAGE_SIZE)
- mem = kmalloc(sizeof(*mem), GFP_KERNEL);
+ if (size < PAGE_SIZE)
+ mem = kmalloc(size, GFP_KERNEL);
else
- mem = vmalloc(sizeof(*mem));
+ mem = vmalloc(size);
if (mem)
- memset(mem, 0, sizeof(*mem));
+ memset(mem, 0, size);
return mem;
}
static void mem_cgroup_free(struct mem_cgroup *mem)
{
- if (sizeof(*mem) < PAGE_SIZE)
+ if (mem_cgroup_size() < PAGE_SIZE)
kfree(mem);
else
vfree(mem);
@@ -1321,13 +1327,9 @@ mem_cgroup_create(struct cgroup_subsys *
struct mem_cgroup *mem;
int node;
- if (unlikely((cont->parent) == NULL)) {
- mem = &init_mem_cgroup;
- } else {
- mem = mem_cgroup_alloc();
- if (!mem)
- return ERR_PTR(-ENOMEM);
- }
+ mem = mem_cgroup_alloc();
+ if (!mem)
+ return ERR_PTR(-ENOMEM);
res_counter_init(&mem->res);
@@ -1339,8 +1341,7 @@ mem_cgroup_create(struct cgroup_subsys *
free_out:
for_each_node_state(node, N_POSSIBLE)
free_mem_cgroup_per_zone_info(mem, node);
- if (cont->parent != NULL)
- mem_cgroup_free(mem);
+ mem_cgroup_free(mem);
return ERR_PTR(-ENOMEM);
}
KAMEZAWA Hiroyuki wrote:
> How about this one ?
> [...]
> +static int mem_cgroup_size(void)
inline this function?
Other than that, I think the cont->parent check for freeing has already been
spotted and pointed out.
--
Balbir
On Fri, 14 Nov 2008 13:13:29 +0530
Balbir Singh <[email protected]> wrote:
> KAMEZAWA Hiroyuki wrote:
> > How about this one ?
> > [...]
> >
> > +static int mem_cgroup_size(void)
>
> inline this function?
>
necessary ?
> Other than that, I think the cont->parent check for freeing has already been
> spotted and pointed out
>
Ah, yes. Fixed in v2.
Thanks,
-Kame
>>> +static int mem_cgroup_size(void)
>> inline this function?
>>
> necessary ?
>
Not so necessary IMO. It's called when a cgroup is created or removed, that
is, on mkdir and rmdir, where performance is not critical.
KAMEZAWA Hiroyuki wrote:
> On Fri, 14 Nov 2008 13:13:29 +0530
> Balbir Singh <[email protected]> wrote:
>>> +static int mem_cgroup_size(void)
>> inline this function?
>>
> necessary ?
Not really, but it might be good for older compilers.
--
Balbir
Li Zefan wrote:
>>>> +static int mem_cgroup_size(void)
>>> inline this function?
>>>
>> necessary ?
>>
>
> Not so necessary IMO. It's called when a cgroup is created or removed, that
> is, on mkdir and rmdir, where performance is not critical.
>
I think most new compilers can automatically inline such functions, but a
hint for the older ones is always good practice.
--
Balbir
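Concretely, Balbir's suggestion is a one-word change to the helper from the patch; whether the hint is worth it is what the rest of the thread debates:

/* The suggested variant: mark the helper inline. A modern gcc will
 * usually inline a small static function like this at -O2 on its own,
 * so the keyword mostly serves as a hint for older compilers. */
static inline int mem_cgroup_size(void)
{
	int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
	return sizeof(struct mem_cgroup) + cpustat_size;
}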
On Fri, 14 Nov 2008 13:33:49 +0530
Balbir Singh <[email protected]> wrote:
> Li Zefan wrote:
> >>>> +static int mem_cgroup_size(void)
> >>> inline this function?
> >>>
> >> necessary ?
> >>
> >
> > Not so necessary IMO. It's called when a cgroup is created or removed, that
> > is, on mkdir and rmdir, where performance is not critical.
> >
>
> I think most new compilers can automatically inline such functions, but a
> hint for the older ones is always good practice.
>
Adding "inline" is easy. But, IMHO, adding "inline" to not-performance-critical part
is just confusing people.
I want to see "inline" where inline is obviously good/necessary for performance.
Thanks,
-Kame