2011-02-17 01:48:19

by Li Zefan

[permalink] [raw]
Subject: [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist()

It's not necessary to copy cpuset->mems_allowed to a buffer
allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf().

Signed-off-by: Li Zefan <[email protected]>
---
kernel/cpuset.c | 10 +---------
1 files changed, 1 insertions(+), 9 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 10f1835..f13ff2e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1620,20 +1620,12 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)

static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
- NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
int retval;

- if (mask == NULL)
- return -ENOMEM;
-
mutex_lock(&callback_mutex);
- *mask = cs->mems_allowed;
+ retval = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
mutex_unlock(&callback_mutex);

- retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
- NODEMASK_FREE(mask);
-
return retval;
}

--
1.7.3.1


2011-02-17 01:48:38

by Li Zefan

[permalink] [raw]
Subject: [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch()

oldcs->mems_allowed is not modified during cpuset_attch(), so
we don't have to copy it to a buffer allocated by NODEMASK_ALLOC().
Just pass it to cpuset_migrate_mm().

Signed-off-by: Li Zefan <[email protected]>
---
kernel/cpuset.c | 7 ++-----
1 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f13ff2e..70c9ca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1438,10 +1438,9 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);

- if (from == NULL || to == NULL)
+ if (to == NULL)
goto alloc_fail;

if (cs == &top_cpuset) {
@@ -1463,18 +1462,16 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
}

/* change mm; only needs to be done once even if threadgroup */
- *from = oldcs->mems_allowed;
*to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
mpol_rebind_mm(mm, to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, from, to);
+ cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
mmput(mm);
}

alloc_fail:
- NODEMASK_FREE(from);
NODEMASK_FREE(to);
}

--
1.7.3.1

2011-02-17 01:48:59

by Li Zefan

[permalink] [raw]
Subject: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

Those functions that use NODEMASK_ALLOC() can't propogate errno
to users, but will fail silently.

Since all of them are called with cgroup_mutex held, here we use
a global nodemask_t variable.

Signed-off-by: Li Zefan <[email protected]>
---
kernel/cpuset.c | 45 +++++++++++++++------------------------------
1 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 70c9ca2..cc414ac 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -79,6 +79,15 @@ int number_of_cpusets __read_mostly;
struct cgroup_subsys cpuset_subsys;
struct cpuset;

+/*
+ * In functions that can't propogate errno to users, to avoid declaring a
+ * nodemask_t variable, and avoid using NODEMASK_ALLOC that can return
+ * -ENOMEM, we use this global cpuset_mems.
+ *
+ * It should be used with cgroup_lock held.
+ */
+static nodemask_t cpuset_mems;
+
/* See "Frequency meter" comments, below. */

struct fmeter {
@@ -1015,17 +1024,11 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
- if (!newmems)
- return;

cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, newmems);
-
- cpuset_change_task_nodemask(p, newmems);
+ guarantee_online_mems(cs, &cpuset_mems);

- NODEMASK_FREE(newmems);
+ cpuset_change_task_nodemask(p, &cpuset_mems);

mm = get_task_mm(p);
if (!mm)
@@ -1096,13 +1099,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
+ nodemask_t *oldmem = &cpuset_mems;
int retval;
struct ptr_heap heap;

- if (!oldmem)
- return -ENOMEM;
-
/*
* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
* it's read-only
@@ -1152,7 +1152,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,

heap_free(&heap);
done:
- NODEMASK_FREE(oldmem);
return retval;
}

@@ -1438,10 +1437,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
- if (to == NULL)
- goto alloc_fail;
+ nodemask_t *to = &cpuset_mems;

if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1470,9 +1466,6 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
mmput(mm);
}
-
-alloc_fail:
- NODEMASK_FREE(to);
}

/* The various types of files and directories in a cpuset file system */
@@ -2051,10 +2044,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return;
+ nodemask_t *oldmems = &cpuset_mems;

list_add_tail((struct list_head *)&root->stack_list, &queue);

@@ -2090,7 +2080,6 @@ static void scan_for_empty_cpusets(struct cpuset *root)
update_tasks_nodemask(cp, oldmems, NULL);
}
}
- NODEMASK_FREE(oldmems);
}

/*
@@ -2132,10 +2121,7 @@ void cpuset_update_active_cpus(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return NOTIFY_DONE;
+ nodemask_t *oldmems = &cpuset_mems;

cgroup_lock();
switch (action) {
@@ -2158,7 +2144,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
}
cgroup_unlock();

- NODEMASK_FREE(oldmems);
return NOTIFY_OK;
}
#endif
--
1.7.3.1

2011-02-17 01:49:42

by Li Zefan

[permalink] [raw]
Subject: [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone()

Chaning cpuset->mems/cpuset->cpus should be protected under
callback_mutex.

cpuset_clone() doesn't follow this rule. It's ok because it's
called when creating and initializing a cgroup, but we'd better
hold the lock to avoid subtil break in the future.

Signed-off-by: Li Zefan <[email protected]>
---
kernel/cpuset.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1e18d26..445573b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1840,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
cs = cgroup_cs(cgroup);
parent_cs = cgroup_cs(parent);

+ mutex_lock(&callback_mutex);
cs->mems_allowed = parent_cs->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+ mutex_unlock(&callback_mutex);
return;
}

--
1.7.3.1

2011-02-17 22:47:22

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

On Thu, 17 Feb 2011 09:50:09 +0800
Li Zefan <[email protected]> wrote:

> +/*
> + * In functions that can't propogate errno to users, to avoid declaring a
> + * nodemask_t variable, and avoid using NODEMASK_ALLOC that can return
> + * -ENOMEM, we use this global cpuset_mems.
> + *
> + * It should be used with cgroup_lock held.

I'll do s/should/must/ - that would be a nasty bug.

I'd be more comfortable about the maintainability of this optimisation
if we had

WARN_ON(!cgroup_is_locked());

at each site.

> + */
> +static nodemask_t cpuset_mems;

2011-02-17 23:35:59

by Paul Menage

[permalink] [raw]
Subject: Re: [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist()

On Wed, Feb 16, 2011 at 5:49 PM, Li Zefan <[email protected]> wrote:
> It's not necessary to copy cpuset->mems_allowed to a buffer
> allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf().
>
> Signed-off-by: Li Zefan <[email protected]>

Acked-by: Paul Menage <[email protected]>

The only downside is that we're now doing more work (and more complex
work) inside callback_mutex, but I guess that's OK compared to having
to do a memory allocation. (I poked around in lib/vsprintf.c and I
couldn't see any cases where it might allocate memory, but it would be
particularly bad if there was any way to trigger an Oops.)

> ---
> ?kernel/cpuset.c | ? 10 +---------
> ?1 files changed, 1 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 10f1835..f13ff2e 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1620,20 +1620,12 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
>
> ?static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
> ?{
> - ? ? ? NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
> ? ? ? ?int retval;
>
> - ? ? ? if (mask == NULL)
> - ? ? ? ? ? ? ? return -ENOMEM;
> -

And this was particularly broken since the only caller of
cpuset_sprintf_memlist() doesn't handle a negative error response
anyway and would then overwrite byte 4083 on the preceding page with a
'\n'. And then since the (size_t)(s-page) that's passed to
simple_read_from_buffer() would be a very large number, it would write
arbitrary (user-controlled) amounts of kernel data to the userspace
buffer.

Maybe we could also rename 'retval' to 'count' in this function (and
cpuset_sprintf_cpulist()) to make it clearer that callers don't expect
negative error values?

> ? ? ? ?mutex_lock(&callback_mutex);
> - ? ? ? *mask = cs->mems_allowed;
> + ? ? ? retval = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
> ? ? ? ?mutex_unlock(&callback_mutex);
>
> - ? ? ? retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
> -
> - ? ? ? NODEMASK_FREE(mask);
> -
> ? ? ? ?return retval;
> ?}
>
> --
> 1.7.3.1
>

2011-02-17 23:46:46

by Paul Menage

[permalink] [raw]
Subject: Re: [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch()

On Wed, Feb 16, 2011 at 5:49 PM, Li Zefan <[email protected]> wrote:
> oldcs->mems_allowed is not modified during cpuset_attch(), so
> we don't have to copy it to a buffer allocated by NODEMASK_ALLOC().
> Just pass it to cpuset_migrate_mm().
>
> Signed-off-by: Li Zefan <[email protected]>

I'd be inclined to skip this one - we're already allocating one
nodemask, so one more isn't really any extra complexity, and we're
doing horrendously complicated stuff in cpuset_migrate_mm() that's
much more likely to fail in low-memory situations.

It's true that mems_allowed can't change during the call to
cpuset_attach(), but that's due to the fact that both cgroup_attach()
and the cpuset.mems write paths take cgroup_mutex. I might prefer to
leave the allocated nodemask here and wrap callback_mutex around the
places in cpuset_attach() where we're reading from a cpuset's
mems_allowed - that would remove the implicit synchronization via
cgroup_mutex and leave the code a little more understandable.

> ---
> ?kernel/cpuset.c | ? ?7 ++-----
> ?1 files changed, 2 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index f13ff2e..70c9ca2 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1438,10 +1438,9 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> ? ? ? ?struct mm_struct *mm;
> ? ? ? ?struct cpuset *cs = cgroup_cs(cont);
> ? ? ? ?struct cpuset *oldcs = cgroup_cs(oldcont);
> - ? ? ? NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
> ? ? ? ?NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
>
> - ? ? ? if (from == NULL || to == NULL)
> + ? ? ? if (to == NULL)
> ? ? ? ? ? ? ? ?goto alloc_fail;
>
> ? ? ? ?if (cs == &top_cpuset) {
> @@ -1463,18 +1462,16 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> ? ? ? ?}
>
> ? ? ? ?/* change mm; only needs to be done once even if threadgroup */
> - ? ? ? *from = oldcs->mems_allowed;
> ? ? ? ?*to = cs->mems_allowed;
> ? ? ? ?mm = get_task_mm(tsk);
> ? ? ? ?if (mm) {
> ? ? ? ? ? ? ? ?mpol_rebind_mm(mm, to);
> ? ? ? ? ? ? ? ?if (is_memory_migrate(cs))
> - ? ? ? ? ? ? ? ? ? ? ? cpuset_migrate_mm(mm, from, to);
> + ? ? ? ? ? ? ? ? ? ? ? cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
> ? ? ? ? ? ? ? ?mmput(mm);
> ? ? ? ?}
>
> ?alloc_fail:
> - ? ? ? NODEMASK_FREE(from);
> ? ? ? ?NODEMASK_FREE(to);
> ?}
>
> --
> 1.7.3.1
>

2011-02-17 23:52:15

by Paul Menage

[permalink] [raw]
Subject: Re: [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone()

On Wed, Feb 16, 2011 at 5:50 PM, Li Zefan <[email protected]> wrote:
> Chaning cpuset->mems/cpuset->cpus should be protected under
> callback_mutex.
>
> cpuset_clone() doesn't follow this rule. It's ok because it's
> called when creating and initializing a cgroup, but we'd better
> hold the lock to avoid subtil break in the future.
>
> Signed-off-by: Li Zefan <[email protected]>

Acked-by: Paul Menage <[email protected]>

Patch title should be s/cpuset_clone/cpuset_post_clone/

> ---
> ?kernel/cpuset.c | ? ?2 ++
> ?1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 1e18d26..445573b 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1840,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
> ? ? ? ?cs = cgroup_cs(cgroup);
> ? ? ? ?parent_cs = cgroup_cs(parent);
>
> + ? ? ? mutex_lock(&callback_mutex);
> ? ? ? ?cs->mems_allowed = parent_cs->mems_allowed;
> ? ? ? ?cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
> + ? ? ? mutex_unlock(&callback_mutex);
> ? ? ? ?return;
> ?}
>
> --
> 1.7.3.1
>

2011-02-17 23:57:20

by Paul Menage

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

On Thu, Feb 17, 2011 at 2:46 PM, Andrew Morton
<[email protected]> wrote:
> On Thu, 17 Feb 2011 09:50:09 +0800
> Li Zefan <[email protected]> wrote:
>
>> +/*
>> + * In functions that can't propogate errno to users, to avoid declaring a
>> + * nodemask_t variable, and avoid using NODEMASK_ALLOC that can return
>> + * -ENOMEM, we use this global cpuset_mems.
>> + *
>> + * It should be used with cgroup_lock held.
>
> I'll do s/should/must/ - that would be a nasty bug.
>
> I'd be more comfortable about the maintainability of this optimisation
> if we had
>
> ? ? ? ?WARN_ON(!cgroup_is_locked());
>
> at each site.
>

Agreed - that was my first thought on reading the patch. How about:

static nodemask_t *cpuset_static_nodemask() {
static nodemask_t nodemask;
WARN_ON(!cgroup_is_locked());
return &nodemask;
}

and then just call cpuset_static_nodemask() in the various locations
being patched?

Paul

2011-02-18 02:21:33

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist()

Paul Menage wrote:
> On Wed, Feb 16, 2011 at 5:49 PM, Li Zefan <[email protected]> wrote:
>> It's not necessary to copy cpuset->mems_allowed to a buffer
>> allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf().
>>
>> Signed-off-by: Li Zefan <[email protected]>
>
> Acked-by: Paul Menage <[email protected]>
>
> The only downside is that we're now doing more work (and more complex
> work) inside callback_mutex, but I guess that's OK compared to having
> to do a memory allocation. (I poked around in lib/vsprintf.c and I
> couldn't see any cases where it might allocate memory, but it would be
> particularly bad if there was any way to trigger an Oops.)
>
>> ---
>> kernel/cpuset.c | 10 +---------
>> 1 files changed, 1 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
>> index 10f1835..f13ff2e 100644
>> --- a/kernel/cpuset.c
>> +++ b/kernel/cpuset.c
>> @@ -1620,20 +1620,12 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
>>
>> static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
>> {
>> - NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
>> int retval;
>>
>> - if (mask == NULL)
>> - return -ENOMEM;
>> -
>
> And this was particularly broken since the only caller of
> cpuset_sprintf_memlist() doesn't handle a negative error response
> anyway and would then overwrite byte 4083 on the preceding page with a
> '\n'. And then since the (size_t)(s-page) that's passed to
> simple_read_from_buffer() would be a very large number, it would write
> arbitrary (user-controlled) amounts of kernel data to the userspace
> buffer.
>
> Maybe we could also rename 'retval' to 'count' in this function (and
> cpuset_sprintf_cpulist()) to make it clearer that callers don't expect
> negative error values?
>

Good spot!

2011-02-18 02:36:27

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch()

Paul Menage wrote:
> On Wed, Feb 16, 2011 at 5:49 PM, Li Zefan <[email protected]> wrote:
>> oldcs->mems_allowed is not modified during cpuset_attch(), so
>> we don't have to copy it to a buffer allocated by NODEMASK_ALLOC().
>> Just pass it to cpuset_migrate_mm().
>>
>> Signed-off-by: Li Zefan <[email protected]>
>
> I'd be inclined to skip this one - we're already allocating one
> nodemask, so one more isn't really any extra complexity, and we're
> doing horrendously complicated stuff in cpuset_migrate_mm() that's
> much more likely to fail in low-memory situations.

That's true, but it's not a reason to add more cases that can fail.

>
> It's true that mems_allowed can't change during the call to

Sorry to lead you to mistake what I meant. I meant 'from' is not modified
after it's copied from oldcs->mems_allowed, so the two are exactly the
same and thus we only need one.

> cpuset_attach(), but that's due to the fact that both cgroup_attach()
> and the cpuset.mems write paths take cgroup_mutex. I might prefer to
> leave the allocated nodemask here and wrap callback_mutex around the
> places in cpuset_attach() where we're reading from a cpuset's
> mems_allowed - that would remove the implicit synchronization via
> cgroup_mutex and leave the code a little more understandable.

It's not an implicit synchronization, but instead the lock rule for
reading/writing a cpuset's mems/cpus is described in the comment.

>
>> ---
>> kernel/cpuset.c | 7 ++-----
>> 1 files changed, 2 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
>> index f13ff2e..70c9ca2 100644
>> --- a/kernel/cpuset.c
>> +++ b/kernel/cpuset.c
>> @@ -1438,10 +1438,9 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
>> struct mm_struct *mm;
>> struct cpuset *cs = cgroup_cs(cont);
>> struct cpuset *oldcs = cgroup_cs(oldcont);
>> - NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
>> NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
>>
>> - if (from == NULL || to == NULL)
>> + if (to == NULL)
>> goto alloc_fail;
>>
>> if (cs == &top_cpuset) {
>> @@ -1463,18 +1462,16 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
>> }
>>
>> /* change mm; only needs to be done once even if threadgroup */
>> - *from = oldcs->mems_allowed;
>> *to = cs->mems_allowed;
>> mm = get_task_mm(tsk);
>> if (mm) {
>> mpol_rebind_mm(mm, to);
>> if (is_memory_migrate(cs))
>> - cpuset_migrate_mm(mm, from, to);
>> + cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
>> mmput(mm);
>> }
>>
>> alloc_fail:
>> - NODEMASK_FREE(from);
>> NODEMASK_FREE(to);
>> }
>>
>> --
>> 1.7.3.1
>>
>

2011-02-18 02:46:36

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

Paul Menage wrote:
> On Thu, Feb 17, 2011 at 2:46 PM, Andrew Morton
> <[email protected]> wrote:
>> On Thu, 17 Feb 2011 09:50:09 +0800
>> Li Zefan <[email protected]> wrote:
>>
>>> +/*
>>> + * In functions that can't propogate errno to users, to avoid declaring a
>>> + * nodemask_t variable, and avoid using NODEMASK_ALLOC that can return
>>> + * -ENOMEM, we use this global cpuset_mems.
>>> + *
>>> + * It should be used with cgroup_lock held.
>>
>> I'll do s/should/must/ - that would be a nasty bug.
>>
>> I'd be more comfortable about the maintainability of this optimisation
>> if we had
>>
>> WARN_ON(!cgroup_is_locked());
>>
>> at each site.
>>
>
> Agreed - that was my first thought on reading the patch. How about:
>
> static nodemask_t *cpuset_static_nodemask() {

Then this should be 'noinline', otherwise we'll have one copy for each
function that calls it.

> static nodemask_t nodemask;
> WARN_ON(!cgroup_is_locked());
> return &nodemask;
> }
>
> and then just call cpuset_static_nodemask() in the various locations
> being patched?
>

I think a defect of this is people might call it twice in one function
but don't know it returns the same variable?

For example in cpuset_attach():

void cpuset_attach(...)
{
nodemask_t *from = cpuset_static_nodemask();
nodemask_t *to = cpuset_static_nodemask();
...
}

2011-02-19 02:29:19

by Paul Menage

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

On Thu, Feb 17, 2011 at 6:47 PM, Li Zefan <[email protected]> wrote:
>
> I think a defect of this is people might call it twice in one function
> but don't know it returns the same variable?

Hopefully they'd read the comments...

But it's not a big issue either way - having the WARN_ON() statements
in front of each use works OK too, given that there are only a few of
them.

Paul

2011-02-20 01:51:42

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone()

On Thu, 17 Feb 2011, Li Zefan wrote:

> Chaning cpuset->mems/cpuset->cpus should be protected under
> callback_mutex.
>
> cpuset_clone() doesn't follow this rule. It's ok because it's
> called when creating and initializing a cgroup, but we'd better
> hold the lock to avoid subtil break in the future.
>
> Signed-off-by: Li Zefan <[email protected]>

Acked-by: David Rientjes <[email protected]>

2011-02-20 01:52:09

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

On Thu, 17 Feb 2011, Li Zefan wrote:

> Those functions that use NODEMASK_ALLOC() can't propogate errno
> to users, but will fail silently.
>
> Since all of them are called with cgroup_mutex held, here we use
> a global nodemask_t variable.
>
> Signed-off-by: Li Zefan <[email protected]>

I like the idea and the comment is explicit enough that we don't need any
refcounting to ensure double usage under cgroup_lock. I think each
function should be modified to use cpuset_mems directly, though, instead
of defining local variables that indirectly access it which only serves to
make this patch smaller. Then we can ensure that all occurrences of
cpuset_mems appear within the lock without being concerned about other
references.

2011-02-20 01:52:27

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch()

On Thu, 17 Feb 2011, Li Zefan wrote:

> oldcs->mems_allowed is not modified during cpuset_attch(), so
> we don't have to copy it to a buffer allocated by NODEMASK_ALLOC().
> Just pass it to cpuset_migrate_mm().
>
> Signed-off-by: Li Zefan <[email protected]>

Acked-by: David Rientjes <[email protected]>

2011-02-20 01:52:58

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist()

On Thu, 17 Feb 2011, Li Zefan wrote:

> It's not necessary to copy cpuset->mems_allowed to a buffer
> allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf().
>
> Signed-off-by: Li Zefan <[email protected]>

Acked-by: David Rientjes <[email protected]>

2011-02-21 03:19:10

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

David Rientjes wrote:
> On Thu, 17 Feb 2011, Li Zefan wrote:
>
>> Those functions that use NODEMASK_ALLOC() can't propogate errno
>> to users, but will fail silently.
>>
>> Since all of them are called with cgroup_mutex held, here we use
>> a global nodemask_t variable.
>>
>> Signed-off-by: Li Zefan <[email protected]>
>
> I like the idea and the comment is explicit enough that we don't need any
> refcounting to ensure double usage under cgroup_lock. I think each
> function should be modified to use cpuset_mems directly, though, instead
> of defining local variables that indirectly access it which only serves to
> make this patch smaller. Then we can ensure that all occurrences of
> cpuset_mems appear within the lock without being concerned about other
> references.
>

Unfortunately, as I looked into the code again I found cpuset_change_nodemask()
is called by other functions that use the global cpuset_mems, so I
think we'd better check the refcnt of cpuset_mems.

How about this:

[PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

Those functions that use NODEMASK_ALLOC() can't propogate errno
to users, so might fail silently.

Based on the fact that all of them are called with cgroup_mutex
held, we fix this by using a global nodemask.

cpuset_change_nodemask() is an exception, because it's called
by other functions. Here we declare a static nodemask in the
function for its own use.

Signed-off-by: Li Zefan <[email protected]>
---
kernel/cpuset.c | 82 +++++++++++++++++++++++++++++++++----------------------
1 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 70c9ca2..da620d2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -79,6 +79,38 @@ int number_of_cpusets __read_mostly;
struct cgroup_subsys cpuset_subsys;
struct cpuset;

+static nodemask_t cpuset_mems;
+static int cpuset_mems_ref;
+
+/*
+ * In functions that can't propagate errno to users, to avoid declaring a
+ * nodemask_t variable in stack, and avoid using NODEMASK_ALLOC that can
+ * return -ENOMEM, we use a global cpuset_mems.
+ *
+ * It must be used with cgroup_lock held.
+ */
+static nodemask_t *cpuset_static_nodemask(void)
+{
+ WARN_ON(!cgroup_lock_is_held());
+ WARN_ON(cpuset_mems_ref);
+
+ cpuset_mems_ref++;
+ return &cpuset_mems;
+}
+
+/*
+ * Calling cpuset_static_nodemask() should be paired with this function,
+ * so we insure the global nodemask won't be used by more than one user
+ * at the one time.
+ */
+static void cpuset_release_static_nodemask(void)
+{
+ WARN_ON(!cgroup_lock_is_held());
+
+ cpuset_mems_ref--;
+ WARN_ON(!cpuset_mems_ref);
+}
+
/* See "Frequency meter" comments, below. */

struct fmeter {
@@ -1015,17 +1047,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
- if (!newmems)
- return;
+ static nodemask_t newmems; /* protected by cgroup_mutex */

cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, newmems);
-
- cpuset_change_task_nodemask(p, newmems);
+ guarantee_online_mems(cs, &newmems);

- NODEMASK_FREE(newmems);
+ cpuset_change_task_nodemask(p, &newmems);

mm = get_task_mm(p);
if (!mm)
@@ -1096,13 +1123,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
- NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
+ nodemask_t *oldmem = cpuset_static_nodemask();
int retval;
struct ptr_heap heap;

- if (!oldmem)
- return -ENOMEM;
-
/*
* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
* it's read-only
@@ -1152,7 +1176,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,

heap_free(&heap);
done:
- NODEMASK_FREE(oldmem);
+ cpuset_release_static_nodemask();
return retval;
}

@@ -1438,10 +1462,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
- if (to == NULL)
- goto alloc_fail;
+ nodemask_t *to = cpuset_static_nodemask();

if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1461,18 +1482,17 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
rcu_read_unlock();
}

+ cpuset_release_static_nodemask();
+
/* change mm; only needs to be done once even if threadgroup */
- *to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, to);
+ mpol_rebind_mm(mm, &cs->mems_allowed);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
+ cpuset_migrate_mm(mm, &oldcs->mems_allowed,
+ &cs->mems_allowed);
mmput(mm);
}
-
-alloc_fail:
- NODEMASK_FREE(to);
}

/* The various types of files and directories in a cpuset file system */
@@ -2051,10 +2071,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return;
+ nodemask_t *oldmems = cpuset_static_nodemask();

list_add_tail((struct list_head *)&root->stack_list, &queue);

@@ -2090,7 +2107,8 @@ static void scan_for_empty_cpusets(struct cpuset *root)
update_tasks_nodemask(cp, oldmems, NULL);
}
}
- NODEMASK_FREE(oldmems);
+
+ cpuset_release_static_nodemask();
}

/*
@@ -2132,19 +2150,18 @@ void cpuset_update_active_cpus(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return NOTIFY_DONE;
+ nodemask_t *oldmems;

cgroup_lock();
switch (action) {
case MEM_ONLINE:
+ oldmems = cpuset_static_nodemask();
*oldmems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+ cpuset_release_static_nodemask();
break;
case MEM_OFFLINE:
/*
@@ -2158,7 +2175,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
}
cgroup_unlock();

- NODEMASK_FREE(oldmems);
return NOTIFY_OK;
}
#endif
--
1.7.3.1

2011-02-21 05:29:08

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

> +/*
> + * Calling cpuset_static_nodemask() should be paired with this function,
> + * so we insure the global nodemask won't be used by more than one user
> + * at the one time.
> + */
> +static void cpuset_release_static_nodemask(void)
> +{
> + WARN_ON(!cgroup_lock_is_held());
> +
> + cpuset_mems_ref--;
> + WARN_ON(!cpuset_mems_ref);

WARN_ON(cpuset_mems_ref);

> +}

2011-02-22 00:25:24

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

On Mon, 21 Feb 2011, Li Zefan wrote:

> Unfortunately, as I looked into the code again I found cpuset_change_nodemask()
> is called by other functions that use the global cpuset_mems, so I
> think we'd better check the refcnt of cpuset_mems.
>
> How about this:
>
> [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
>
> Those functions that use NODEMASK_ALLOC() can't propogate errno
> to users, so might fail silently.
>
> Based on the fact that all of them are called with cgroup_mutex
> held, we fix this by using a global nodemask.
>

If all of the functions that require a nodemask are protected by
cgroup_mutex, then I think it would be much better to just statically
allocate them within the function and avoid any nodemask in file scope.
cpuset_mems cannot be shared so introducing it with a refcount would
probably just be confusing.

2011-02-22 02:14:18

by Li Zefan

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

David Rientjes wrote:
> On Mon, 21 Feb 2011, Li Zefan wrote:
>
>> Unfortunately, as I looked into the code again I found cpuset_change_nodemask()
>> is called by other functions that use the global cpuset_mems, so I
>> think we'd better check the refcnt of cpuset_mems.
>>
>> How about this:
>>
>> [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
>>
>> Those functions that use NODEMASK_ALLOC() can't propogate errno
>> to users, so might fail silently.
>>
>> Based on the fact that all of them are called with cgroup_mutex
>> held, we fix this by using a global nodemask.
>>
>
> If all of the functions that require a nodemask are protected by
> cgroup_mutex, then I think it would be much better to just statically
> allocate them within the function and avoid any nodemask in file scope.
> cpuset_mems cannot be shared so introducing it with a refcount would
> probably just be confusing.
>

I'll repost the patchset after you ack this patch.

[PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

Those functions that use NODEMASK_ALLOC() can't propogate errno
to users, so might fail silently.

Fix it by using one static nodemask_t variable for each function, and
those variables are protected by cgroup_mutex.

Signed-off-by: Li Zefan <[email protected]>
---
kernel/cpuset.c | 50 ++++++++++++++++----------------------------------
1 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8fef8c6..073ce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
- if (!newmems)
- return;
+ static nodemask_t newmems; /* protected by cgroup_mutex */

cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, newmems);
-
- cpuset_change_task_nodemask(p, newmems);
+ guarantee_online_mems(cs, &newmems);

- NODEMASK_FREE(newmems);
+ cpuset_change_task_nodemask(p, &newmems);

mm = get_task_mm(p);
if (!mm)
@@ -1438,41 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
- if (to == NULL)
- goto alloc_fail;
+ static nodemask_t to; /* protected by cgroup_mutex */

if (cs == &top_cpuset) {
cpumask_copy(cpus_attach, cpu_possible_mask);
} else {
guarantee_online_cpus(cs, cpus_attach);
}
- guarantee_online_mems(cs, to);
+ guarantee_online_mems(cs, &to);

/* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, to, cs);
+ cpuset_attach_task(tsk, &to, cs);
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, to, cs);
+ cpuset_attach_task(c, &to, cs);
}
rcu_read_unlock();
}

/* change mm; only needs to be done once even if threadgroup */
- *to = cs->mems_allowed;
+ to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, to);
+ mpol_rebind_mm(mm, &to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
+ cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
mmput(mm);
}
-
-alloc_fail:
- NODEMASK_FREE(to);
}

/* The various types of files and directories in a cpuset file system */
@@ -2051,10 +2040,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */

list_add_tail((struct list_head *)&root->stack_list, &queue);

@@ -2071,7 +2057,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;

- *oldmems = cp->mems_allowed;
+ oldmems = cp->mems_allowed;

/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
@@ -2087,10 +2073,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, oldmems, NULL);
+ update_tasks_nodemask(cp, &oldmems, NULL);
}
}
- NODEMASK_FREE(oldmems);
}

/*
@@ -2132,19 +2117,16 @@ void cpuset_update_active_cpus(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return NOTIFY_DONE;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */

cgroup_lock();
switch (action) {
case MEM_ONLINE:
- *oldmems = top_cpuset.mems_allowed;
+ oldmems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+ update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
break;
case MEM_OFFLINE:
/*
--
1.7.3.1

2011-02-22 20:30:24

by David Rientjes

[permalink] [raw]
Subject: Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

On Tue, 22 Feb 2011, Li Zefan wrote:

> [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
>
> Those functions that use NODEMASK_ALLOC() can't propogate errno
> to users, so might fail silently.
>
> Fix it by using one static nodemask_t variable for each function, and
> those variables are protected by cgroup_mutex.
>

I think there would also be incentive to do the same thing for
update_nodemask() even though its caller can handle -ENOMEM. Imagine
current being out of memory and the NODEMASK_ALLOC() subsequently failing
because it is oom yet it may be trying to give itself more memory. It's
also protected by cgroup_mutex so the only thing we're sacrificing is 8
bytes on the defconfig and 256 bytes even with CONFIG_NODES_SHIFT == 10.
On machines that large, this seems like an acceptable cost to ensure we
can give ourselves more memory :)

> Signed-off-by: Li Zefan <[email protected]>
> ---
> kernel/cpuset.c | 50 ++++++++++++++++----------------------------------
> 1 files changed, 16 insertions(+), 34 deletions(-)
>
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 8fef8c6..073ce91 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
> struct cpuset *cs;
> int migrate;
> const nodemask_t *oldmem = scan->data;
> - NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
> -
> - if (!newmems)
> - return;
> + static nodemask_t newmems; /* protected by cgroup_mutex */
>
> cs = cgroup_cs(scan->cg);
> - guarantee_online_mems(cs, newmems);
> -
> - cpuset_change_task_nodemask(p, newmems);
> + guarantee_online_mems(cs, &newmems);

The newmems nodemask is going to be persistant across calls since it is
static, so we have to be careful that nothing depends on it being
NODE_MASK_NONE. Indeed, NODEMASK_ALLOC() with just GFP_KERNEL doesn't
guarantee anything different. guarantee_online_mems() sets the nodemask,
so this looks good.

>
> - NODEMASK_FREE(newmems);
> + cpuset_change_task_nodemask(p, &newmems);
>
> mm = get_task_mm(p);
> if (!mm)
> @@ -1438,41 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
> struct mm_struct *mm;
> struct cpuset *cs = cgroup_cs(cont);
> struct cpuset *oldcs = cgroup_cs(oldcont);
> - NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
> -
> - if (to == NULL)
> - goto alloc_fail;
> + static nodemask_t to; /* protected by cgroup_mutex */
>
> if (cs == &top_cpuset) {
> cpumask_copy(cpus_attach, cpu_possible_mask);
> } else {
> guarantee_online_cpus(cs, cpus_attach);
> }
> - guarantee_online_mems(cs, to);
> + guarantee_online_mems(cs, &to);
>
> /* do per-task migration stuff possibly for each in the threadgroup */
> - cpuset_attach_task(tsk, to, cs);
> + cpuset_attach_task(tsk, &to, cs);
> if (threadgroup) {
> struct task_struct *c;
> rcu_read_lock();
> list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> - cpuset_attach_task(c, to, cs);
> + cpuset_attach_task(c, &to, cs);
> }
> rcu_read_unlock();
> }
>
> /* change mm; only needs to be done once even if threadgroup */
> - *to = cs->mems_allowed;
> + to = cs->mems_allowed;
> mm = get_task_mm(tsk);
> if (mm) {
> - mpol_rebind_mm(mm, to);
> + mpol_rebind_mm(mm, &to);
> if (is_memory_migrate(cs))
> - cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
> + cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
> mmput(mm);
> }
> -
> -alloc_fail:
> - NODEMASK_FREE(to);
> }
>
> /* The various types of files and directories in a cpuset file system */
> @@ -2051,10 +2040,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
> struct cpuset *cp; /* scans cpusets being updated */
> struct cpuset *child; /* scans child cpusets of cp */
> struct cgroup *cont;
> - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
> -
> - if (oldmems == NULL)
> - return;
> + static nodemask_t oldmems; /* protected by cgroup_mutex */
>
> list_add_tail((struct list_head *)&root->stack_list, &queue);
>
> @@ -2071,7 +2057,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
> nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
> continue;
>
> - *oldmems = cp->mems_allowed;
> + oldmems = cp->mems_allowed;
>
> /* Remove offline cpus and mems from this cpuset. */
> mutex_lock(&callback_mutex);
> @@ -2087,10 +2073,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
> remove_tasks_in_empty_cpuset(cp);
> else {
> update_tasks_cpumask(cp, NULL);
> - update_tasks_nodemask(cp, oldmems, NULL);
> + update_tasks_nodemask(cp, &oldmems, NULL);
> }
> }
> - NODEMASK_FREE(oldmems);
> }
>
> /*
> @@ -2132,19 +2117,16 @@ void cpuset_update_active_cpus(void)
> static int cpuset_track_online_nodes(struct notifier_block *self,
> unsigned long action, void *arg)
> {
> - NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
> -
> - if (oldmems == NULL)
> - return NOTIFY_DONE;
> + static nodemask_t oldmems; /* protected by cgroup_mutex */
>
> cgroup_lock();
> switch (action) {
> case MEM_ONLINE:
> - *oldmems = top_cpuset.mems_allowed;
> + oldmems = top_cpuset.mems_allowed;
> mutex_lock(&callback_mutex);
> top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
> mutex_unlock(&callback_mutex);
> - update_tasks_nodemask(&top_cpuset, oldmems, NULL);
> + update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
> break;
> case MEM_OFFLINE:
> /*

The NODEMASK_FREE() wasn't removed from cpuset_track_online_nodes().
After that's fixed:

Acked-by: David Rientjes <[email protected]>