2012-05-01 22:28:50

by Suleiman Souhlal

[permalink] [raw]
Subject: Re: [RFC][PATCH 9/9 v2] memcg: never return error at pre_destroy()

2012/4/26 KAMEZAWA Hiroyuki <[email protected]>:
> When force_empty() called by ->pre_destroy(), no memory reclaim happens
> and it doesn't take very long time which requires signal_pending() check.
> And if we return -EINTR from pre_destroy(), cgroup.c show warning.
>
> This patch removes signal check in force_empty(). By this, ->pre_destroy()
> returns success always.
>
> Note: check for 'cgroup is empty' remains for force_empty interface.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
>  mm/hugetlb.c    |   10 +---------
>  mm/memcontrol.c |   14 +++++---------
>  2 files changed, 6 insertions(+), 18 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 4dd6b39..770f1642 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1922,20 +1922,12 @@ int hugetlb_force_memcg_empty(struct cgroup *cgroup)
>         int ret = 0, idx = 0;
>
>         do {
> +               /* see memcontrol.c::mem_cgroup_force_empty() */
>                 if (cgroup_task_count(cgroup)
>                         || !list_empty(&cgroup->children)) {
>                         ret = -EBUSY;
>                         goto out;
>                 }
> -               /*
> -                * If the task doing the cgroup_rmdir got a signal
> -                * we don't really need to loop till the hugetlb resource
> -                * usage become zero.
> -                */
> -               if (signal_pending(current)) {
> -                       ret = -EINTR;
> -                       goto out;
> -               }
>                 for_each_hstate(h) {
>                         spin_lock(&hugetlb_lock);
>                         list_for_each_entry(page, &h->hugepage_activelist, lru) {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2715223..ee350c5 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3852,8 +3852,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
>                 pc = lookup_page_cgroup(page);
>
>                 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
> -               if (ret == -ENOMEM || ret == -EINTR)
> -                       break;
>
>                 if (ret == -EBUSY || ret == -EINVAL) {
>                         /* found lock contention or "pc" is obsolete. */
> @@ -3863,7 +3861,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
>                         busy = NULL;
>         }
>
> -       if (!ret && !list_empty(list))
> +       if (!loop)
>                 return -EBUSY;
>         return ret;
>  }
> @@ -3893,11 +3891,12 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
>  move_account:
>         do {
>                 ret = -EBUSY;
> +               /*
> +                * This never happens when this is called by ->pre_destroy().
> +                * But we need to take care of force_empty interface.
> +                */
>                 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
>                         goto out;

Are you sure this never happens when called by ->pre_destroy()?
Can't a task still get attached to the cgroup while ->pre_destroy() is running?

At least, I don't see anything in the cgroup code that prevents
someone from newly attaching a task at that point.
In fact, there is code that seems to handle the case when someone
attached to the cgroup after pre_destroy() has run: See the
cgroup_wakeup_rmdir_waiter() call in cgroup_attach_task().

-- Suleiman


2012-05-02 03:34:38

by Hiroyuki Kamezawa

[permalink] [raw]
Subject: Re: [RFC][PATCH 9/9 v2] memcg: never return error at pre_destroy()

On Wed, May 2, 2012 at 7:28 AM, Suleiman Souhlal <[email protected]> wrote:
> 2012/4/26 KAMEZAWA Hiroyuki <[email protected]>:
>> When force_empty() called by ->pre_destroy(), no memory reclaim happens
>> and it doesn't take very long time which requires signal_pending() check.
>> And if we return -EINTR from pre_destroy(), cgroup.c show warning.
>>
>> This patch removes signal check in force_empty(). By this, ->pre_destroy()
>> returns success always.
>>
>> Note: check for 'cgroup is empty' remains for force_empty interface.
>>
>> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>> ---
>>  mm/hugetlb.c    |   10 +---------
>>  mm/memcontrol.c |   14 +++++---------
>>  2 files changed, 6 insertions(+), 18 deletions(-)
>>
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index 4dd6b39..770f1642 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -1922,20 +1922,12 @@ int hugetlb_force_memcg_empty(struct cgroup *cgroup)
>>         int ret = 0, idx = 0;
>>
>>         do {
>> +               /* see memcontrol.c::mem_cgroup_force_empty() */
>>                 if (cgroup_task_count(cgroup)
>>                         || !list_empty(&cgroup->children)) {
>>                         ret = -EBUSY;
>>                         goto out;
>>                 }
>> -               /*
>> -                * If the task doing the cgroup_rmdir got a signal
>> -                * we don't really need to loop till the hugetlb resource
>> -                * usage become zero.
>> -                */
>> -               if (signal_pending(current)) {
>> -                       ret = -EINTR;
>> -                       goto out;
>> -               }
>>                 for_each_hstate(h) {
>>                         spin_lock(&hugetlb_lock);
>>                         list_for_each_entry(page, &h->hugepage_activelist, lru) {
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 2715223..ee350c5 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -3852,8 +3852,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
>>                 pc = lookup_page_cgroup(page);
>>
>>                 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
>> -               if (ret == -ENOMEM || ret == -EINTR)
>> -                       break;
>>
>>                 if (ret == -EBUSY || ret == -EINVAL) {
>>                         /* found lock contention or "pc" is obsolete. */
>> @@ -3863,7 +3861,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
>>                         busy = NULL;
>>         }
>>
>> -       if (!ret && !list_empty(list))
>> +       if (!loop)
>>                 return -EBUSY;
>>         return ret;
>>  }
>> @@ -3893,11 +3891,12 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
>>  move_account:
>>         do {
>>                 ret = -EBUSY;
>> +               /*
>> +                * This never happens when this is called by ->pre_destroy().
>> +                * But we need to take care of force_empty interface.
>> +                */
>>                 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
>>                         goto out;
>
> Are you sure this never happens when called by ->pre_destroy()?
> Can't a task still get attached to the cgroup while ->pre_destroy() is running?
>
See the whole patch series: patches 7 & 8 address that problem.
But they will be dropped, so this race will remain. This patch's
title will be
changed to "remove -EINTR" rather than "remove failure of pre_destroy()".
pre_destroy() will continue to fail until the cgroup core is fixed.

Thanks,
-Kame