Updated from previous one.
- Updated comments.
- add cgroup_exclude_rmdir()/cgroup_release_rmdir().
Patch 1/2 is tested by Nishimura (look is modified but the same algorithm..)
Patch 2/2 is a new. but no difficulty.
Thank you for all your helps
Regards,
-Kame
From: KAMEZAWA Hiroyuki <[email protected]>
After commit: cgroup: fix frequent -EBUSY at rmdir
ec64f51545fffbc4cb968f0cea56341a4b07e85a
cgroup's rmdir (especially against memcg) doesn't return -EBUSY
by temporal ref counts. That commit expects all refs after pre_destroy()
is temporary but...it wasn't. Then, rmdir can wait permanently.
This patch tries to fix that and change followings.
- set CGRP_WAIT_ON_RMDIR flag before pre_destroy().
- clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case.
if there are sleeping ones, wakes them up.
- rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set.
Changelog v3->v4:
- rewrite/add comments.
- rename cgroup_wakeup_rmdir_waiters() to cgroup_wakeup_rmdir_waiter().
Changelog v2->v3:
- removed retry_rmdir() callback.
- make use of CGRP_WAIT_ON_RMDIR flag more.
Tested-by: Daisuke Nishimura <[email protected]>
Reported-by: Daisuke Nishimura <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/cgroup.h | 13 +++++++++++++
kernel/cgroup.c | 44 ++++++++++++++++++++++++++------------------
mm/memcontrol.c | 25 +++++++++++++++++++++++--
3 files changed, 62 insertions(+), 20 deletions(-)
Index: mmotm-2.6.31-Jun25/include/linux/cgroup.h
===================================================================
--- mmotm-2.6.31-Jun25.orig/include/linux/cgroup.h
+++ mmotm-2.6.31-Jun25/include/linux/cgroup.h
@@ -366,6 +366,19 @@ int cgroup_task_count(const struct cgrou
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
/*
+ * Allow to use CGRP_WAIT_ON_RMDIR flag to check race with rmdir() for subsys.
+ * Subsys can call this function if it's necessary to call pre_destroy() again
+ * because it adds not-temporary refs to css after or while pre_destroy().
+ * The caller of this function should use css_tryget(), too.
+ */
+void __cgroup_wakeup_rmdir_waiters(void);
+static inline void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
+{
+ if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+ __cgroup_wakeup_rmdir_waiters();
+}
+
+/*
* Control Group subsystem type.
* See Documentation/cgroups/cgroups.txt for details
*/
Index: mmotm-2.6.31-Jun25/kernel/cgroup.c
===================================================================
--- mmotm-2.6.31-Jun25.orig/kernel/cgroup.c
+++ mmotm-2.6.31-Jun25/kernel/cgroup.c
@@ -734,14 +734,13 @@ static void cgroup_d_remove_dir(struct d
* reference to css->refcnt. In general, this refcnt is expected to goes down
* to zero, soon.
*
- * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+void __cgroup_wakeup_rmdir_waiters(void)
{
- if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
+ wake_up_all(&cgroup_rmdir_waitq);
}
static int rebind_subsystems(struct cgroupfs_root *root,
@@ -1357,7 +1356,7 @@ int cgroup_attach_task(struct cgroup *cg
* wake up rmdir() waiter. the rmdir should fail since the cgroup
* is no longer empty.
*/
- cgroup_wakeup_rmdir_waiters(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
return 0;
}
@@ -2696,33 +2695,42 @@ again:
mutex_unlock(&cgroup_mutex);
/*
+ * In general, subsystem has no css->refcnt after pre_destroy(). But
+ * in racy cases, subsystem may have to get css->refcnt after
+ * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
+ * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
+ * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
+ * and subsystem's reference count handling. Please see css_get/put
+ * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ */
+ set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+ /*
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
ret = cgroup_call_pre_destroy(cgrp);
- if (ret)
+ if (ret) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
return ret;
+ }
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
- /*
- * css_put/get is provided for subsys to grab refcnt to css. In typical
- * case, subsystem has no reference after pre_destroy(). But, under
- * hierarchy management, some *temporal* refcnt can be hold.
- * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
- * is really busy, it should return -EBUSY at pre_destroy(). wake_up
- * is called when css_put() is called and refcnt goes down to 0.
- */
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-
if (!cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
- schedule();
+ /*
+ * Because someone may call cgroup_wakeup_rmdir_waiter() before
+ * prepare_to_wait(), we need to check this flag.
+ */
+ if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
+ schedule();
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
if (signal_pending(current))
@@ -3294,7 +3302,7 @@ void __css_put(struct cgroup_subsys_stat
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiters(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
}
rcu_read_unlock();
}
Index: mmotm-2.6.31-Jun25/mm/memcontrol.c
===================================================================
--- mmotm-2.6.31-Jun25.orig/mm/memcontrol.c
+++ mmotm-2.6.31-Jun25/mm/memcontrol.c
@@ -1234,6 +1234,12 @@ static int mem_cgroup_move_account(struc
ret = 0;
out:
unlock_page_cgroup(pc);
+ /*
+ * We charges against "to" which may not have any tasks. Then, "to"
+ * can be under rmdir(). But in current implementation, caller of
+ * this function is just force_empty() and it's garanteed that
+ * "to" is never removed. So, we don't check rmdir status here.
+ */
return ret;
}
@@ -1455,6 +1461,7 @@ __mem_cgroup_commit_charge_swapin(struct
return;
if (!ptr)
return;
+ css_get(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
__mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1484,7 +1491,13 @@ __mem_cgroup_commit_charge_swapin(struct
}
rcu_read_unlock();
}
- /* add this page(page_cgroup) to the LRU we want. */
+ /*
+ * At swapin, we may charge account against cgroup which has no tasks.
+ * So, rmdir()->pre_destroy() can be called while we do this charge.
+ * In that case, we need to call pre_destroy() again. check it here.
+ */
+ cgroup_wakeup_rmdir_waiter(ptr->css.cgroup);
+ css_put(&ptr->css);
}
@@ -1691,7 +1704,7 @@ void mem_cgroup_end_migration(struct mem
if (!mem)
return;
-
+ css_get(&mem->css);
/* at migration success, oldpage->mapping is NULL. */
if (oldpage->mapping) {
target = oldpage;
@@ -1731,6 +1744,14 @@ void mem_cgroup_end_migration(struct mem
*/
if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
mem_cgroup_uncharge_page(target);
+ /*
+ * At migration, we may charge account against cgroup which has no tasks
+ * So, rmdir()->pre_destroy() can be called while we do this charge.
+ * In that case, we need to call pre_destroy() again. check it here.
+ */
+ cgroup_wakeup_rmdir_waiter(mem->css.cgroup);
+ css_put(&mem->css);
+
}
/*
From: KAMEZAWA Hiroyuki <[email protected]>
Paul Menage pointed out that css_get()/put() only for avoiding race with
rmdir() is complicated and these should be treated as it is for.
This adds
- cgroup_exclude_rmdir() ....prevent rmdir() for a while.
- cgroup_release_rmdir() ....rerun rmdir() if necessary.
And hides cgroup_wakeup_rmdir_waiter() into kernel/cgroup.c, again.
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/cgroup.h | 21 +++++++++++----------
kernel/cgroup.c | 17 +++++++++++++++--
mm/memcontrol.c | 12 ++++--------
3 files changed, 30 insertions(+), 20 deletions(-)
Index: mmotm-2.6.31-Jun25/include/linux/cgroup.h
===================================================================
--- mmotm-2.6.31-Jun25.orig/include/linux/cgroup.h
+++ mmotm-2.6.31-Jun25/include/linux/cgroup.h
@@ -366,17 +366,18 @@ int cgroup_task_count(const struct cgrou
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
/*
- * Allow to use CGRP_WAIT_ON_RMDIR flag to check race with rmdir() for subsys.
- * Subsys can call this function if it's necessary to call pre_destroy() again
- * because it adds not-temporary refs to css after or while pre_destroy().
- * The caller of this function should use css_tryget(), too.
+ * When the subsys has to access css and may add permanent refcnt to css,
+ * it should take care of racy conditions with rmdir(). Following set of
+ * functions, is for stop/restart rmdir if necessary.
+ * Because these will call css_get/put, "css" should be alive css.
+ *
+ * cgroup_exclude_rmdir();
+ * ...do some jobs which may access arbitrary empty cgroup
+ * cgroup_release_rmdir();
*/
-void __cgroup_wakeup_rmdir_waiters(void);
-static inline void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- __cgroup_wakeup_rmdir_waiters();
-}
+
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
+void cgroup_release_rmdir(struct cgroup_subsys_state *css);
/*
* Control Group subsystem type.
Index: mmotm-2.6.31-Jun25/kernel/cgroup.c
===================================================================
--- mmotm-2.6.31-Jun25.orig/kernel/cgroup.c
+++ mmotm-2.6.31-Jun25/kernel/cgroup.c
@@ -738,11 +738,24 @@ static void cgroup_d_remove_dir(struct d
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-void __cgroup_wakeup_rmdir_waiters(void)
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
{
- wake_up_all(&cgroup_rmdir_waitq);
+ if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+ wake_up_all(&cgroup_rmdir_waitq);
}
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+ css_get(css);
+}
+
+void cgroup_release_rmdir(struct cgroup_subsys_state *css)
+{
+ cgroup_wakeup_rmdir_waiter(css->cgroup);
+ css_put(css);
+}
+
+
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
Index: mmotm-2.6.31-Jun25/mm/memcontrol.c
===================================================================
--- mmotm-2.6.31-Jun25.orig/mm/memcontrol.c
+++ mmotm-2.6.31-Jun25/mm/memcontrol.c
@@ -1461,7 +1461,7 @@ __mem_cgroup_commit_charge_swapin(struct
return;
if (!ptr)
return;
- css_get(&ptr->css);
+ cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
__mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1496,9 +1496,7 @@ __mem_cgroup_commit_charge_swapin(struct
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
- cgroup_wakeup_rmdir_waiter(ptr->css.cgroup);
- css_put(&ptr->css);
-
+ cgroup_release_rmdir(&ptr->css);
}
void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
@@ -1704,7 +1702,7 @@ void mem_cgroup_end_migration(struct mem
if (!mem)
return;
- css_get(&mem->css);
+ cgroup_exclude_rmdir(&mem->css);
/* at migration success, oldpage->mapping is NULL. */
if (oldpage->mapping) {
target = oldpage;
@@ -1749,9 +1747,7 @@ void mem_cgroup_end_migration(struct mem
* So, rmdir()->pre_destroy() can be called while we do this charge.
* In that case, we need to call pre_destroy() again. check it here.
*/
- cgroup_wakeup_rmdir_waiter(mem->css.cgroup);
- css_put(&mem->css);
-
+ cgroup_release_rmdir(&mem->css);
}
/*
On Tue, Jun 30, 2009 at 2:03 AM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Paul Menage pointed out that css_get()/put() only for avoiding race with
> rmdir() is complicated and these should be treated as it is for.
>
> This adds
> ? - cgroup_exclude_rmdir() ....prevent rmdir() for a while.
> ? - cgroup_release_rmdir() ....rerun rmdir() if necessary.
> And hides cgroup_wakeup_rmdir_waiter() into kernel/cgroup.c, again.
Wouldn't it be better to merge these into a single patch? Having one
patch that exposes complexity only to take it away in the following
patch seems unnecessary; the combined patch would be simpler than the
constituents.
Paul
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
> ---
> ?include/linux/cgroup.h | ? 21 +++++++++++----------
> ?kernel/cgroup.c ? ? ? ?| ? 17 +++++++++++++++--
> ?mm/memcontrol.c ? ? ? ?| ? 12 ++++--------
> ?3 files changed, 30 insertions(+), 20 deletions(-)
>
> Index: mmotm-2.6.31-Jun25/include/linux/cgroup.h
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/include/linux/cgroup.h
> +++ mmotm-2.6.31-Jun25/include/linux/cgroup.h
> @@ -366,17 +366,18 @@ int cgroup_task_count(const struct cgrou
> ?int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
>
> ?/*
> - * Allow to use CGRP_WAIT_ON_RMDIR flag to check race with rmdir() for subsys.
> - * Subsys can call this function if it's necessary to call pre_destroy() again
> - * because it adds not-temporary refs to css after or while pre_destroy().
> - * The caller of this function should use css_tryget(), too.
> + * When the subsys has to access css and may add permanent refcnt to css,
> + * it should take care of racy conditions with rmdir(). Following set of
> + * functions, is for stop/restart rmdir if necessary.
> + * Because these will call css_get/put, "css" should be alive css.
> + *
> + * ?cgroup_exclude_rmdir();
> + * ?...do some jobs which may access arbitrary empty cgroup
> + * ?cgroup_release_rmdir();
> ?*/
> -void __cgroup_wakeup_rmdir_waiters(void);
> -static inline void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
> -{
> - ? ? ? if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> - ? ? ? ? ? ? ? __cgroup_wakeup_rmdir_waiters();
> -}
> +
> +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
> +void cgroup_release_rmdir(struct cgroup_subsys_state *css);
>
> ?/*
> ?* Control Group subsystem type.
> Index: mmotm-2.6.31-Jun25/kernel/cgroup.c
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/kernel/cgroup.c
> +++ mmotm-2.6.31-Jun25/kernel/cgroup.c
> @@ -738,11 +738,24 @@ static void cgroup_d_remove_dir(struct d
> ?*/
> ?DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
>
> -void __cgroup_wakeup_rmdir_waiters(void)
> +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
> ?{
> - ? ? ? wake_up_all(&cgroup_rmdir_waitq);
> + ? ? ? if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> + ? ? ? ? ? ? ? wake_up_all(&cgroup_rmdir_waitq);
> ?}
>
> +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
> +{
> + ? ? ? css_get(css);
> +}
> +
> +void cgroup_release_rmdir(struct cgroup_subsys_state *css)
> +{
> + ? ? ? cgroup_wakeup_rmdir_waiter(css->cgroup);
> + ? ? ? css_put(css);
> +}
> +
> +
> ?static int rebind_subsystems(struct cgroupfs_root *root,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?unsigned long final_bits)
> ?{
> Index: mmotm-2.6.31-Jun25/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Jun25/mm/memcontrol.c
> @@ -1461,7 +1461,7 @@ __mem_cgroup_commit_charge_swapin(struct
> ? ? ? ? ? ? ? ?return;
> ? ? ? ?if (!ptr)
> ? ? ? ? ? ? ? ?return;
> - ? ? ? css_get(&ptr->css);
> + ? ? ? cgroup_exclude_rmdir(&ptr->css);
> ? ? ? ?pc = lookup_page_cgroup(page);
> ? ? ? ?mem_cgroup_lru_del_before_commit_swapcache(page);
> ? ? ? ?__mem_cgroup_commit_charge(ptr, pc, ctype);
> @@ -1496,9 +1496,7 @@ __mem_cgroup_commit_charge_swapin(struct
> ? ? ? ? * So, rmdir()->pre_destroy() can be called while we do this charge.
> ? ? ? ? * In that case, we need to call pre_destroy() again. check it here.
> ? ? ? ? */
> - ? ? ? cgroup_wakeup_rmdir_waiter(ptr->css.cgroup);
> - ? ? ? css_put(&ptr->css);
> -
> + ? ? ? cgroup_release_rmdir(&ptr->css);
> ?}
>
> ?void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
> @@ -1704,7 +1702,7 @@ void mem_cgroup_end_migration(struct mem
>
> ? ? ? ?if (!mem)
> ? ? ? ? ? ? ? ?return;
> - ? ? ? css_get(&mem->css);
> + ? ? ? cgroup_exclude_rmdir(&mem->css);
> ? ? ? ?/* at migration success, oldpage->mapping is NULL. */
> ? ? ? ?if (oldpage->mapping) {
> ? ? ? ? ? ? ? ?target = oldpage;
> @@ -1749,9 +1747,7 @@ void mem_cgroup_end_migration(struct mem
> ? ? ? ? * So, rmdir()->pre_destroy() can be called while we do this charge.
> ? ? ? ? * In that case, we need to call pre_destroy() again. check it here.
> ? ? ? ? */
> - ? ? ? cgroup_wakeup_rmdir_waiter(mem->css.cgroup);
> - ? ? ? css_put(&mem->css);
> -
> + ? ? ? cgroup_release_rmdir(&mem->css);
> ?}
>
> ?/*
>
>
On Tue, 30 Jun 2009 02:15:03 -0700
Paul Menage <[email protected]> wrote:
> On Tue, Jun 30, 2009 at 2:03 AM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
> > From: KAMEZAWA Hiroyuki <[email protected]>
> >
> > Paul Menage pointed out that css_get()/put() only for avoiding race with
> > rmdir() is complicated and these should be treated as it is for.
> >
> > This adds
> > - cgroup_exclude_rmdir() ....prevent rmdir() for a while.
> > - cgroup_release_rmdir() ....rerun rmdir() if necessary.
> > And hides cgroup_wakeup_rmdir_waiter() into kernel/cgroup.c, again.
>
> Wouldn't it be better to merge these into a single patch? Having one
> patch that exposes complexity only to take it away in the following
> patch seems unnecessary; the combined patch would be simpler than the
> constituents.
>
This patch is _not_ tested by Nishimura.
What I want is patch 1/2, it's BUGFIX and passed tests by him.
I trust his test very much.
I want the patch 1/2 should be on fast-path as BUGFIX.
But, I think this patch 2/2 is not for fast-path.
This is something new but just a refactoring.
Anyway, I can postpone this until things are settled. Only merging patch 1/2
is okay for me now.
Thanks,
-Kame
> Paul
>
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> >
> > ---
> > include/linux/cgroup.h | 21 +++++++++++----------
> > kernel/cgroup.c | 17 +++++++++++++++--
> > mm/memcontrol.c | 12 ++++--------
> > 3 files changed, 30 insertions(+), 20 deletions(-)
> >
> > Index: mmotm-2.6.31-Jun25/include/linux/cgroup.h
> > ===================================================================
> > --- mmotm-2.6.31-Jun25.orig/include/linux/cgroup.h
> > +++ mmotm-2.6.31-Jun25/include/linux/cgroup.h
> > @@ -366,17 +366,18 @@ int cgroup_task_count(const struct cgrou
> > int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
> >
> > /*
> > - * Allow to use CGRP_WAIT_ON_RMDIR flag to check race with rmdir() for subsys.
> > - * Subsys can call this function if it's necessary to call pre_destroy() again
> > - * because it adds not-temporary refs to css after or while pre_destroy().
> > - * The caller of this function should use css_tryget(), too.
> > + * When the subsys has to access css and may add permanent refcnt to css,
> > + * it should take care of racy conditions with rmdir(). Following set of
> > + * functions, is for stop/restart rmdir if necessary.
> > + * Because these will call css_get/put, "css" should be alive css.
> > + *
> > + * cgroup_exclude_rmdir();
> > + * ...do some jobs which may access arbitrary empty cgroup
> > + * cgroup_release_rmdir();
> > */
> > -void __cgroup_wakeup_rmdir_waiters(void);
> > -static inline void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
> > -{
> > - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> > - __cgroup_wakeup_rmdir_waiters();
> > -}
> > +
> > +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
> > +void cgroup_release_rmdir(struct cgroup_subsys_state *css);
> >
> > /*
> > * Control Group subsystem type.
> > Index: mmotm-2.6.31-Jun25/kernel/cgroup.c
> > ===================================================================
> > --- mmotm-2.6.31-Jun25.orig/kernel/cgroup.c
> > +++ mmotm-2.6.31-Jun25/kernel/cgroup.c
> > @@ -738,11 +738,24 @@ static void cgroup_d_remove_dir(struct d
> > */
> > DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
> >
> > -void __cgroup_wakeup_rmdir_waiters(void)
> > +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
> > {
> > - wake_up_all(&cgroup_rmdir_waitq);
> > + if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> > + wake_up_all(&cgroup_rmdir_waitq);
> > }
> >
> > +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
> > +{
> > + css_get(css);
> > +}
> > +
> > +void cgroup_release_rmdir(struct cgroup_subsys_state *css)
> > +{
> > + cgroup_wakeup_rmdir_waiter(css->cgroup);
> > + css_put(css);
> > +}
> > +
> > +
> > static int rebind_subsystems(struct cgroupfs_root *root,
> > unsigned long final_bits)
> > {
> > Index: mmotm-2.6.31-Jun25/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-2.6.31-Jun25.orig/mm/memcontrol.c
> > +++ mmotm-2.6.31-Jun25/mm/memcontrol.c
> > @@ -1461,7 +1461,7 @@ __mem_cgroup_commit_charge_swapin(struct
> > return;
> > if (!ptr)
> > return;
> > - css_get(&ptr->css);
> > + cgroup_exclude_rmdir(&ptr->css);
> > pc = lookup_page_cgroup(page);
> > mem_cgroup_lru_del_before_commit_swapcache(page);
> > __mem_cgroup_commit_charge(ptr, pc, ctype);
> > @@ -1496,9 +1496,7 @@ __mem_cgroup_commit_charge_swapin(struct
> > * So, rmdir()->pre_destroy() can be called while we do this charge.
> > * In that case, we need to call pre_destroy() again. check it here.
> > */
> > - cgroup_wakeup_rmdir_waiter(ptr->css.cgroup);
> > - css_put(&ptr->css);
> > -
> > + cgroup_release_rmdir(&ptr->css);
> > }
> >
> > void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
> > @@ -1704,7 +1702,7 @@ void mem_cgroup_end_migration(struct mem
> >
> > if (!mem)
> > return;
> > - css_get(&mem->css);
> > + cgroup_exclude_rmdir(&mem->css);
> > /* at migration success, oldpage->mapping is NULL. */
> > if (oldpage->mapping) {
> > target = oldpage;
> > @@ -1749,9 +1747,7 @@ void mem_cgroup_end_migration(struct mem
> > * So, rmdir()->pre_destroy() can be called while we do this charge.
> > * In that case, we need to call pre_destroy() again. check it here.
> > */
> > - cgroup_wakeup_rmdir_waiter(mem->css.cgroup);
> > - css_put(&mem->css);
> > -
> > + cgroup_release_rmdir(&mem->css);
> > }
> >
> > /*
> >
> >
>
On Tue, Jun 30, 2009 at 2:23 AM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
> This patch is _not_ tested by Nishimura.
True, but it's functionally identical to, and simpler than, the one
that was tested.
Paul
On Tue, 30 Jun 2009 09:18:03 -0700, Paul Menage <[email protected]> wrote:
> On Tue, Jun 30, 2009 at 2:23 AM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
> > This patch is _not_ tested by Nishimura.
>
> True, but it's functionally identical to, and simpler than, the one
> that was tested.
>
I agree.
I'll test with both of these patches folded.
Daisuke Nishimura.
On Wed, 1 Jul 2009 08:40:37 +0900
Daisuke Nishimura <[email protected]> wrote:
> On Tue, 30 Jun 2009 09:18:03 -0700, Paul Menage <[email protected]> wrote:
> > On Tue, Jun 30, 2009 at 2:23 AM, KAMEZAWA
> > Hiroyuki<[email protected]> wrote:
> > > This patch is _not_ tested by Nishimura.
> >
> > True, but it's functionally identical to, and simpler than, the one
> > that was tested.
> >
> I agree.
> I'll test with both of these patches folded.
>
Hm,ok. I'll post merged one today.
But I don't like cosmeticized bugfix patch ;(
-Kame
>
> Daisuke Nishimura.
>
On Tue, Jun 30, 2009 at 5:09 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
> On Wed, 1 Jul 2009 08:40:37 +0900
> Daisuke Nishimura <[email protected]> wrote:
>
>> On Tue, 30 Jun 2009 09:18:03 -0700, Paul Menage <[email protected]> wrote:
>> > On Tue, Jun 30, 2009 at 2:23 AM, KAMEZAWA
>> > Hiroyuki<[email protected]> wrote:
>> > > This patch is _not_ tested by Nishimura.
>> >
>> > True, but it's functionally identical to, and simpler than, the one
>> > that was tested.
>> >
>> I agree.
>> I'll test with both of these patches folded.
>>
> Hm,ok. I'll post merged one today.
> But I don't like cosmeticized bugfix patch ;(
>
It only looks "cosmeticized" because of the evolution of your fix. The
first patch added a new function that exposed internal details of
cgroups, and the second patch removes the addition in favour of a
different new function that doesn't expose internal details as much; a
single patch that just adds the simpler new function is easier to
judge as intuitively correct (separately from Daisuke's testing) than
one that exposes more internal details.
Paul
On Tue, 30 Jun 2009 17:27:02 -0700
Paul Menage <[email protected]> wrote:
> It only looks "cosmeticized" because of the evolution of your fix. The
> first patch added a new function that exposed internal details of
> cgroups, and the second patch removes the addition in favour of a
> different new function that doesn't expose internal details as much; a
> single patch that just adds the simpler new function is easier to
> judge as intuitively correct (separately from Daisuke's testing) than
> one that exposes more internal details.
>
ok, I'll post again.
BTW, do you have patches for NOOP/signal cgroup we discussed a half year ago ?
Thanks,
-Kame
> Paul
>
ok, here.
==
From: KAMEZAWA Hiroyuki <[email protected]>
After commit: cgroup: fix frequent -EBUSY at rmdir
ec64f51545fffbc4cb968f0cea56341a4b07e85a
cgroup's rmdir (especially against memcg) doesn't return -EBUSY
by temporal ref counts. That commit expects all refs after pre_destroy()
is temporary but...it wasn't. Then, rmdir can wait permanently.
This patch tries to fix that and change followings.
- set CGRP_WAIT_ON_RMDIR flag before pre_destroy().
- clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case.
if there are sleeping ones, wakes them up.
- rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set.
Changelog v4->v5:
- added cgroup_exclude_rmdir(), cgroup_release_rmdir().
Changelog v3->v4:
- rewrite/add comments.
- remane cgroup_wakeup_rmdir_waiters() to cgroup_wakeup_rmdir_waiter().
Changelog v2->v3:
- removed retry_rmdir() callback.
- make use of CGRP_WAIT_ON_RMDIR flag more.
Reported-by: Daisuke Nishimura <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/cgroup.h | 14 ++++++++++++
kernel/cgroup.c | 55 +++++++++++++++++++++++++++++++++----------------
mm/memcontrol.c | 23 +++++++++++++++++---
3 files changed, 72 insertions(+), 20 deletions(-)
Index: mmotm-2.6.31-Jun25/include/linux/cgroup.h
===================================================================
--- mmotm-2.6.31-Jun25.orig/include/linux/cgroup.h
+++ mmotm-2.6.31-Jun25/include/linux/cgroup.h
@@ -366,6 +366,20 @@ int cgroup_task_count(const struct cgrou
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
/*
+ * When the subsys has to access css and may add permanent refcnt to css,
+ * it should take care of racy conditions with rmdir(). Following set of
+ * functions, is for stop/restart rmdir if necessary.
+ * Because these will call css_get/put, "css" should be alive css.
+ *
+ * cgroup_exclude_rmdir();
+ * ...do some jobs which may access arbitrary empty cgroup
+ * cgroup_release_rmdir();
+ */
+
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
+void cgroup_release_rmdir(struct cgroup_subsys_state *css);
+
+/*
* Control Group subsystem type.
* See Documentation/cgroups/cgroups.txt for details
*/
Index: mmotm-2.6.31-Jun25/kernel/cgroup.c
===================================================================
--- mmotm-2.6.31-Jun25.orig/kernel/cgroup.c
+++ mmotm-2.6.31-Jun25/kernel/cgroup.c
@@ -734,16 +734,28 @@ static void cgroup_d_remove_dir(struct d
* reference to css->refcnt. In general, this refcnt is expected to goes down
* to zero, soon.
*
- * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
{
- if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+ if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
wake_up_all(&cgroup_rmdir_waitq);
}
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+ css_get(css);
+}
+
+void cgroup_release_rmdir(struct cgroup_subsys_state *css)
+{
+ cgroup_wakeup_rmdir_waiter(css->cgroup);
+ css_put(css);
+}
+
+
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
@@ -1357,7 +1369,7 @@ int cgroup_attach_task(struct cgroup *cg
* wake up rmdir() waiter. the rmdir should fail since the cgroup
* is no longer empty.
*/
- cgroup_wakeup_rmdir_waiters(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
return 0;
}
@@ -2696,33 +2708,42 @@ again:
mutex_unlock(&cgroup_mutex);
/*
+ * In general, subsystem has no css->refcnt after pre_destroy(). But
+ * in racy cases, subsystem may have to get css->refcnt after
+ * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
+ * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
+ * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
+ * and subsystem's reference count handling. Please see css_get/put
+ * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ */
+ set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+ /*
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
ret = cgroup_call_pre_destroy(cgrp);
- if (ret)
+ if (ret) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
return ret;
+ }
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+ clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
- /*
- * css_put/get is provided for subsys to grab refcnt to css. In typical
- * case, subsystem has no reference after pre_destroy(). But, under
- * hierarchy management, some *temporal* refcnt can be hold.
- * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
- * is really busy, it should return -EBUSY at pre_destroy(). wake_up
- * is called when css_put() is called and refcnt goes down to 0.
- */
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
-
if (!cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
- schedule();
+ /*
+ * Because someone may call cgroup_wakeup_rmdir_waiter() before
+ * prepare_to_wait(), we need to check this flag.
+ */
+ if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
+ schedule();
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
if (signal_pending(current))
@@ -3294,7 +3315,7 @@ void __css_put(struct cgroup_subsys_stat
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiters(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
}
rcu_read_unlock();
}
Index: mmotm-2.6.31-Jun25/mm/memcontrol.c
===================================================================
--- mmotm-2.6.31-Jun25.orig/mm/memcontrol.c
+++ mmotm-2.6.31-Jun25/mm/memcontrol.c
@@ -1234,6 +1234,12 @@ static int mem_cgroup_move_account(struc
ret = 0;
out:
unlock_page_cgroup(pc);
+ /*
+ * We charges against "to" which may not have any tasks. Then, "to"
+ * can be under rmdir(). But in current implementation, caller of
+ * this function is just force_empty() and it's garanteed that
+ * "to" is never removed. So, we don't check rmdir status here.
+ */
return ret;
}
@@ -1455,6 +1461,7 @@ __mem_cgroup_commit_charge_swapin(struct
return;
if (!ptr)
return;
+ cgroup_exclude_rmdir(&ptr->css);
pc = lookup_page_cgroup(page);
mem_cgroup_lru_del_before_commit_swapcache(page);
__mem_cgroup_commit_charge(ptr, pc, ctype);
@@ -1484,8 +1491,12 @@ __mem_cgroup_commit_charge_swapin(struct
}
rcu_read_unlock();
}
- /* add this page(page_cgroup) to the LRU we want. */
-
+ /*
+ * At swapin, we may charge account against cgroup which has no tasks.
+ * So, rmdir()->pre_destroy() can be called while we do this charge.
+ * In that case, we need to call pre_destroy() again. check it here.
+ */
+ cgroup_release_rmdir(&ptr->css);
}
void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
@@ -1691,7 +1702,7 @@ void mem_cgroup_end_migration(struct mem
if (!mem)
return;
-
+ cgroup_exclude_rmdir(&mem->css);
/* at migration success, oldpage->mapping is NULL. */
if (oldpage->mapping) {
target = oldpage;
@@ -1731,6 +1742,12 @@ void mem_cgroup_end_migration(struct mem
*/
if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
mem_cgroup_uncharge_page(target);
+ /*
+ * At migration, we may charge account against cgroup which has no tasks
+ * So, rmdir()->pre_destroy() can be called while we do this charge.
+ * In that case, we need to call pre_destroy() again. check it here.
+ */
+ cgroup_release_rmdir(&mem->css);
}
/*
On Tue, Jun 30, 2009 at 6:47 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
> ok, here.
>
> ==
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> After commit: cgroup: fix frequent -EBUSY at rmdir
> ? ? ? ? ? ? ?ec64f51545fffbc4cb968f0cea56341a4b07e85a
> cgroup's rmdir (especially against memcg) doesn't return -EBUSY
> by temporal ref counts. That commit expects all refs after pre_destroy()
> is temporary but...it wasn't. Then, rmdir can wait permanently.
> This patch tries to fix that and change followings.
>
> ?- set CGRP_WAIT_ON_RMDIR flag before pre_destroy().
> ?- clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case.
> ? if there are sleeping ones, wakes them up.
> ?- rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set.
>
> Changelog v4->v5:
> ?- added cgroup_exclude_rmdir(), cgroup_release_rmdir().
>
> Changelog v3->v4:
> ?- rewrite/add comments.
> ?- remane cgroup_wakeup_rmdir_waiters() to cgroup_wakeup_rmdir_waiter().
> Changelog v2->v3:
> ?- removed retry_rmdir() callback.
> ?- make use of CGRP_WAIT_ON_RMDIR flag more.
>
> Reported-by: Daisuke Nishimura <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
Reviewed-by: Paul Menage <[email protected]>
Looks great, thanks.
Paul
> ---
> ?include/linux/cgroup.h | ? 14 ++++++++++++
> ?kernel/cgroup.c ? ? ? ?| ? 55 +++++++++++++++++++++++++++++++++----------------
> ?mm/memcontrol.c ? ? ? ?| ? 23 +++++++++++++++++---
> ?3 files changed, 72 insertions(+), 20 deletions(-)
>
> Index: mmotm-2.6.31-Jun25/include/linux/cgroup.h
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/include/linux/cgroup.h
> +++ mmotm-2.6.31-Jun25/include/linux/cgroup.h
> @@ -366,6 +366,20 @@ int cgroup_task_count(const struct cgrou
> ?int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
>
> ?/*
> + * When the subsys has to access css and may add permanent refcnt to css,
> + * it should take care of racy conditions with rmdir(). Following set of
> + * functions, is for stop/restart rmdir if necessary.
> + * Because these will call css_get/put, "css" should be alive css.
> + *
> + * ?cgroup_exclude_rmdir();
> + * ?...do some jobs which may access arbitrary empty cgroup
> + * ?cgroup_release_rmdir();
> + */
> +
> +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
> +void cgroup_release_rmdir(struct cgroup_subsys_state *css);
> +
> +/*
> ?* Control Group subsystem type.
> ?* See Documentation/cgroups/cgroups.txt for details
> ?*/
> Index: mmotm-2.6.31-Jun25/kernel/cgroup.c
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/kernel/cgroup.c
> +++ mmotm-2.6.31-Jun25/kernel/cgroup.c
> @@ -734,16 +734,28 @@ static void cgroup_d_remove_dir(struct d
> ?* reference to css->refcnt. In general, this refcnt is expected to goes down
> ?* to zero, soon.
> ?*
> - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
> + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
> ?*/
> ?DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
>
> -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
> +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
> ?{
> - ? ? ? if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> + ? ? ? if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> ? ? ? ? ? ? ? ?wake_up_all(&cgroup_rmdir_waitq);
> ?}
>
> +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
> +{
> + ? ? ? css_get(css);
> +}
> +
> +void cgroup_release_rmdir(struct cgroup_subsys_state *css)
> +{
> + ? ? ? cgroup_wakeup_rmdir_waiter(css->cgroup);
> + ? ? ? css_put(css);
> +}
> +
> +
> ?static int rebind_subsystems(struct cgroupfs_root *root,
> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?unsigned long final_bits)
> ?{
> @@ -1357,7 +1369,7 @@ int cgroup_attach_task(struct cgroup *cg
> ? ? ? ? * wake up rmdir() waiter. the rmdir should fail since the cgroup
> ? ? ? ? * is no longer empty.
> ? ? ? ? */
> - ? ? ? cgroup_wakeup_rmdir_waiters(cgrp);
> + ? ? ? cgroup_wakeup_rmdir_waiter(cgrp);
> ? ? ? ?return 0;
> ?}
>
> @@ -2696,33 +2708,42 @@ again:
> ? ? ? ?mutex_unlock(&cgroup_mutex);
>
> ? ? ? ?/*
> + ? ? ? ?* In general, subsystem has no css->refcnt after pre_destroy(). But
> + ? ? ? ?* in racy cases, subsystem may have to get css->refcnt after
> + ? ? ? ?* pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
> + ? ? ? ?* make rmdir return -EBUSY too often. To avoid that, we use waitqueue
> + ? ? ? ?* for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
> + ? ? ? ?* and subsystem's reference count handling. Please see css_get/put
> + ? ? ? ?* and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
> + ? ? ? ?*/
> + ? ? ? set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> +
> + ? ? ? /*
> ? ? ? ? * Call pre_destroy handlers of subsys. Notify subsystems
> ? ? ? ? * that rmdir() request comes.
> ? ? ? ? */
> ? ? ? ?ret = cgroup_call_pre_destroy(cgrp);
> - ? ? ? if (ret)
> + ? ? ? if (ret) {
> + ? ? ? ? ? ? ? clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> ? ? ? ? ? ? ? ?return ret;
> + ? ? ? }
>
> ? ? ? ?mutex_lock(&cgroup_mutex);
> ? ? ? ?parent = cgrp->parent;
> ? ? ? ?if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
> + ? ? ? ? ? ? ? clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> ? ? ? ? ? ? ? ?mutex_unlock(&cgroup_mutex);
> ? ? ? ? ? ? ? ?return -EBUSY;
> ? ? ? ?}
> - ? ? ? /*
> - ? ? ? ?* css_put/get is provided for subsys to grab refcnt to css. In typical
> - ? ? ? ?* case, subsystem has no reference after pre_destroy(). But, under
> - ? ? ? ?* hierarchy management, some *temporal* refcnt can be hold.
> - ? ? ? ?* To avoid returning -EBUSY to a user, waitqueue is used. If subsys
> - ? ? ? ?* is really busy, it should return -EBUSY at pre_destroy(). wake_up
> - ? ? ? ?* is called when css_put() is called and refcnt goes down to 0.
> - ? ? ? ?*/
> - ? ? ? set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> ? ? ? ?prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
> -
> ? ? ? ?if (!cgroup_clear_css_refs(cgrp)) {
> ? ? ? ? ? ? ? ?mutex_unlock(&cgroup_mutex);
> - ? ? ? ? ? ? ? schedule();
> + ? ? ? ? ? ? ? /*
> + ? ? ? ? ? ? ? ?* Because someone may call cgroup_wakeup_rmdir_waiter() before
> + ? ? ? ? ? ? ? ?* prepare_to_wait(), we need to check this flag.
> + ? ? ? ? ? ? ? ?*/
> + ? ? ? ? ? ? ? if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
> + ? ? ? ? ? ? ? ? ? ? ? schedule();
> ? ? ? ? ? ? ? ?finish_wait(&cgroup_rmdir_waitq, &wait);
> ? ? ? ? ? ? ? ?clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> ? ? ? ? ? ? ? ?if (signal_pending(current))
> @@ -3294,7 +3315,7 @@ void __css_put(struct cgroup_subsys_stat
> ? ? ? ? ? ? ? ? ? ? ? ?set_bit(CGRP_RELEASABLE, &cgrp->flags);
> ? ? ? ? ? ? ? ? ? ? ? ?check_for_release(cgrp);
> ? ? ? ? ? ? ? ?}
> - ? ? ? ? ? ? ? cgroup_wakeup_rmdir_waiters(cgrp);
> + ? ? ? ? ? ? ? cgroup_wakeup_rmdir_waiter(cgrp);
> ? ? ? ?}
> ? ? ? ?rcu_read_unlock();
> ?}
> Index: mmotm-2.6.31-Jun25/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Jun25/mm/memcontrol.c
> @@ -1234,6 +1234,12 @@ static int mem_cgroup_move_account(struc
> ? ? ? ?ret = 0;
> ?out:
> ? ? ? ?unlock_page_cgroup(pc);
> + ? ? ? /*
> + ? ? ? ?* We charges against "to" which may not have any tasks. Then, "to"
> + ? ? ? ?* can be under rmdir(). But in current implementation, caller of
> + ? ? ? ?* this function is just force_empty() and it's garanteed that
> + ? ? ? ?* "to" is never removed. So, we don't check rmdir status here.
> + ? ? ? ?*/
> ? ? ? ?return ret;
> ?}
>
> @@ -1455,6 +1461,7 @@ __mem_cgroup_commit_charge_swapin(struct
> ? ? ? ? ? ? ? ?return;
> ? ? ? ?if (!ptr)
> ? ? ? ? ? ? ? ?return;
> + ? ? ? cgroup_exclude_rmdir(&ptr->css);
> ? ? ? ?pc = lookup_page_cgroup(page);
> ? ? ? ?mem_cgroup_lru_del_before_commit_swapcache(page);
> ? ? ? ?__mem_cgroup_commit_charge(ptr, pc, ctype);
> @@ -1484,8 +1491,12 @@ __mem_cgroup_commit_charge_swapin(struct
> ? ? ? ? ? ? ? ?}
> ? ? ? ? ? ? ? ?rcu_read_unlock();
> ? ? ? ?}
> - ? ? ? /* add this page(page_cgroup) to the LRU we want. */
> -
> + ? ? ? /*
> + ? ? ? ?* At swapin, we may charge account against cgroup which has no tasks.
> + ? ? ? ?* So, rmdir()->pre_destroy() can be called while we do this charge.
> + ? ? ? ?* In that case, we need to call pre_destroy() again. check it here.
> + ? ? ? ?*/
> + ? ? ? cgroup_release_rmdir(&ptr->css);
> ?}
>
> ?void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
> @@ -1691,7 +1702,7 @@ void mem_cgroup_end_migration(struct mem
>
> ? ? ? ?if (!mem)
> ? ? ? ? ? ? ? ?return;
> -
> + ? ? ? cgroup_exclude_rmdir(&mem->css);
> ? ? ? ?/* at migration success, oldpage->mapping is NULL. */
> ? ? ? ?if (oldpage->mapping) {
> ? ? ? ? ? ? ? ?target = oldpage;
> @@ -1731,6 +1742,12 @@ void mem_cgroup_end_migration(struct mem
> ? ? ? ? */
> ? ? ? ?if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
> ? ? ? ? ? ? ? ?mem_cgroup_uncharge_page(target);
> + ? ? ? /*
> + ? ? ? ?* At migration, we may charge account against cgroup which has no tasks
> + ? ? ? ?* So, rmdir()->pre_destroy() can be called while we do this charge.
> + ? ? ? ?* In that case, we need to call pre_destroy() again. check it here.
> + ? ? ? ?*/
> + ? ? ? cgroup_release_rmdir(&mem->css);
> ?}
>
> ?/*
>
>
On Tue, Jun 30, 2009 at 6:04 PM, KAMEZAWA
Hiroyuki<[email protected]> wrote:
>
> BTW, do you have patches for NOOP/signal cgroup we discussed a half year ago ?
>
Yes - very nearly ready. They were sitting gathering dust for a while,
but I've just been polishing them up again this week and am planning
to send them out this week or next.
Paul
On Wed, 1 Jul 2009 10:47:47 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> ok, here.
>
> ==
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> After commit: cgroup: fix frequent -EBUSY at rmdir
> ec64f51545fffbc4cb968f0cea56341a4b07e85a
> cgroup's rmdir (especially against memcg) doesn't return -EBUSY
> by temporal ref counts. That commit expects all refs after pre_destroy()
> is temporary but...it wasn't. Then, rmdir can wait permanently.
> This patch tries to fix that and change followings.
>
> - set CGRP_WAIT_ON_RMDIR flag before pre_destroy().
> - clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case.
> if there are sleeping ones, wakes them up.
> - rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set.
>
> Changelog v4->v5:
> - added cgroup_exclude_rmdir(), cgroup_release_rmdir().
>
> Changelog v3->v4:
> - rewrite/add comments.
> - remane cgroup_wakeup_rmdir_waiters() to cgroup_wakeup_rmdir_waiter().
> Changelog v2->v3:
> - removed retry_rmdir() callback.
> - make use of CGRP_WAIT_ON_RMDIR flag more.
>
> Reported-by: Daisuke Nishimura <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
It works very well.
I believe it's ready to go :)
Tested-by: Daisuke Nishimura <[email protected]>
Daisuke Nishimura
> ---
> include/linux/cgroup.h | 14 ++++++++++++
> kernel/cgroup.c | 55 +++++++++++++++++++++++++++++++++----------------
> mm/memcontrol.c | 23 +++++++++++++++++---
> 3 files changed, 72 insertions(+), 20 deletions(-)
>
> Index: mmotm-2.6.31-Jun25/include/linux/cgroup.h
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/include/linux/cgroup.h
> +++ mmotm-2.6.31-Jun25/include/linux/cgroup.h
> @@ -366,6 +366,20 @@ int cgroup_task_count(const struct cgrou
> int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
>
> /*
> + * When the subsys has to access css and may add permanent refcnt to css,
> + * it should take care of racy conditions with rmdir(). Following set of
> + * functions, is for stop/restart rmdir if necessary.
> + * Because these will call css_get/put, "css" should be alive css.
> + *
> + * cgroup_exclude_rmdir();
> + * ...do some jobs which may access arbitrary empty cgroup
> + * cgroup_release_rmdir();
> + */
> +
> +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
> +void cgroup_release_rmdir(struct cgroup_subsys_state *css);
> +
> +/*
> * Control Group subsystem type.
> * See Documentation/cgroups/cgroups.txt for details
> */
> Index: mmotm-2.6.31-Jun25/kernel/cgroup.c
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/kernel/cgroup.c
> +++ mmotm-2.6.31-Jun25/kernel/cgroup.c
> @@ -734,16 +734,28 @@ static void cgroup_d_remove_dir(struct d
> * reference to css->refcnt. In general, this refcnt is expected to goes down
> * to zero, soon.
> *
> - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
> + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
> */
> DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
>
> -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
> +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
> {
> - if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> + if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> wake_up_all(&cgroup_rmdir_waitq);
> }
>
> +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
> +{
> + css_get(css);
> +}
> +
> +void cgroup_release_rmdir(struct cgroup_subsys_state *css)
> +{
> + cgroup_wakeup_rmdir_waiter(css->cgroup);
> + css_put(css);
> +}
> +
> +
> static int rebind_subsystems(struct cgroupfs_root *root,
> unsigned long final_bits)
> {
> @@ -1357,7 +1369,7 @@ int cgroup_attach_task(struct cgroup *cg
> * wake up rmdir() waiter. the rmdir should fail since the cgroup
> * is no longer empty.
> */
> - cgroup_wakeup_rmdir_waiters(cgrp);
> + cgroup_wakeup_rmdir_waiter(cgrp);
> return 0;
> }
>
> @@ -2696,33 +2708,42 @@ again:
> mutex_unlock(&cgroup_mutex);
>
> /*
> + * In general, subsystem has no css->refcnt after pre_destroy(). But
> + * in racy cases, subsystem may have to get css->refcnt after
> + * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
> + * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
> + * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
> + * and subsystem's reference count handling. Please see css_get/put
> + * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
> + */
> + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> +
> + /*
> * Call pre_destroy handlers of subsys. Notify subsystems
> * that rmdir() request comes.
> */
> ret = cgroup_call_pre_destroy(cgrp);
> - if (ret)
> + if (ret) {
> + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> return ret;
> + }
>
> mutex_lock(&cgroup_mutex);
> parent = cgrp->parent;
> if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
> + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> mutex_unlock(&cgroup_mutex);
> return -EBUSY;
> }
> - /*
> - * css_put/get is provided for subsys to grab refcnt to css. In typical
> - * case, subsystem has no reference after pre_destroy(). But, under
> - * hierarchy management, some *temporal* refcnt can be hold.
> - * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
> - * is really busy, it should return -EBUSY at pre_destroy(). wake_up
> - * is called when css_put() is called and refcnt goes down to 0.
> - */
> - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
> -
> if (!cgroup_clear_css_refs(cgrp)) {
> mutex_unlock(&cgroup_mutex);
> - schedule();
> + /*
> + * Because someone may call cgroup_wakeup_rmdir_waiter() before
> + * prepare_to_wait(), we need to check this flag.
> + */
> + if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
> + schedule();
> finish_wait(&cgroup_rmdir_waitq, &wait);
> clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> if (signal_pending(current))
> @@ -3294,7 +3315,7 @@ void __css_put(struct cgroup_subsys_stat
> set_bit(CGRP_RELEASABLE, &cgrp->flags);
> check_for_release(cgrp);
> }
> - cgroup_wakeup_rmdir_waiters(cgrp);
> + cgroup_wakeup_rmdir_waiter(cgrp);
> }
> rcu_read_unlock();
> }
> Index: mmotm-2.6.31-Jun25/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Jun25/mm/memcontrol.c
> @@ -1234,6 +1234,12 @@ static int mem_cgroup_move_account(struc
> ret = 0;
> out:
> unlock_page_cgroup(pc);
> + /*
> + * We charges against "to" which may not have any tasks. Then, "to"
> + * can be under rmdir(). But in current implementation, caller of
> + * this function is just force_empty() and it's garanteed that
> + * "to" is never removed. So, we don't check rmdir status here.
> + */
> return ret;
> }
>
> @@ -1455,6 +1461,7 @@ __mem_cgroup_commit_charge_swapin(struct
> return;
> if (!ptr)
> return;
> + cgroup_exclude_rmdir(&ptr->css);
> pc = lookup_page_cgroup(page);
> mem_cgroup_lru_del_before_commit_swapcache(page);
> __mem_cgroup_commit_charge(ptr, pc, ctype);
> @@ -1484,8 +1491,12 @@ __mem_cgroup_commit_charge_swapin(struct
> }
> rcu_read_unlock();
> }
> - /* add this page(page_cgroup) to the LRU we want. */
> -
> + /*
> + * At swapin, we may charge account against cgroup which has no tasks.
> + * So, rmdir()->pre_destroy() can be called while we do this charge.
> + * In that case, we need to call pre_destroy() again. check it here.
> + */
> + cgroup_release_rmdir(&ptr->css);
> }
>
> void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
> @@ -1691,7 +1702,7 @@ void mem_cgroup_end_migration(struct mem
>
> if (!mem)
> return;
> -
> + cgroup_exclude_rmdir(&mem->css);
> /* at migration success, oldpage->mapping is NULL. */
> if (oldpage->mapping) {
> target = oldpage;
> @@ -1731,6 +1742,12 @@ void mem_cgroup_end_migration(struct mem
> */
> if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
> mem_cgroup_uncharge_page(target);
> + /*
> + * At migration, we may charge account against cgroup which has no tasks
> + * So, rmdir()->pre_destroy() can be called while we do this charge.
> + * In that case, we need to call pre_destroy() again. check it here.
> + */
> + cgroup_release_rmdir(&mem->css);
> }
>
> /*
>
Paul Menage wrote:
> On Tue, Jun 30, 2009 at 6:04 PM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
>> BTW, do you have patches for NOOP/signal cgroup we discussed a half year ago ?
>>
>
> Yes - very nearly ready. They were sitting gathering dust for a while,
> but I've just been polishing them up again this week and am planning
> to send them out this week or next.
>
Glad to hear this. :)
* KAMEZAWA Hiroyuki <[email protected]> [2009-07-01 10:47:47]:
> ok, here.
>
> ==
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> After commit: cgroup: fix frequent -EBUSY at rmdir
> ec64f51545fffbc4cb968f0cea56341a4b07e85a
> cgroup's rmdir (especially against memcg) doesn't return -EBUSY
> by temporal ref counts. That commit expects all refs after pre_destroy()
> is temporary but...it wasn't. Then, rmdir can wait permanently.
> This patch tries to fix that and change followings.
>
Sorry for the late review, a few comments below
> - set CGRP_WAIT_ON_RMDIR flag before pre_destroy().
> - clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case.
> if there are sleeping ones, wakes them up.
> - rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set.
>
> Changelog v4->v5:
> - added cgroup_exclude_rmdir(), cgroup_release_rmdir().
>
> Changelog v3->v4:
> - rewrite/add comments.
> - remane cgroup_wakeup_rmdir_waiters() to cgroup_wakeup_rmdir_waiter().
> Changelog v2->v3:
> - removed retry_rmdir() callback.
> - make use of CGRP_WAIT_ON_RMDIR flag more.
>
> Reported-by: Daisuke Nishimura <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> include/linux/cgroup.h | 14 ++++++++++++
> kernel/cgroup.c | 55 +++++++++++++++++++++++++++++++++----------------
> mm/memcontrol.c | 23 +++++++++++++++++---
> 3 files changed, 72 insertions(+), 20 deletions(-)
>
> Index: mmotm-2.6.31-Jun25/include/linux/cgroup.h
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/include/linux/cgroup.h
> +++ mmotm-2.6.31-Jun25/include/linux/cgroup.h
> @@ -366,6 +366,20 @@ int cgroup_task_count(const struct cgrou
> int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
>
> /*
> + * When the subsys has to access css and may add permanent refcnt to css,
> + * it should take care of racy conditions with rmdir(). Following set of
> + * functions, is for stop/restart rmdir if necessary.
> + * Because these will call css_get/put, "css" should be alive css.
> + *
> + * cgroup_exclude_rmdir();
> + * ...do some jobs which may access arbitrary empty cgroup
> + * cgroup_release_rmdir();
> + */
> +
> +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
> +void cgroup_release_rmdir(struct cgroup_subsys_state *css);
> +
> +/*
> * Control Group subsystem type.
> * See Documentation/cgroups/cgroups.txt for details
> */
> Index: mmotm-2.6.31-Jun25/kernel/cgroup.c
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/kernel/cgroup.c
> +++ mmotm-2.6.31-Jun25/kernel/cgroup.c
> @@ -734,16 +734,28 @@ static void cgroup_d_remove_dir(struct d
> * reference to css->refcnt. In general, this refcnt is expected to goes down
> * to zero, soon.
> *
> - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
> + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
> */
> DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
>
> -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
> +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
Should the function explictly mention rmdir? Also something like
release_rmdir should be called release_and_wakeup to make the action
clearer.
Looks good to me otherwise and clean.
> {
> - if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> + if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
> wake_up_all(&cgroup_rmdir_waitq);
> }
>
> +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
> +{
> + css_get(css);
> +}
> +
> +void cgroup_release_rmdir(struct cgroup_subsys_state *css)
> +{
> + cgroup_wakeup_rmdir_waiter(css->cgroup);
> + css_put(css);
> +}
> +
> +
> static int rebind_subsystems(struct cgroupfs_root *root,
> unsigned long final_bits)
> {
> @@ -1357,7 +1369,7 @@ int cgroup_attach_task(struct cgroup *cg
> * wake up rmdir() waiter. the rmdir should fail since the cgroup
> * is no longer empty.
> */
> - cgroup_wakeup_rmdir_waiters(cgrp);
> + cgroup_wakeup_rmdir_waiter(cgrp);
> return 0;
> }
>
> @@ -2696,33 +2708,42 @@ again:
> mutex_unlock(&cgroup_mutex);
>
> /*
> + * In general, subsystem has no css->refcnt after pre_destroy(). But
> + * in racy cases, subsystem may have to get css->refcnt after
> + * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
> + * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
> + * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
> + * and subsystem's reference count handling. Please see css_get/put
> + * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
> + */
> + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> +
> + /*
> * Call pre_destroy handlers of subsys. Notify subsystems
> * that rmdir() request comes.
> */
> ret = cgroup_call_pre_destroy(cgrp);
> - if (ret)
> + if (ret) {
> + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> return ret;
> + }
>
> mutex_lock(&cgroup_mutex);
> parent = cgrp->parent;
> if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
> + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> mutex_unlock(&cgroup_mutex);
> return -EBUSY;
> }
> - /*
> - * css_put/get is provided for subsys to grab refcnt to css. In typical
> - * case, subsystem has no reference after pre_destroy(). But, under
> - * hierarchy management, some *temporal* refcnt can be hold.
> - * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
> - * is really busy, it should return -EBUSY at pre_destroy(). wake_up
> - * is called when css_put() is called and refcnt goes down to 0.
> - */
> - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
> -
> if (!cgroup_clear_css_refs(cgrp)) {
> mutex_unlock(&cgroup_mutex);
> - schedule();
> + /*
> + * Because someone may call cgroup_wakeup_rmdir_waiter() before
> + * prepare_to_wait(), we need to check this flag.
> + */
> + if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
> + schedule();
> finish_wait(&cgroup_rmdir_waitq, &wait);
> clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
> if (signal_pending(current))
> @@ -3294,7 +3315,7 @@ void __css_put(struct cgroup_subsys_stat
> set_bit(CGRP_RELEASABLE, &cgrp->flags);
> check_for_release(cgrp);
> }
> - cgroup_wakeup_rmdir_waiters(cgrp);
> + cgroup_wakeup_rmdir_waiter(cgrp);
> }
> rcu_read_unlock();
> }
> Index: mmotm-2.6.31-Jun25/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Jun25.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Jun25/mm/memcontrol.c
> @@ -1234,6 +1234,12 @@ static int mem_cgroup_move_account(struc
> ret = 0;
> out:
> unlock_page_cgroup(pc);
> + /*
> + * We charges against "to" which may not have any tasks. Then, "to"
> + * can be under rmdir(). But in current implementation, caller of
> + * this function is just force_empty() and it's garanteed that
> + * "to" is never removed. So, we don't check rmdir status here.
> + */
> return ret;
> }
>
> @@ -1455,6 +1461,7 @@ __mem_cgroup_commit_charge_swapin(struct
> return;
> if (!ptr)
> return;
> + cgroup_exclude_rmdir(&ptr->css);
> pc = lookup_page_cgroup(page);
> mem_cgroup_lru_del_before_commit_swapcache(page);
> __mem_cgroup_commit_charge(ptr, pc, ctype);
> @@ -1484,8 +1491,12 @@ __mem_cgroup_commit_charge_swapin(struct
> }
> rcu_read_unlock();
> }
> - /* add this page(page_cgroup) to the LRU we want. */
> -
> + /*
> + * At swapin, we may charge account against cgroup which has no tasks.
> + * So, rmdir()->pre_destroy() can be called while we do this charge.
> + * In that case, we need to call pre_destroy() again. check it here.
> + */
> + cgroup_release_rmdir(&ptr->css);
> }
>
> void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
> @@ -1691,7 +1702,7 @@ void mem_cgroup_end_migration(struct mem
>
> if (!mem)
> return;
> -
> + cgroup_exclude_rmdir(&mem->css);
> /* at migration success, oldpage->mapping is NULL. */
> if (oldpage->mapping) {
> target = oldpage;
> @@ -1731,6 +1742,12 @@ void mem_cgroup_end_migration(struct mem
> */
> if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
> mem_cgroup_uncharge_page(target);
> + /*
> + * At migration, we may charge account against cgroup which has no tasks
> + * So, rmdir()->pre_destroy() can be called while we do this charge.
> + * In that case, we need to call pre_destroy() again. check it here.
> + */
> + cgroup_release_rmdir(&mem->css);
> }
>
> /*
>
--
Balbir
On Thu, 2 Jul 2009 12:00:05 +0530
Balbir Singh <[email protected]> wrote:
> * KAMEZAWA Hiroyuki <[email protected]> [2009-07-01 10:47:47]:
> > -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
> > +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
>
> Should the function explictly mention rmdir?
For now, yes. this is only for rmdir.
> Also something like
> release_rmdir should be called release_and_wakeup to make the action
> clearer.
>
Hm, I don't think this name is too bad. Comment is enough and if we have to
change the behavior of cgroup-internal work, we have to rename this again.
This name is not too much explaining but showing enough information.
> Looks good to me otherwise and clean.
>
Thank you.
Regards,
-Kame