2010-06-03 02:52:59

by Kamezawa Hiroyuki

Subject: [PATCH 1/2] memcg clean up try_charge main loop

From: KAMEZAWA Hiroyuki <[email protected]>

mem_cgroup_try_charge() has a big loop in it and is hard to read.
Most of the code in it is for the slow path. This patch moves that code out of the
loop and makes it clear what is done.

Summary:
- factor out a function that detects whether a memcg is under account move.
- factor out a function that waits for the end of task-account moving.
- factor out the main loop's slow path as a function and make it clear,
via the return code, why we retry or quit.
- add a fatal_signal_pending() check to bypass the charge loop.

Changelog 2010-06-01
- added fatal_signal_pending() to bypass the charge loop. This is useful
and valid to do because if a fatal signal is pending, charging on behalf
of the task isn't really necessary, and the user sees a smooth kill even
under heavy workload on a memcg.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
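For orientation, the shape of the reworked loop can be shown as a small
standalone C model: a helper makes a single charge attempt and reports a
CHARGE_* code, and the caller only dispatches on that code. This is a
simplified sketch, not the patch itself; do_charge_once(), killed(), their toy
behaviour, and the -1 stand-in for -ENOMEM are illustrative assumptions, and
the OOM retry bookkeeping and the per-cpu stock handling are left out.

#include <stdbool.h>
#include <stdio.h>

/* Return codes of the factored-out slow path (mirrors the patch's enum). */
enum charge_result {
        CHARGE_OK,              /* charged successfully */
        CHARGE_RETRY,           /* transient failure, worth retrying */
        CHARGE_NOMEM,           /* cannot make progress */
        CHARGE_WOULDBLOCK,      /* caller may not sleep, give up at once */
        CHARGE_OOM_DIE,         /* current task was killed by the OOM killer */
};

/* Stand-in for __mem_cgroup_do_charge(): one attempt, possibly with reclaim. */
static enum charge_result do_charge_once(void)
{
        static int attempts;
        return ++attempts < 3 ? CHARGE_RETRY : CHARGE_OK; /* toy behaviour */
}

/* Stand-in for fatal_signal_pending(current). */
static bool killed(void)
{
        return false;
}

/* Shape of the reworked __mem_cgroup_try_charge() main loop. */
static int try_charge(void)
{
        enum charge_result ret;

        do {
                if (killed())
                        return 0;       /* bypass: the task is dying anyway */

                ret = do_charge_once();

                switch (ret) {
                case CHARGE_OK:
                case CHARGE_RETRY:      /* go around again */
                        break;
                case CHARGE_WOULDBLOCK:
                case CHARGE_NOMEM:      /* real code may retry/oom-kill first */
                        return -1;      /* -ENOMEM to the caller */
                case CHARGE_OOM_DIE:
                        return 0;       /* bypass, like a fatal signal */
                }
        } while (ret != CHARGE_OK);

        return 0;
}

int main(void)
{
        printf("try_charge() -> %d\n", try_charge());
        return 0;
}

The real loop additionally consumes the per-cpu stock, counts OOM retries and
refills the stock on success; those details are in the diff below.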
mm/memcontrol.c | 247 +++++++++++++++++++++++++++++++++-----------------------
1 file changed, 148 insertions(+), 99 deletions(-)

Index: mmotm-2.6.34-May21/mm/memcontrol.c
===================================================================
--- mmotm-2.6.34-May21.orig/mm/memcontrol.c
+++ mmotm-2.6.34-May21/mm/memcontrol.c
@@ -1072,6 +1072,49 @@ static unsigned int get_swappiness(struc
return swappiness;
}

+/* A routine for testing mem is not under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+ bool ret = false;
+
+ if (from == mem || to == mem)
+ return true;
+
+ if (!from || !to || !mem->use_hierarchy)
+ return false;
+
+ rcu_read_lock();
+ if (css_tryget(&from->css)) {
+ ret = css_is_ancestor(&from->css, &mem->css);
+ css_put(&from->css);
+ }
+ if (!ret && css_tryget(&to->css)) {
+ ret = css_is_ancestor(&to->css, &mem->css);
+ css_put(&to->css);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+ if (mc.moving_task && current != mc.moving_task) {
+ if (mem_cgroup_under_move(mem)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+ /* moving charge context might have finished. */
+ if (mc.moving_task)
+ schedule();
+ finish_wait(&mc.waitq, &wait);
+ return true;
+ }
+ }
+ return false;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -1582,16 +1625,83 @@ static int __cpuinit memcg_stock_cpu_cal
return NOTIFY_OK;
}

+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+ CHARGE_OK, /* success */
+ CHARGE_RETRY, /* need to retry but retry is not bad */
+ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
+ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
+ CHARGE_OOM_DIE, /* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+ int csize, bool oom_check)
+{
+ struct mem_cgroup *mem_over_limit;
+ struct res_counter *fail_res;
+ unsigned long flags = 0;
+ int ret;
+
+ ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+ if (likely(!ret)) {
+ if (!do_swap_account)
+ return CHARGE_OK;
+ ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+ if (likely(!ret))
+ return CHARGE_OK;
+
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+ flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+ } else
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+ if (csize > PAGE_SIZE) /* change csize and retry */
+ return CHARGE_RETRY;
+
+ if (!(gfp_mask & __GFP_WAIT))
+ return CHARGE_WOULDBLOCK;
+
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+ gfp_mask, flags);
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /*
+ * At task move, charge accounts can be doubly counted. So, it's
+ * better to wait until the end of task_move if something is going on.
+ */
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /* If we don't need to call oom-killer at el, return immediately */
+ if (!oom_check)
+ return CHARGE_NOMEM;
+ /* check OOM */
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+ return CHARGE_OOM_DIE;
+
+ return CHARGE_RETRY;
+}
+
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
*/
+
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
- struct mem_cgroup *mem, *mem_over_limit;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct res_counter *fail_res;
+ struct mem_cgroup *mem = NULL;
+ int ret = CHARGE_RETRY;
int csize = CHARGE_SIZE;

/*
@@ -1609,120 +1719,57 @@ static int __mem_cgroup_try_charge(struc
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- mem = *memcg;
- if (likely(!mem)) {
+ if (*memcg) {
+ mem = *memcg;
+ css_get(&mem->css);
+ } else {
mem = try_get_mem_cgroup_from_mm(mm);
+ if (unlikely(!mem))
+ return 0;
*memcg = mem;
- } else {
- css_get(&mem->css);
}
- if (unlikely(!mem))
- return 0;

VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;

- while (1) {
- int ret = 0;
- unsigned long flags = 0;
+ while (ret != CHARGE_OK) {
+ int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ bool oom_check;

if (consume_stock(mem))
- goto done;
-
- ret = res_counter_charge(&mem->res, csize, &fail_res);
- if (likely(!ret)) {
- if (!do_swap_account)
- break;
- ret = res_counter_charge(&mem->memsw, csize, &fail_res);
- if (likely(!ret))
- break;
- /* mem+swap counter fails */
- res_counter_uncharge(&mem->res, csize);
- flags |= MEM_CGROUP_RECLAIM_NOSWAP;
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- memsw);
- } else
- /* mem counter fails */
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- res);
+ goto done; /* don't need to fill stock */
+ /* If killed, bypass charge */
+ if (fatal_signal_pending(current))
+ goto bypass;

- /* reduce request size and retry */
- if (csize > PAGE_SIZE) {
- csize = PAGE_SIZE;
- continue;
+ oom_check = false;
+ if (oom && !nr_oom_retries) {
+ oom_check = true;
+ nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- if (!(gfp_mask & __GFP_WAIT))
- goto nomem;

- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
- if (ret)
- continue;
+ ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);

- /*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
- *
- */
- if (mem_cgroup_check_under_limit(mem_over_limit))
- continue;
-
- /* try to avoid oom while someone is moving charge */
- if (mc.moving_task && current != mc.moving_task) {
- struct mem_cgroup *from, *to;
- bool do_continue = false;
- /*
- * There is a small race that "from" or "to" can be
- * freed by rmdir, so we use css_tryget().
- */
- from = mc.from;
- to = mc.to;
- if (from && css_tryget(&from->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &from->css,
- &mem_over_limit->css);
- else
- do_continue = (from == mem_over_limit);
- css_put(&from->css);
- }
- if (!do_continue && to && css_tryget(&to->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &to->css,
- &mem_over_limit->css);
- else
- do_continue = (to == mem_over_limit);
- css_put(&to->css);
- }
- if (do_continue) {
- DEFINE_WAIT(wait);
- prepare_to_wait(&mc.waitq, &wait,
- TASK_INTERRUPTIBLE);
- /* moving charge context might have finished. */
- if (mc.moving_task)
- schedule();
- finish_wait(&mc.waitq, &wait);
- continue;
- }
- }
-
- if (!nr_retries--) {
+ switch (ret) {
+ case CHARGE_OK:
+ break;
+ case CHARGE_RETRY: /* not in OOM situation but retry */
+ csize = PAGE_SIZE;
+ break;
+ case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+ goto nomem;
+ case CHARGE_NOMEM: /* OOM routine works */
if (!oom)
goto nomem;
- if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- continue;
- }
- /* When we reach here, current task is dying .*/
- css_put(&mem->css);
+ /* If oom, we never return -ENOMEM */
+ nr_oom_retries--;
+ break;
+ case CHARGE_OOM_DIE: /* Killed by OOM Killer */
goto bypass;
}
}
+
if (csize > PAGE_SIZE)
refill_stock(mem, csize - PAGE_SIZE);
done:
@@ -1731,6 +1778,8 @@ nomem:
css_put(&mem->css);
return -ENOMEM;
bypass:
+ if (mem)
+ css_put(&mem->css);
*memcg = NULL;
return 0;
}


2010-06-03 02:55:40

by Kamezawa Hiroyuki

Subject: [PATCH 2/2] memcg make move_task_charge check clearer

From: KAMEZAWA Hiroyuki <[email protected]>

Now, to check whether a memcg is under task-account moving, we do css_tryget()
against mc.to and mc.from. But this just complicates things. This patch
makes the check simpler.

This patch adds a spinlock to move_charge_struct and guards modification
of mc.to and mc.from with it. With this, we don't have to think about
complicated races around this non-critical path.

Changelog:
- removed disable/enable irq.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
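The locking pattern itself is small enough to show as standalone C: the writer
publishes or clears mc.from/mc.to only while holding the lock, and the reader
takes the same lock to get a consistent snapshot that it can safely
dereference. The pthread mutex and the toy structs below are stand-ins for the
kernel spinlock and struct mem_cgroup, not the kernel code; the kernel version
also performs the use_hierarchy/css_is_ancestor() checks under the lock.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct memcg {
        int id;
};

/* Toy counterpart of move_charge_struct: pointers published under mc.lock. */
static struct {
        pthread_mutex_t lock;
        struct memcg *from;
        struct memcg *to;
} mc = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
};

/* Writer side (can_attach / clear_mc): set or clear both pointers atomically. */
static void mc_set(struct memcg *from, struct memcg *to)
{
        pthread_mutex_lock(&mc.lock);
        mc.from = from;
        mc.to = to;
        pthread_mutex_unlock(&mc.lock);
}

/* Reader side (the under-move check): snapshot both pointers under the lock. */
static bool mc_involves(struct memcg *mem)
{
        bool ret = false;

        pthread_mutex_lock(&mc.lock);
        if (mc.from)            /* a move is in flight */
                ret = (mc.from == mem || mc.to == mem);
        pthread_mutex_unlock(&mc.lock);
        return ret;
}

int main(void)
{
        struct memcg a = { 1 }, b = { 2 }, c = { 3 };

        mc_set(&a, &b);
        printf("a involved? %d, c involved? %d\n", mc_involves(&a), mc_involves(&c));
        mc_set(NULL, NULL);
        printf("after clear, a involved? %d\n", mc_involves(&a));
        return 0;
}

Because mc.from and mc.to are now only ever read or written under mc.lock, the
css_tryget()/RCU dance in the old check is no longer needed.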
mm/memcontrol.c | 48 ++++++++++++++++++++++++++++--------------------
1 file changed, 28 insertions(+), 20 deletions(-)

Index: mmotm-2.6.34-May21/mm/memcontrol.c
===================================================================
--- mmotm-2.6.34-May21.orig/mm/memcontrol.c
+++ mmotm-2.6.34-May21/mm/memcontrol.c
@@ -268,6 +268,7 @@ enum move_type {

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
+ spinlock_t lock; /* for from, to, moving_task */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
@@ -276,6 +277,7 @@ static struct move_charge_struct {
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
+ .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

@@ -1076,26 +1078,25 @@ static unsigned int get_swappiness(struc

static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
- struct mem_cgroup *from = mc.from;
- struct mem_cgroup *to = mc.to;
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
bool ret = false;
-
- if (from == mem || to == mem)
- return true;
-
- if (!from || !to || !mem->use_hierarchy)
- return false;
-
- rcu_read_lock();
- if (css_tryget(&from->css)) {
- ret = css_is_ancestor(&from->css, &mem->css);
- css_put(&from->css);
- }
- if (!ret && css_tryget(&to->css)) {
- ret = css_is_ancestor(&to->css, &mem->css);
+ /*
+ * Unlike task_move routines, we access mc.to, mc.from not under
+ * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+ */
+ spin_lock(&mc.lock);
+ from = mc.from;
+ to = mc.to;
+ if (!from)
+ goto unlock;
+ if (from == mem || to == mem
+ || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+ || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
css_put(&to->css);
- }
- rcu_read_unlock();
+ ret = true;
+unlock:
+ spin_unlock(&mc.lock);
return ret;
}

@@ -4447,11 +4448,13 @@ static int mem_cgroup_precharge_mc(struc

static void mem_cgroup_clear_mc(void)
{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
- memcg_oom_recover(mc.to);
}
/*
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4460,7 +4463,6 @@ static void mem_cgroup_clear_mc(void)
if (mc.moved_charge) {
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
- memcg_oom_recover(mc.from);
}
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
@@ -4485,9 +4487,13 @@ static void mem_cgroup_clear_mc(void)

mc.moved_swap = 0;
}
+ spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
mc.moving_task = NULL;
+ spin_unlock(&mc.lock);
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}

@@ -4516,12 +4522,14 @@ static int mem_cgroup_can_attach(struct
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
mc.precharge = 0;
mc.moved_charge = 0;
mc.moved_swap = 0;
mc.moving_task = current;
+ spin_unlock(&mc.lock);

ret = mem_cgroup_precharge_mc(mm);
if (ret)

2010-06-03 06:10:12

by Daisuke Nishimura

Subject: Re: [PATCH 1/2] memcg clean up try_charge main loop

On Thu, 3 Jun 2010 11:48:37 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> mem_cgroup_try_charge() has a big loop in it and is hard to read.
> Most of the code in it is for the slow path. This patch moves that code out of the
> loop and makes it clear what is done.
>
> Summary:
> - factor out a function that detects whether a memcg is under account move.
> - factor out a function that waits for the end of task-account moving.
> - factor out the main loop's slow path as a function and make it clear,
> via the return code, why we retry or quit.
> - add a fatal_signal_pending() check to bypass the charge loop.
>
> Changelog 2010-06-01
> - added fatal_signal_pending() to bypass the charge loop. This is useful
> and valid to do because if a fatal signal is pending, charging on behalf
> of the task isn't really necessary, and the user sees a smooth kill even
> under heavy workload on a memcg.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> mm/memcontrol.c | 247 +++++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 148 insertions(+), 99 deletions(-)
>
> Index: mmotm-2.6.34-May21/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.34-May21.orig/mm/memcontrol.c
> +++ mmotm-2.6.34-May21/mm/memcontrol.c
> @@ -1072,6 +1072,49 @@ static unsigned int get_swappiness(struc
> return swappiness;
> }
>
> +/* A routine for testing mem is not under move_account */
> +
> +static bool mem_cgroup_under_move(struct mem_cgroup *mem)
> +{
> + struct mem_cgroup *from = mc.from;
> + struct mem_cgroup *to = mc.to;
> + bool ret = false;
> +
> + if (from == mem || to == mem)
> + return true;
> +
> + if (!from || !to || !mem->use_hierarchy)
> + return false;
> +
> + rcu_read_lock();
> + if (css_tryget(&from->css)) {
> + ret = css_is_ancestor(&from->css, &mem->css);
> + css_put(&from->css);
> + }
> + if (!ret && css_tryget(&to->css)) {
> + ret = css_is_ancestor(&to->css, &mem->css);
> + css_put(&to->css);
> + }
> + rcu_read_unlock();
> + return ret;
> +}
> +
> +static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
> +{
> + if (mc.moving_task && current != mc.moving_task) {
> + if (mem_cgroup_under_move(mem)) {
> + DEFINE_WAIT(wait);
> + prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
> + /* moving charge context might have finished. */
> + if (mc.moving_task)
> + schedule();
> + finish_wait(&mc.waitq, &wait);
> + return true;
> + }
> + }
> + return false;
> +}
> +
> static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
> {
> int *val = data;
> @@ -1582,16 +1625,83 @@ static int __cpuinit memcg_stock_cpu_cal
> return NOTIFY_OK;
> }
>
> +
> +/* See __mem_cgroup_try_charge() for details */
> +enum {
> + CHARGE_OK, /* success */
> + CHARGE_RETRY, /* need to retry but retry is not bad */
> + CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
> + CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
> + CHARGE_OOM_DIE, /* the current is killed because of OOM */
> +};
> +
> +static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
> + int csize, bool oom_check)
> +{
> + struct mem_cgroup *mem_over_limit;
> + struct res_counter *fail_res;
> + unsigned long flags = 0;
> + int ret;
> +
> + ret = res_counter_charge(&mem->res, csize, &fail_res);
> +
> + if (likely(!ret)) {
> + if (!do_swap_account)
> + return CHARGE_OK;
> + ret = res_counter_charge(&mem->memsw, csize, &fail_res);
> + if (likely(!ret))
> + return CHARGE_OK;
> +
> + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
> + flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> + } else
> + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
> +
> + if (csize > PAGE_SIZE) /* change csize and retry */
> + return CHARGE_RETRY;
> +
> + if (!(gfp_mask & __GFP_WAIT))
> + return CHARGE_WOULDBLOCK;
> +
> + ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
> + gfp_mask, flags);
> + /*
> + * try_to_free_mem_cgroup_pages() might not give us a full
> + * picture of reclaim. Some pages are reclaimed and might be
> + * moved to swap cache or just unmapped from the cgroup.
> + * Check the limit again to see if the reclaim reduced the
> + * current usage of the cgroup before giving up
> + */
> + if (ret || mem_cgroup_check_under_limit(mem_over_limit))
> + return CHARGE_RETRY;
> +
> + /*
> + * At task move, charge accounts can be doubly counted. So, it's
> + * better to wait until the end of task_move if something is going on.
> + */
> + if (mem_cgroup_wait_acct_move(mem_over_limit))
> + return CHARGE_RETRY;
> +
> + /* If we don't need to call oom-killer at el, return immediately */
> + if (!oom_check)
> + return CHARGE_NOMEM;
> + /* check OOM */
> + if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
> + return CHARGE_OOM_DIE;
> +
> + return CHARGE_RETRY;
> +}
> +
> /*
> * Unlike exported interface, "oom" parameter is added. if oom==true,
> * oom-killer can be invoked.
> */
> +
> static int __mem_cgroup_try_charge(struct mm_struct *mm,
> - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
> + gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
> {
> - struct mem_cgroup *mem, *mem_over_limit;
> - int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> - struct res_counter *fail_res;
> + struct mem_cgroup *mem = NULL;
> + int ret = CHARGE_RETRY;
> int csize = CHARGE_SIZE;
>
it would be a nitpick, but how about:

int ret;
...
do {
...
} while (ret != CHARGE_OK);
?

> /*
> @@ -1609,120 +1719,57 @@ static int __mem_cgroup_try_charge(struc
> * thread group leader migrates. It's possible that mm is not
> * set, if so charge the init_mm (happens for pagecache usage).
> */
> - mem = *memcg;
> - if (likely(!mem)) {
> + if (*memcg) {
> + mem = *memcg;
> + css_get(&mem->css);
> + } else {
> mem = try_get_mem_cgroup_from_mm(mm);
> + if (unlikely(!mem))
> + return 0;
> *memcg = mem;
> - } else {
> - css_get(&mem->css);
> }
> - if (unlikely(!mem))
> - return 0;
>
> VM_BUG_ON(css_is_removed(&mem->css));
> if (mem_cgroup_is_root(mem))
> goto done;
>
> - while (1) {
> - int ret = 0;
> - unsigned long flags = 0;
> + while (ret != CHARGE_OK) {
> + int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
> + bool oom_check;
>
> if (consume_stock(mem))
> - goto done;
> -
> - ret = res_counter_charge(&mem->res, csize, &fail_res);
> - if (likely(!ret)) {
> - if (!do_swap_account)
> - break;
> - ret = res_counter_charge(&mem->memsw, csize, &fail_res);
> - if (likely(!ret))
> - break;
> - /* mem+swap counter fails */
> - res_counter_uncharge(&mem->res, csize);
> - flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> - mem_over_limit = mem_cgroup_from_res_counter(fail_res,
> - memsw);
> - } else
> - /* mem counter fails */
> - mem_over_limit = mem_cgroup_from_res_counter(fail_res,
> - res);
> + goto done; /* don't need to fill stock */
> + /* If killed, bypass charge */
> + if (fatal_signal_pending(current))
> + goto bypass;
>
> - /* reduce request size and retry */
> - if (csize > PAGE_SIZE) {
> - csize = PAGE_SIZE;
> - continue;
> + oom_check = false;
> + if (oom && !nr_oom_retries) {
> + oom_check = true;
> + nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
> }
Well, as I said before, nr_oom_retries is reset at the beginning of
this loop on every iteration. Actually, I cannot trigger an oom even when I set
a value in limit_in_bytes and run a program that uses more memory than the limit,
because we never reach !nr_oom_retries here, so oom_check is never set to true.
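
To make the scoping issue concrete, here is a small standalone program (not
kernel code) that mimics the two placements of the counter; with the
declaration inside the loop the counter is re-initialised on every pass, so
the !nr_oom_retries branch can never fire:

#include <stdio.h>

#define RETRIES 5

int main(void)
{
        int pass;

        /* As posted: the counter is declared (and thus reset) inside the loop. */
        for (pass = 0; pass < 20; pass++) {
                int nr_oom_retries = RETRIES;   /* reset on every iteration */

                if (!nr_oom_retries) {          /* never true */
                        printf("inner: would check OOM at pass %d\n", pass);
                        break;
                }
                nr_oom_retries--;               /* lost on the next iteration */
        }
        printf("inner declaration: OOM check never reached\n");

        /* Intended: initialise the counter once, before the loop. */
        int nr_oom_retries = RETRIES;

        for (pass = 0; pass < 20; pass++) {
                if (!nr_oom_retries) {
                        printf("outer: OOM check reached at pass %d\n", pass);
                        break;
                }
                nr_oom_retries--;
        }
        return 0;
}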


Thanks,
Daisuke Nishimura.

> - if (!(gfp_mask & __GFP_WAIT))
> - goto nomem;
>
> - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
> - gfp_mask, flags);
> - if (ret)
> - continue;
> + ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
>
> - /*
> - * try_to_free_mem_cgroup_pages() might not give us a full
> - * picture of reclaim. Some pages are reclaimed and might be
> - * moved to swap cache or just unmapped from the cgroup.
> - * Check the limit again to see if the reclaim reduced the
> - * current usage of the cgroup before giving up
> - *
> - */
> - if (mem_cgroup_check_under_limit(mem_over_limit))
> - continue;
> -
> - /* try to avoid oom while someone is moving charge */
> - if (mc.moving_task && current != mc.moving_task) {
> - struct mem_cgroup *from, *to;
> - bool do_continue = false;
> - /*
> - * There is a small race that "from" or "to" can be
> - * freed by rmdir, so we use css_tryget().
> - */
> - from = mc.from;
> - to = mc.to;
> - if (from && css_tryget(&from->css)) {
> - if (mem_over_limit->use_hierarchy)
> - do_continue = css_is_ancestor(
> - &from->css,
> - &mem_over_limit->css);
> - else
> - do_continue = (from == mem_over_limit);
> - css_put(&from->css);
> - }
> - if (!do_continue && to && css_tryget(&to->css)) {
> - if (mem_over_limit->use_hierarchy)
> - do_continue = css_is_ancestor(
> - &to->css,
> - &mem_over_limit->css);
> - else
> - do_continue = (to == mem_over_limit);
> - css_put(&to->css);
> - }
> - if (do_continue) {
> - DEFINE_WAIT(wait);
> - prepare_to_wait(&mc.waitq, &wait,
> - TASK_INTERRUPTIBLE);
> - /* moving charge context might have finished. */
> - if (mc.moving_task)
> - schedule();
> - finish_wait(&mc.waitq, &wait);
> - continue;
> - }
> - }
> -
> - if (!nr_retries--) {
> + switch (ret) {
> + case CHARGE_OK:
> + break;
> + case CHARGE_RETRY: /* not in OOM situation but retry */
> + csize = PAGE_SIZE;
> + break;
> + case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
> + goto nomem;
> + case CHARGE_NOMEM: /* OOM routine works */
> if (!oom)
> goto nomem;
> - if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
> - nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> - continue;
> - }
> - /* When we reach here, current task is dying .*/
> - css_put(&mem->css);
> + /* If oom, we never return -ENOMEM */
> + nr_oom_retries--;
> + break;
> + case CHARGE_OOM_DIE: /* Killed by OOM Killer */
> goto bypass;
> }
> }
> +
> if (csize > PAGE_SIZE)
> refill_stock(mem, csize - PAGE_SIZE);
> done:
> @@ -1731,6 +1778,8 @@ nomem:
> css_put(&mem->css);
> return -ENOMEM;
> bypass:
> + if (mem)
> + css_put(&mem->css);
> *memcg = NULL;
> return 0;
> }
>

2010-06-03 06:33:05

by Kamezawa Hiroyuki

Subject: Re: [PATCH 1/2] memcg clean up try_charge main loop

On Thu, 3 Jun 2010 15:06:19 +0900
Daisuke Nishimura <[email protected]> wrote:
> Well, as I said before, nr_oom_retries is reset at the beginning of
> this loop on every iteration. Actually, I cannot trigger an oom even when I set
> a value in limit_in_bytes and run a program that uses more memory than the limit,
> because we never reach !nr_oom_retries here, so oom_check is never set to true.
>
>
Sorry; here is the fixed version.

==
From: KAMEZAWA Hiroyuki <[email protected]>

mem_cgroup_try_charge() has a big loop in it and is hard to read.
Most of the code in it is for the slow path. This patch moves that code out of the
loop and makes it clear what is done.

Summary:
- factor out a function that detects whether a memcg is under account move.
- factor out a function that waits for the end of task-account moving.
- factor out the main loop's slow path as a function and make it clear,
via the return code, why we retry or quit.
- add a fatal_signal_pending() check to bypass the charge loop.

Changelog 2010-06-03
- fixed oom retry handling.
- use do...while()
Changelog 2010-06-01
- added fatal_signal_pending() to bypass the charge loop. This is useful
and valid to do because if a fatal signal is pending, charging on behalf
of the task isn't really necessary, and the user sees a smooth kill even
under heavy workload on a memcg.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 249 +++++++++++++++++++++++++++++++++-----------------------
1 file changed, 149 insertions(+), 100 deletions(-)

Index: mmotm-2.6.34-May21/mm/memcontrol.c
===================================================================
--- mmotm-2.6.34-May21.orig/mm/memcontrol.c
+++ mmotm-2.6.34-May21/mm/memcontrol.c
@@ -1072,6 +1072,49 @@ static unsigned int get_swappiness(struc
return swappiness;
}

+/* A routine for testing mem is not under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+ bool ret = false;
+
+ if (from == mem || to == mem)
+ return true;
+
+ if (!from || !to || !mem->use_hierarchy)
+ return false;
+
+ rcu_read_lock();
+ if (css_tryget(&from->css)) {
+ ret = css_is_ancestor(&from->css, &mem->css);
+ css_put(&from->css);
+ }
+ if (!ret && css_tryget(&to->css)) {
+ ret = css_is_ancestor(&to->css, &mem->css);
+ css_put(&to->css);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+ if (mc.moving_task && current != mc.moving_task) {
+ if (mem_cgroup_under_move(mem)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+ /* moving charge context might have finished. */
+ if (mc.moving_task)
+ schedule();
+ finish_wait(&mc.waitq, &wait);
+ return true;
+ }
+ }
+ return false;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -1582,16 +1625,84 @@ static int __cpuinit memcg_stock_cpu_cal
return NOTIFY_OK;
}

+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+ CHARGE_OK, /* success */
+ CHARGE_RETRY, /* need to retry but retry is not bad */
+ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
+ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
+ CHARGE_OOM_DIE, /* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+ int csize, bool oom_check)
+{
+ struct mem_cgroup *mem_over_limit;
+ struct res_counter *fail_res;
+ unsigned long flags = 0;
+ int ret;
+
+ ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+ if (likely(!ret)) {
+ if (!do_swap_account)
+ return CHARGE_OK;
+ ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+ if (likely(!ret))
+ return CHARGE_OK;
+
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+ flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+ } else
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+ if (csize > PAGE_SIZE) /* change csize and retry */
+ return CHARGE_RETRY;
+
+ if (!(gfp_mask & __GFP_WAIT))
+ return CHARGE_WOULDBLOCK;
+
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+ gfp_mask, flags);
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /*
+ * At task move, charge accounts can be doubly counted. So, it's
+ * better to wait until the end of task_move if something is going on.
+ */
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /* If we don't need to call oom-killer at el, return immediately */
+ if (!oom_check)
+ return CHARGE_NOMEM;
+ /* check OOM */
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+ return CHARGE_OOM_DIE;
+
+ return CHARGE_RETRY;
+}
+
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
*/
+
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
- struct mem_cgroup *mem, *mem_over_limit;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct res_counter *fail_res;
+ int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup *mem = NULL;
+ int ret;
int csize = CHARGE_SIZE;

/*
@@ -1609,120 +1720,56 @@ static int __mem_cgroup_try_charge(struc
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- mem = *memcg;
- if (likely(!mem)) {
+ if (*memcg) {
+ mem = *memcg;
+ css_get(&mem->css);
+ } else {
mem = try_get_mem_cgroup_from_mm(mm);
+ if (unlikely(!mem))
+ return 0;
*memcg = mem;
- } else {
- css_get(&mem->css);
}
- if (unlikely(!mem))
- return 0;

VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;

- while (1) {
- int ret = 0;
- unsigned long flags = 0;
+ do {
+ bool oom_check;

if (consume_stock(mem))
- goto done;
-
- ret = res_counter_charge(&mem->res, csize, &fail_res);
- if (likely(!ret)) {
- if (!do_swap_account)
- break;
- ret = res_counter_charge(&mem->memsw, csize, &fail_res);
- if (likely(!ret))
- break;
- /* mem+swap counter fails */
- res_counter_uncharge(&mem->res, csize);
- flags |= MEM_CGROUP_RECLAIM_NOSWAP;
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- memsw);
- } else
- /* mem counter fails */
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- res);
+ goto done; /* don't need to fill stock */
+ /* If killed, bypass charge */
+ if (fatal_signal_pending(current))
+ goto bypass;

- /* reduce request size and retry */
- if (csize > PAGE_SIZE) {
- csize = PAGE_SIZE;
- continue;
+ oom_check = false;
+ if (oom && !nr_oom_retries) {
+ oom_check = true;
+ nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- if (!(gfp_mask & __GFP_WAIT))
- goto nomem;

- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
- if (ret)
- continue;
+ ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);

- /*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
- *
- */
- if (mem_cgroup_check_under_limit(mem_over_limit))
- continue;
-
- /* try to avoid oom while someone is moving charge */
- if (mc.moving_task && current != mc.moving_task) {
- struct mem_cgroup *from, *to;
- bool do_continue = false;
- /*
- * There is a small race that "from" or "to" can be
- * freed by rmdir, so we use css_tryget().
- */
- from = mc.from;
- to = mc.to;
- if (from && css_tryget(&from->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &from->css,
- &mem_over_limit->css);
- else
- do_continue = (from == mem_over_limit);
- css_put(&from->css);
- }
- if (!do_continue && to && css_tryget(&to->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &to->css,
- &mem_over_limit->css);
- else
- do_continue = (to == mem_over_limit);
- css_put(&to->css);
- }
- if (do_continue) {
- DEFINE_WAIT(wait);
- prepare_to_wait(&mc.waitq, &wait,
- TASK_INTERRUPTIBLE);
- /* moving charge context might have finished. */
- if (mc.moving_task)
- schedule();
- finish_wait(&mc.waitq, &wait);
- continue;
- }
- }
-
- if (!nr_retries--) {
+ switch (ret) {
+ case CHARGE_OK:
+ break;
+ case CHARGE_RETRY: /* not in OOM situation but retry */
+ csize = PAGE_SIZE;
+ break;
+ case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+ goto nomem;
+ case CHARGE_NOMEM: /* OOM routine works */
if (!oom)
goto nomem;
- if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- continue;
- }
- /* When we reach here, current task is dying .*/
- css_put(&mem->css);
+ /* If oom, we never return -ENOMEM */
+ nr_oom_retries--;
+ break;
+ case CHARGE_OOM_DIE: /* Killed by OOM Killer */
goto bypass;
}
- }
+ } while (ret != CHARGE_OK);
+
if (csize > PAGE_SIZE)
refill_stock(mem, csize - PAGE_SIZE);
done:
@@ -1731,6 +1778,8 @@ nomem:
css_put(&mem->css);
return -ENOMEM;
bypass:
+ if (mem)
+ css_put(&mem->css);
*memcg = NULL;
return 0;
}

2010-06-03 06:43:51

by Daisuke Nishimura

Subject: Re: [PATCH 2/2] memcg make move_task_charge check clearer

On Thu, 3 Jun 2010 11:51:19 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> Now, to check whether a memcg is under task-account moving, we do css_tryget()
> against mc.to and mc.from. But this just complicates things. This patch
> makes the check simpler.
>
> This patch adds a spinlock to move_charge_struct and guards modification
> of mc.to and mc.from with it. With this, we don't have to think about
> complicated races around this non-critical path.
>
> Changelog:
> - removed disable/enable irq.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> mm/memcontrol.c | 48 ++++++++++++++++++++++++++++--------------------
> 1 file changed, 28 insertions(+), 20 deletions(-)
>
> Index: mmotm-2.6.34-May21/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.34-May21.orig/mm/memcontrol.c
> +++ mmotm-2.6.34-May21/mm/memcontrol.c
> @@ -268,6 +268,7 @@ enum move_type {
>
> /* "mc" and its members are protected by cgroup_mutex */
> static struct move_charge_struct {
> + spinlock_t lock; /* for from, to, moving_task */
> struct mem_cgroup *from;
> struct mem_cgroup *to;
> unsigned long precharge;
> @@ -276,6 +277,7 @@ static struct move_charge_struct {
> struct task_struct *moving_task; /* a task moving charges */
> wait_queue_head_t waitq; /* a waitq for other context */
> } mc = {
> + .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
> .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
> };
>
> @@ -1076,26 +1078,25 @@ static unsigned int get_swappiness(struc
>
> static bool mem_cgroup_under_move(struct mem_cgroup *mem)
> {
> - struct mem_cgroup *from = mc.from;
> - struct mem_cgroup *to = mc.to;
> + struct mem_cgroup *from;
> + struct mem_cgroup *to;
> bool ret = false;
> -
> - if (from == mem || to == mem)
> - return true;
> -
> - if (!from || !to || !mem->use_hierarchy)
> - return false;
> -
> - rcu_read_lock();
> - if (css_tryget(&from->css)) {
> - ret = css_is_ancestor(&from->css, &mem->css);
> - css_put(&from->css);
> - }
> - if (!ret && css_tryget(&to->css)) {
> - ret = css_is_ancestor(&to->css, &mem->css);
> + /*
> + * Unlike task_move routines, we access mc.to, mc.from not under
> + * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
> + */
> + spin_lock(&mc.lock);
> + from = mc.from;
> + to = mc.to;
> + if (!from)
> + goto unlock;
> + if (from == mem || to == mem
> + || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
> + || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
> css_put(&to->css);
I think this css_put() must be removed too.
Except for that, this patch looks good to me.

Thank you for cleaning up my dirty code :)

Thanks,
Daisuke Nishimura.

> - }
> - rcu_read_unlock();
> + ret = true;
> +unlock:
> + spin_unlock(&mc.lock);
> return ret;
> }
>
> @@ -4447,11 +4448,13 @@ static int mem_cgroup_precharge_mc(struc
>
> static void mem_cgroup_clear_mc(void)
> {
> + struct mem_cgroup *from = mc.from;
> + struct mem_cgroup *to = mc.to;
> +
> /* we must uncharge all the leftover precharges from mc.to */
> if (mc.precharge) {
> __mem_cgroup_cancel_charge(mc.to, mc.precharge);
> mc.precharge = 0;
> - memcg_oom_recover(mc.to);
> }
> /*
> * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
> @@ -4460,7 +4463,6 @@ static void mem_cgroup_clear_mc(void)
> if (mc.moved_charge) {
> __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
> mc.moved_charge = 0;
> - memcg_oom_recover(mc.from);
> }
> /* we must fixup refcnts and charges */
> if (mc.moved_swap) {
> @@ -4485,9 +4487,13 @@ static void mem_cgroup_clear_mc(void)
>
> mc.moved_swap = 0;
> }
> + spin_lock(&mc.lock);
> mc.from = NULL;
> mc.to = NULL;
> mc.moving_task = NULL;
> + spin_unlock(&mc.lock);
> + memcg_oom_recover(from);
> + memcg_oom_recover(to);
> wake_up_all(&mc.waitq);
> }
>
> @@ -4516,12 +4522,14 @@ static int mem_cgroup_can_attach(struct
> VM_BUG_ON(mc.moved_charge);
> VM_BUG_ON(mc.moved_swap);
> VM_BUG_ON(mc.moving_task);
> + spin_lock(&mc.lock);
> mc.from = from;
> mc.to = mem;
> mc.precharge = 0;
> mc.moved_charge = 0;
> mc.moved_swap = 0;
> mc.moving_task = current;
> + spin_unlock(&mc.lock);
>
> ret = mem_cgroup_precharge_mc(mm);
> if (ret)
>

2010-06-03 07:05:38

by Kamezawa Hiroyuki

Subject: Re: [PATCH 2/2] memcg make move_task_charge check clearer

On Thu, 3 Jun 2010 15:40:31 +0900
Daisuke Nishimura <[email protected]> wrote:
> > css_put(&to->css);
> I think this css_put() must be removed too.
> Except for that, this patch looks good to me.
>
Nice catch. Thank you.

> Thank you for cleaning up my dirty code :)

No problem. create -> bugfix -> clean up is the wheel of development.

==
From: KAMEZAWA Hiroyuki <[email protected]>

Now, to check whether a memcg is under task-account moving, we do css_tryget()
against mc.to and mc.from. But this just complicates things. This patch
makes the check simpler.

This patch adds a spinlock to move_charge_struct and guards modification
of mc.to and mc.from with it. With this, we don't have to think about
complicated races around this non-critical path.

Changelog:
- removed disable/enable irq.
- removed unnecessary css_put()

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 49 ++++++++++++++++++++++++++++---------------------
1 file changed, 28 insertions(+), 21 deletions(-)

Index: mmotm-2.6.34-May21/mm/memcontrol.c
===================================================================
--- mmotm-2.6.34-May21.orig/mm/memcontrol.c
+++ mmotm-2.6.34-May21/mm/memcontrol.c
@@ -268,6 +268,7 @@ enum move_type {

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
+ spinlock_t lock; /* for from, to, moving_task */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
@@ -276,6 +277,7 @@ static struct move_charge_struct {
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
+ .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

@@ -1076,26 +1078,24 @@ static unsigned int get_swappiness(struc

static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
- struct mem_cgroup *from = mc.from;
- struct mem_cgroup *to = mc.to;
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
bool ret = false;
-
- if (from == mem || to == mem)
- return true;
-
- if (!from || !to || !mem->use_hierarchy)
- return false;
-
- rcu_read_lock();
- if (css_tryget(&from->css)) {
- ret = css_is_ancestor(&from->css, &mem->css);
- css_put(&from->css);
- }
- if (!ret && css_tryget(&to->css)) {
- ret = css_is_ancestor(&to->css, &mem->css);
- css_put(&to->css);
- }
- rcu_read_unlock();
+ /*
+ * Unlike task_move routines, we access mc.to, mc.from not under
+ * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+ */
+ spin_lock(&mc.lock);
+ from = mc.from;
+ to = mc.to;
+ if (!from)
+ goto unlock;
+ if (from == mem || to == mem
+ || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+ || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+ ret = true;
+unlock:
+ spin_unlock(&mc.lock);
return ret;
}

@@ -4447,11 +4447,13 @@ static int mem_cgroup_precharge_mc(struc

static void mem_cgroup_clear_mc(void)
{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
- memcg_oom_recover(mc.to);
}
/*
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4460,7 +4462,6 @@ static void mem_cgroup_clear_mc(void)
if (mc.moved_charge) {
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
- memcg_oom_recover(mc.from);
}
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
@@ -4485,9 +4486,13 @@ static void mem_cgroup_clear_mc(void)

mc.moved_swap = 0;
}
+ spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
mc.moving_task = NULL;
+ spin_unlock(&mc.lock);
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}

@@ -4516,12 +4521,14 @@ static int mem_cgroup_can_attach(struct
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
mc.precharge = 0;
mc.moved_charge = 0;
mc.moved_swap = 0;
mc.moving_task = current;
+ spin_unlock(&mc.lock);

ret = mem_cgroup_precharge_mc(mm);
if (ret)

2010-06-03 11:32:42

by Daisuke Nishimura

Subject: Re: [PATCH 1/2] memcg clean up try_charge main loop

One more comment.

> + ret = res_counter_charge(&mem->res, csize, &fail_res);
> +
> + if (likely(!ret)) {
> + if (!do_swap_account)
> + return CHARGE_OK;
> + ret = res_counter_charge(&mem->memsw, csize, &fail_res);
> + if (likely(!ret))
> + return CHARGE_OK;
> +
> + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
This must be mem_cgroup_from_res_counter(fail_res, memsw).
We would access an invalid pointer otherwise.

> + flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> + } else
> + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
> +

Thanks,
Daisuke Nishimura.
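
The danger here is the container_of()-style pointer arithmetic behind
mem_cgroup_from_res_counter(): naming the wrong member yields a pointer offset
by the distance between the two fields rather than a pointer to the owning
mem_cgroup. A standalone illustration with made-up types (struct counter and
struct owner are hypothetical, not the kernel structures):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* Hypothetical layout standing in for a mem_cgroup with two res_counters. */
struct counter {
        long usage;
        long limit;
};

struct owner {
        struct counter res;     /* the "mem" counter */
        struct counter memsw;   /* the "mem+swap" counter */
        int id;
};

int main(void)
{
        struct owner o = { .id = 42 };
        struct counter *fail = &o.memsw;        /* the counter that hit its limit */

        struct owner *right = container_of(fail, struct owner, memsw);
        struct owner *wrong = container_of(fail, struct owner, res);

        printf("right == &o ? %d\n", right == &o);      /* 1: valid pointer   */
        printf("wrong == &o ? %d\n", wrong == &o);      /* 0: shifted pointer */
        return 0;
}

Passing memsw, as suggested, makes the arithmetic land back on the owning
structure.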

2010-06-04 00:31:03

by Kamezawa Hiroyuki

Subject: Re: [PATCH 1/2] memcg clean up try_charge main loop

On Thu, 3 Jun 2010 19:38:09 +0900
Daisuke Nishimura <[email protected]> wrote:

> One more comment.
>
> > + ret = res_counter_charge(&mem->res, csize, &fail_res);
> > +
> > + if (likely(!ret)) {
> > + if (!do_swap_account)
> > + return CHARGE_OK;
> > + ret = res_counter_charge(&mem->memsw, csize, &fail_res);
> > + if (likely(!ret))
> > + return CHARGE_OK;
> > +
> > + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
> This must be mem_cgroup_from_res_counter(fail_res, memsw).
> We would access an invalid pointer otherwise.
>
ouch..ok. (my test wasn't enough..)

I'll rewrite this against the new mmotm.

Thanks,
-Kame

2010-06-04 05:50:37

by Kamezawa Hiroyuki

Subject: [PATCH 1/2] memcg clean up try_charge main loop v2

Rebased onto the latest mmotm and tested again.

=
From: KAMEZAWA Hiroyuki <[email protected]>

mem_cgroup_try_charge() has a big loop in it and is hard to read.
Most of the code in it is for the slow path. This patch moves that code out of the
loop and makes it clear what is done.

Summary:
- factor out a function that detects whether a memcg is under account move.
- factor out a function that waits for the end of task-account moving.
- factor out the main loop's slow path as a function and make it clear,
via the return code, why we retry or quit.
- add a fatal_signal_pending() check to bypass the charge loop.

Changelog 2010-06-04
- fixed getting mem_over_limit

Changelog 2010-06-03
- fixed oom retry handling.
- use do...while()
Changelog 2010-06-01
- added fatal_signal_pending() to bypass the charge loop. This is useful
and valid to do because if a fatal signal is pending, charging on behalf
of the task isn't really necessary, and the user sees a smooth kill even
under heavy workload on a memcg.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 248 +++++++++++++++++++++++++++++++++-----------------------
1 file changed, 148 insertions(+), 100 deletions(-)

Index: mmotm-2.6.34-Jun6/mm/memcontrol.c
===================================================================
--- mmotm-2.6.34-Jun6.orig/mm/memcontrol.c
+++ mmotm-2.6.34-Jun6/mm/memcontrol.c
@@ -1072,6 +1072,49 @@ static unsigned int get_swappiness(struc
return swappiness;
}

+/* A routine for testing mem is not under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+ bool ret = false;
+
+ if (from == mem || to == mem)
+ return true;
+
+ if (!from || !to || !mem->use_hierarchy)
+ return false;
+
+ rcu_read_lock();
+ if (css_tryget(&from->css)) {
+ ret = css_is_ancestor(&from->css, &mem->css);
+ css_put(&from->css);
+ }
+ if (!ret && css_tryget(&to->css)) {
+ ret = css_is_ancestor(&to->css, &mem->css);
+ css_put(&to->css);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+ if (mc.moving_task && current != mc.moving_task) {
+ if (mem_cgroup_under_move(mem)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+ /* moving charge context might have finished. */
+ if (mc.moving_task)
+ schedule();
+ finish_wait(&mc.waitq, &wait);
+ return true;
+ }
+ }
+ return false;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -1582,16 +1625,83 @@ static int __cpuinit memcg_stock_cpu_cal
return NOTIFY_OK;
}

+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+ CHARGE_OK, /* success */
+ CHARGE_RETRY, /* need to retry but retry is not bad */
+ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
+ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
+ CHARGE_OOM_DIE, /* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+ int csize, bool oom_check)
+{
+ struct mem_cgroup *mem_over_limit;
+ struct res_counter *fail_res;
+ unsigned long flags = 0;
+ int ret;
+
+ ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+ if (likely(!ret)) {
+ if (!do_swap_account)
+ return CHARGE_OK;
+ ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+ if (likely(!ret))
+ return CHARGE_OK;
+
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+ flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+ } else
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+ if (csize > PAGE_SIZE) /* change csize and retry */
+ return CHARGE_RETRY;
+
+ if (!(gfp_mask & __GFP_WAIT))
+ return CHARGE_WOULDBLOCK;
+
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+ gfp_mask, flags);
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /*
+ * At task move, charge accounts can be doubly counted. So, it's
+ * better to wait until the end of task_move if something is going on.
+ */
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /* If we don't need to call oom-killer at el, return immediately */
+ if (!oom_check)
+ return CHARGE_NOMEM;
+ /* check OOM */
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+ return CHARGE_OOM_DIE;
+
+ return CHARGE_RETRY;
+}
+
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
- struct mem_cgroup *mem, *mem_over_limit;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct res_counter *fail_res;
+ int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup *mem = NULL;
+ int ret;
int csize = CHARGE_SIZE;

/*
@@ -1609,120 +1719,56 @@ static int __mem_cgroup_try_charge(struc
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- mem = *memcg;
- if (likely(!mem)) {
+ if (*memcg) {
+ mem = *memcg;
+ css_get(&mem->css);
+ } else {
mem = try_get_mem_cgroup_from_mm(mm);
+ if (unlikely(!mem))
+ return 0;
*memcg = mem;
- } else {
- css_get(&mem->css);
}
- if (unlikely(!mem))
- return 0;

VM_BUG_ON(css_is_removed(&mem->css));
if (mem_cgroup_is_root(mem))
goto done;

- while (1) {
- int ret = 0;
- unsigned long flags = 0;
+ do {
+ bool oom_check;

if (consume_stock(mem))
- goto done;
-
- ret = res_counter_charge(&mem->res, csize, &fail_res);
- if (likely(!ret)) {
- if (!do_swap_account)
- break;
- ret = res_counter_charge(&mem->memsw, csize, &fail_res);
- if (likely(!ret))
- break;
- /* mem+swap counter fails */
- res_counter_uncharge(&mem->res, csize);
- flags |= MEM_CGROUP_RECLAIM_NOSWAP;
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- memsw);
- } else
- /* mem counter fails */
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- res);
+ goto done; /* don't need to fill stock */
+ /* If killed, bypass charge */
+ if (fatal_signal_pending(current))
+ goto bypass;

- /* reduce request size and retry */
- if (csize > PAGE_SIZE) {
- csize = PAGE_SIZE;
- continue;
+ oom_check = false;
+ if (oom && !nr_oom_retries) {
+ oom_check = true;
+ nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
}
- if (!(gfp_mask & __GFP_WAIT))
- goto nomem;
-
- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
- if (ret)
- continue;

- /*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
- *
- */
- if (mem_cgroup_check_under_limit(mem_over_limit))
- continue;
+ ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);

- /* try to avoid oom while someone is moving charge */
- if (mc.moving_task && current != mc.moving_task) {
- struct mem_cgroup *from, *to;
- bool do_continue = false;
- /*
- * There is a small race that "from" or "to" can be
- * freed by rmdir, so we use css_tryget().
- */
- from = mc.from;
- to = mc.to;
- if (from && css_tryget(&from->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &from->css,
- &mem_over_limit->css);
- else
- do_continue = (from == mem_over_limit);
- css_put(&from->css);
- }
- if (!do_continue && to && css_tryget(&to->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &to->css,
- &mem_over_limit->css);
- else
- do_continue = (to == mem_over_limit);
- css_put(&to->css);
- }
- if (do_continue) {
- DEFINE_WAIT(wait);
- prepare_to_wait(&mc.waitq, &wait,
- TASK_INTERRUPTIBLE);
- /* moving charge context might have finished. */
- if (mc.moving_task)
- schedule();
- finish_wait(&mc.waitq, &wait);
- continue;
- }
- }
-
- if (!nr_retries--) {
+ switch (ret) {
+ case CHARGE_OK:
+ break;
+ case CHARGE_RETRY: /* not in OOM situation but retry */
+ csize = PAGE_SIZE;
+ break;
+ case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+ goto nomem;
+ case CHARGE_NOMEM: /* OOM routine works */
if (!oom)
goto nomem;
- if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- continue;
- }
- /* When we reach here, current task is dying .*/
- css_put(&mem->css);
+ /* If oom, we never return -ENOMEM */
+ nr_oom_retries--;
+ break;
+ case CHARGE_OOM_DIE: /* Killed by OOM Killer */
goto bypass;
}
- }
+ } while (ret != CHARGE_OK);
+
if (csize > PAGE_SIZE)
refill_stock(mem, csize - PAGE_SIZE);
done:
@@ -1731,6 +1777,8 @@ nomem:
css_put(&mem->css);
return -ENOMEM;
bypass:
+ if (mem)
+ css_put(&mem->css);
*memcg = NULL;
return 0;
}

2010-06-04 05:54:07

by Kamezawa Hiroyuki

Subject: [PATCH 2/2] memcg clean up waiting move acct v2

No functional changes; rebased onto mmotm-2010-06-03 and tested again.

==
From: KAMEZAWA Hiroyuki <[email protected]>

Now, to check whether a memcg is under task-account moving, we do css_tryget()
against mc.to and mc.from. But this just complicates things. This patch
makes the check simpler.

This patch adds a spinlock to move_charge_struct and guards modification
of mc.to and mc.from with it. With this, we don't have to think about
complicated races around this non-critical path.

Changelog: 2010-06-04
- rebased onto mmotm-2010-06-03
Changelog:
- removed disable/enable irq.
- removed unnecessary css_put()

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 49 ++++++++++++++++++++++++++++---------------------
1 file changed, 28 insertions(+), 21 deletions(-)

Index: mmotm-2.6.34-Jun6/mm/memcontrol.c
===================================================================
--- mmotm-2.6.34-Jun6.orig/mm/memcontrol.c
+++ mmotm-2.6.34-Jun6/mm/memcontrol.c
@@ -268,6 +268,7 @@ enum move_type {

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
+ spinlock_t lock; /* for from, to, moving_task */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
@@ -276,6 +277,7 @@ static struct move_charge_struct {
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
+ .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

@@ -1076,26 +1078,24 @@ static unsigned int get_swappiness(struc

static bool mem_cgroup_under_move(struct mem_cgroup *mem)
{
- struct mem_cgroup *from = mc.from;
- struct mem_cgroup *to = mc.to;
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
bool ret = false;
-
- if (from == mem || to == mem)
- return true;
-
- if (!from || !to || !mem->use_hierarchy)
- return false;
-
- rcu_read_lock();
- if (css_tryget(&from->css)) {
- ret = css_is_ancestor(&from->css, &mem->css);
- css_put(&from->css);
- }
- if (!ret && css_tryget(&to->css)) {
- ret = css_is_ancestor(&to->css, &mem->css);
- css_put(&to->css);
- }
- rcu_read_unlock();
+ /*
+ * Unlike task_move routines, we access mc.to, mc.from not under
+ * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+ */
+ spin_lock(&mc.lock);
+ from = mc.from;
+ to = mc.to;
+ if (!from)
+ goto unlock;
+ if (from == mem || to == mem
+ || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+ || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+ ret = true;
+unlock:
+ spin_unlock(&mc.lock);
return ret;
}

@@ -4448,11 +4448,13 @@ static int mem_cgroup_precharge_mc(struc

static void mem_cgroup_clear_mc(void)
{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
- memcg_oom_recover(mc.to);
}
/*
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4461,7 +4463,6 @@ static void mem_cgroup_clear_mc(void)
if (mc.moved_charge) {
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
- memcg_oom_recover(mc.from);
}
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
@@ -4486,9 +4487,13 @@ static void mem_cgroup_clear_mc(void)

mc.moved_swap = 0;
}
+ spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
mc.moving_task = NULL;
+ spin_unlock(&mc.lock);
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}

@@ -4517,12 +4522,14 @@ static int mem_cgroup_can_attach(struct
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
mc.precharge = 0;
mc.moved_charge = 0;
mc.moved_swap = 0;
mc.moving_task = current;
+ spin_unlock(&mc.lock);

ret = mem_cgroup_precharge_mc(mm);
if (ret)