This patch cleans up and fixes memcg's uncharge soft limit path.
It is also a preparation for batched uncharge. Once batched uncharge is
introduced, the number of res_counter_uncharge() calls will drop and the
event counter will no longer work correctly; this patch fixes that in advance.
Current behavior:
res_counter_charge()/uncharge() handle soft limit information at
charge/uncharge time. The soft limit check itself is done when the per-memcg
event counter goes over its threshold, but that event counter is updated only
when res_counter reports that usage is over the soft limit. Moreover, the
ancestors of a memcg are handled in the "charge" path but not in the
"uncharge" path.
Problems:
1. memcg's event counter is incremented only when the soft limit is hit.
That's bad: it makes the event counter hard to reuse for other purposes.
2. At uncharge, only the lowest-level res_counter is handled. This is a bug:
because the ancestors' event counters are never incremented, the children
would have to take care of them.
3. res_counter_uncharge()'s 3rd argument is NULL in most cases.
Operations under res_counter->lock should be kept small;
having no "if" inside res_counter is better.
Fixes:
1. Check memcg's event counter at every charge/uncharge.
(The per-cpu area is not slow; no problem.)
2. Check all ancestors at soft-limit-check time. This is necessary because
an ancestor's event counter may never be modified on its own, so ancestors
must be checked at the same time. This fixes the uncharge path.
3. Removed the soft_limit_xx pointer arguments from the charge and uncharge
calls. The check-only-when-the-event-counter-expires scheme works well
without them (a rough sketch of the idea follows below).
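To illustrate the check-only-when-the-event-counter-expires idea (this is only
a sketch, not part of the patch; mem->events[] and SOFTLIMIT_EVENTS_THRESH are
made-up names, the real check is mem_cgroup_soft_limit_check() in the diff):

	/*
	 * Sketch: every charge/uncharge bumps a cheap per-cpu counter, and
	 * only when it crosses a threshold do we walk the memcg (and its
	 * ancestors) to refresh the softlimit RB-tree.
	 */
	static bool memcg_event_expired(struct mem_cgroup *mem)
	{
		bool ret = false;
		int cpu = get_cpu();

		/* illustrative per-cpu counter, no shared cacheline, no lock */
		if (++mem->events[cpu] > SOFTLIMIT_EVENTS_THRESH) {
			mem->events[cpu] = 0;
			ret = true;
		}
		put_cpu();
		return ret;
	}

	/* in both charge and uncharge: */
	if (memcg_event_expired(mem))
		mem_cgroup_update_tree(mem, page);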
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/res_counter.h | 6 --
kernel/res_counter.c | 18 ------
mm/memcontrol.c | 115 +++++++++++++++++++-------------------------
3 files changed, 55 insertions(+), 84 deletions(-)
Index: mmotm-2.6.31-Aug27/kernel/res_counter.c
===================================================================
--- mmotm-2.6.31-Aug27.orig/kernel/res_counter.c
+++ mmotm-2.6.31-Aug27/kernel/res_counter.c
@@ -37,27 +37,17 @@ int res_counter_charge_locked(struct res
}
int res_counter_charge(struct res_counter *counter, unsigned long val,
- struct res_counter **limit_fail_at,
- struct res_counter **soft_limit_fail_at)
+ struct res_counter **limit_fail_at)
{
int ret;
unsigned long flags;
struct res_counter *c, *u;
*limit_fail_at = NULL;
- if (soft_limit_fail_at)
- *soft_limit_fail_at = NULL;
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
ret = res_counter_charge_locked(c, val);
- /*
- * With soft limits, we return the highest ancestor
- * that exceeds its soft limit
- */
- if (soft_limit_fail_at &&
- !res_counter_soft_limit_check_locked(c))
- *soft_limit_fail_at = c;
spin_unlock(&c->lock);
if (ret < 0) {
*limit_fail_at = c;
@@ -85,8 +75,7 @@ void res_counter_uncharge_locked(struct
counter->usage -= val;
}
-void res_counter_uncharge(struct res_counter *counter, unsigned long val,
- bool *was_soft_limit_excess)
+void res_counter_uncharge(struct res_counter *counter, unsigned long val)
{
unsigned long flags;
struct res_counter *c;
@@ -94,9 +83,6 @@ void res_counter_uncharge(struct res_cou
local_irq_save(flags);
for (c = counter; c != NULL; c = c->parent) {
spin_lock(&c->lock);
- if (was_soft_limit_excess)
- *was_soft_limit_excess =
- !res_counter_soft_limit_check_locked(c);
res_counter_uncharge_locked(c, val);
spin_unlock(&c->lock);
}
Index: mmotm-2.6.31-Aug27/include/linux/res_counter.h
===================================================================
--- mmotm-2.6.31-Aug27.orig/include/linux/res_counter.h
+++ mmotm-2.6.31-Aug27/include/linux/res_counter.h
@@ -114,8 +114,7 @@ void res_counter_init(struct res_counter
int __must_check res_counter_charge_locked(struct res_counter *counter,
unsigned long val);
int __must_check res_counter_charge(struct res_counter *counter,
- unsigned long val, struct res_counter **limit_fail_at,
- struct res_counter **soft_limit_at);
+ unsigned long val, struct res_counter **limit_fail_at);
/*
* uncharge - tell that some portion of the resource is released
@@ -128,8 +127,7 @@ int __must_check res_counter_charge(stru
*/
void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val,
- bool *was_soft_limit_excess);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val);
static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
{
Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
===================================================================
--- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
+++ mmotm-2.6.31-Aug27/mm/memcontrol.c
@@ -353,16 +353,6 @@ __mem_cgroup_remove_exceeded(struct mem_
}
static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
- struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
-{
- spin_lock(&mctz->lock);
- __mem_cgroup_insert_exceeded(mem, mz, mctz);
- spin_unlock(&mctz->lock);
-}
-
-static void
mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
struct mem_cgroup_tree_per_zone *mctz)
@@ -392,34 +382,40 @@ static bool mem_cgroup_soft_limit_check(
static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
- unsigned long long prev_usage_in_excess, new_usage_in_excess;
- bool updated_tree = false;
+ unsigned long long new_usage_in_excess;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
-
- mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+ int nid = page_to_nid(page);
+ int zid = page_zonenum(page);
mctz = soft_limit_tree_from_page(page);
/*
- * We do updates in lazy mode, mem's are removed
- * lazily from the per-zone, per-node rb tree
+ * Necessary to update all ancestors when hierarchy is used.
+ * because their event counter is not touched.
*/
- prev_usage_in_excess = mz->usage_in_excess;
-
- new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
- if (prev_usage_in_excess) {
- mem_cgroup_remove_exceeded(mem, mz, mctz);
- updated_tree = true;
- }
- if (!new_usage_in_excess)
- goto done;
- mem_cgroup_insert_exceeded(mem, mz, mctz);
-
-done:
- if (updated_tree) {
- spin_lock(&mctz->lock);
- mz->usage_in_excess = new_usage_in_excess;
- spin_unlock(&mctz->lock);
+ for (; mem; mem = parent_mem_cgroup(mem)) {
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ new_usage_in_excess =
+ res_counter_soft_limit_excess(&mem->res);
+ /*
+ * We have to update the tree if mz is on RB-tree or
+ * mem is over its softlimit.
+ */
+ if (new_usage_in_excess || mz->on_tree) {
+ spin_lock(&mctz->lock);
+ /* if on-tree, remove it */
+ if (mz->on_tree)
+ __mem_cgroup_remove_exceeded(mem, mz, mctz);
+ /*
+ * if over soft limit, insert again. mz->usage_in_excess
+ * will be updated properly.
+ */
+ if (new_usage_in_excess)
+ __mem_cgroup_insert_exceeded(mem, mz, mctz);
+ else
+ mz->usage_in_excess = 0;
+ spin_unlock(&mctz->lock);
+ }
}
}
@@ -1266,9 +1262,9 @@ static int __mem_cgroup_try_charge(struc
gfp_t gfp_mask, struct mem_cgroup **memcg,
bool oom, struct page *page)
{
- struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
+ struct mem_cgroup *mem, *mem_over_limit;
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct res_counter *fail_res, *soft_fail_res = NULL;
+ struct res_counter *fail_res;
if (unlikely(test_thread_flag(TIF_MEMDIE))) {
/* Don't account this! */
@@ -1300,17 +1296,16 @@ static int __mem_cgroup_try_charge(struc
if (mem_cgroup_is_root(mem))
goto done;
- ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
- &soft_fail_res);
+ ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
break;
ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
- &fail_res, NULL);
+ &fail_res);
if (likely(!ret))
break;
/* mem+swap counter fails */
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
memsw);
@@ -1349,16 +1344,11 @@ static int __mem_cgroup_try_charge(struc
}
}
/*
- * Insert just the ancestor, we should trickle down to the correct
- * cgroup for reclaim, since the other nodes will be below their
- * soft limit
- */
- if (soft_fail_res) {
- mem_over_soft_limit =
- mem_cgroup_from_res_counter(soft_fail_res, res);
- if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
- mem_cgroup_update_tree(mem_over_soft_limit, page);
- }
+ * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+ * if they exceeds softlimit.
+ */
+ if (mem_cgroup_soft_limit_check(mem))
+ mem_cgroup_update_tree(mem, page);
done:
return 0;
nomem:
@@ -1433,10 +1423,9 @@ static void __mem_cgroup_commit_charge(s
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE,
- NULL);
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
}
css_put(&mem->css);
return;
@@ -1515,7 +1504,7 @@ static int mem_cgroup_move_account(struc
goto out;
if (!mem_cgroup_is_root(from))
- res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&from->res, PAGE_SIZE);
mem_cgroup_charge_statistics(from, pc, false);
page = pc->page;
@@ -1535,7 +1524,7 @@ static int mem_cgroup_move_account(struc
}
if (do_swap_account && !mem_cgroup_is_root(from))
- res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&from->memsw, PAGE_SIZE);
css_put(&from->css);
css_get(&to->css);
@@ -1606,9 +1595,9 @@ uncharge:
css_put(&parent->css);
/* uncharge if move fails */
if (!mem_cgroup_is_root(parent)) {
- res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&parent->res, PAGE_SIZE);
if (do_swap_account)
- res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&parent->memsw, PAGE_SIZE);
}
return ret;
}
@@ -1799,8 +1788,7 @@ __mem_cgroup_commit_charge_swapin(struct
* calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
- NULL);
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
@@ -1827,9 +1815,9 @@ void mem_cgroup_cancel_charge_swapin(str
if (!mem)
return;
if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
if (do_swap_account)
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
}
css_put(&mem->css);
}
@@ -1844,7 +1832,6 @@ __mem_cgroup_uncharge_common(struct page
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
struct mem_cgroup_per_zone *mz;
- bool soft_limit_excess = false;
if (mem_cgroup_disabled())
return NULL;
@@ -1884,10 +1871,10 @@ __mem_cgroup_uncharge_common(struct page
}
if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
if (do_swap_account &&
(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
- res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
}
if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
mem_cgroup_swap_statistics(mem, true);
@@ -1904,7 +1891,7 @@ __mem_cgroup_uncharge_common(struct page
mz = page_cgroup_zoneinfo(pc);
unlock_page_cgroup(pc);
- if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+ if (mem_cgroup_soft_limit_check(mem))
mem_cgroup_update_tree(mem, page);
/* at swapout, this memcg will be accessed to record to swap */
if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
@@ -1982,7 +1969,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
* This memcg can be obsolete one. We avoid calling css_tryget
*/
if (!mem_cgroup_is_root(memcg))
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
mem_cgroup_put(memcg);
}
In the charge path, usage_in_excess is calculated repeatedly, and each
calculation takes res_counter's spin_lock.
This patch removes the unnecessary calls to res_counter_soft_limit_excess().
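For reference, res_counter_soft_limit_excess() has to take the counter's
spin_lock to read usage consistently. Roughly (a sketch of the existing
helper; see kernel/res_counter.c for the real thing):

	unsigned long long res_counter_soft_limit_excess(struct res_counter *cnt)
	{
		unsigned long long excess;
		unsigned long flags;

		spin_lock_irqsave(&cnt->lock, flags);
		if (cnt->usage <= cnt->soft_limit)
			excess = 0;
		else
			excess = cnt->usage - cnt->soft_limit;
		spin_unlock_irqrestore(&cnt->lock, flags);
		return excess;
	}

So computing the excess once in mem_cgroup_update_tree() and handing it to
__mem_cgroup_insert_exceeded() saves one locked read per node visited.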
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 31 +++++++++++++++----------------
1 file changed, 15 insertions(+), 16 deletions(-)
Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
===================================================================
--- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
+++ mmotm-2.6.31-Aug27/mm/memcontrol.c
@@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *p
static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+ struct mem_cgroup_tree_per_zone *mctz,
+ unsigned long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
@@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_
if (mz->on_tree)
return;
- mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+ mz->usage_in_excess = new_usage_in_excess;
+ if (!mz->usage_in_excess)
+ return;
while (*p) {
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -382,7 +385,7 @@ static bool mem_cgroup_soft_limit_check(
static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
- unsigned long long new_usage_in_excess;
+ unsigned long long excess;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
int nid = page_to_nid(page);
@@ -395,25 +398,21 @@ static void mem_cgroup_update_tree(struc
*/
for (; mem; mem = parent_mem_cgroup(mem)) {
mz = mem_cgroup_zoneinfo(mem, nid, zid);
- new_usage_in_excess =
- res_counter_soft_limit_excess(&mem->res);
+ excess = res_counter_soft_limit_excess(&mem->res);
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
*/
- if (new_usage_in_excess || mz->on_tree) {
+ if (excess || mz->on_tree) {
spin_lock(&mctz->lock);
/* if on-tree, remove it */
if (mz->on_tree)
__mem_cgroup_remove_exceeded(mem, mz, mctz);
/*
- * if over soft limit, insert again. mz->usage_in_excess
- * will be updated properly.
+ * Insert again. mz->usage_in_excess will be updated.
+ * If excess is 0, no tree ops.
*/
- if (new_usage_in_excess)
- __mem_cgroup_insert_exceeded(mem, mz, mctz);
- else
- mz->usage_in_excess = 0;
+ __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
spin_unlock(&mctz->lock);
}
}
@@ -2216,6 +2215,7 @@ unsigned long mem_cgroup_soft_limit_recl
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_zone *mctz;
+ unsigned long long excess;
if (order > 0)
return 0;
@@ -2260,9 +2260,8 @@ unsigned long mem_cgroup_soft_limit_recl
__mem_cgroup_largest_soft_limit_node(mctz);
} while (next_mz == mz);
}
- mz->usage_in_excess =
- res_counter_soft_limit_excess(&mz->mem->res);
__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+ excess = res_counter_soft_limit_excess(&mz->mem->res);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
@@ -2271,8 +2270,8 @@ unsigned long mem_cgroup_soft_limit_recl
* memory to reclaim from. Consider this as a longer
* term TODO.
*/
- if (mz->usage_in_excess)
- __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+ /* If excess == 0, no tree ops */
+ __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
spin_unlock(&mctz->lock);
css_put(&mz->mem->css);
loop++;
On Wed, 2 Sep 2009 09:34:38 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> This patch clean up/fixes for memcg's uncharge soft limit path.
> This is also a preparation for batched uncharge. When batched uncharge is
> introduced, calls for res_counter_uncharge() will be reduced and event
> counter will not work correctly. This tries to fix it.
>
> Problems:
> Now, res_counter_charge()/uncharge() handles softlimit information at
> charge/uncharge. Its softlimit-check is done when event counter per memcg
> goes over limit. But event counter per memcg is updated only when
> when res_counter tells it's over soft limit. And ancerstors of memcg are
> handled in "charge" path but not in "uncharge" path.
>
> Prolems:
> 1. memcg's event counter incremented only when softlimit hits. That's bad.
> It makes event counter hard to be reused for other purpose.
> 2. At uncharge, only the lowest level rescounter is handled. This is bug.
> Because ancesotor's event counter is not incremented, children should
> take care of them.
> 3. res_counter_uncharge()'s 3rd argument is NULL in most case.
> ops under res_counter->lock should be small.
> No "if" in res_counter is better.
>
> Fixes:
> 1. make event-counter of memcg checked at every charge/uncharge.
> (per-cpu area is not slow, no problem.)
>
> 2.All ancestors are checked at soft-limit-check. This is necessary because
> ancesotor's event counter may never be modified. Then, they should be
> checked at the same time. This fixes uncharge path.
>
> 2. Removed soft_limit_xx poitner and from charge and uncharge calls.
> Do-check-only-when-event counter expires scheme works well without them.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
Nice cleanups and fixes.
IMHO, the softlimit code becomes easier to understand.
Reviewed-by: Daisuke Nishimura <[email protected]>
Thanks,
Daisuke Nishimura.
> ---
> include/linux/res_counter.h | 6 --
> kernel/res_counter.c | 18 ------
> mm/memcontrol.c | 115 +++++++++++++++++++-------------------------
> 3 files changed, 55 insertions(+), 84 deletions(-)
>
> Index: mmotm-2.6.31-Aug27/kernel/res_counter.c
> ===================================================================
> --- mmotm-2.6.31-Aug27.orig/kernel/res_counter.c
> +++ mmotm-2.6.31-Aug27/kernel/res_counter.c
> @@ -37,27 +37,17 @@ int res_counter_charge_locked(struct res
> }
>
> int res_counter_charge(struct res_counter *counter, unsigned long val,
> - struct res_counter **limit_fail_at,
> - struct res_counter **soft_limit_fail_at)
> + struct res_counter **limit_fail_at)
> {
> int ret;
> unsigned long flags;
> struct res_counter *c, *u;
>
> *limit_fail_at = NULL;
> - if (soft_limit_fail_at)
> - *soft_limit_fail_at = NULL;
> local_irq_save(flags);
> for (c = counter; c != NULL; c = c->parent) {
> spin_lock(&c->lock);
> ret = res_counter_charge_locked(c, val);
> - /*
> - * With soft limits, we return the highest ancestor
> - * that exceeds its soft limit
> - */
> - if (soft_limit_fail_at &&
> - !res_counter_soft_limit_check_locked(c))
> - *soft_limit_fail_at = c;
> spin_unlock(&c->lock);
> if (ret < 0) {
> *limit_fail_at = c;
> @@ -85,8 +75,7 @@ void res_counter_uncharge_locked(struct
> counter->usage -= val;
> }
>
> -void res_counter_uncharge(struct res_counter *counter, unsigned long val,
> - bool *was_soft_limit_excess)
> +void res_counter_uncharge(struct res_counter *counter, unsigned long val)
> {
> unsigned long flags;
> struct res_counter *c;
> @@ -94,9 +83,6 @@ void res_counter_uncharge(struct res_cou
> local_irq_save(flags);
> for (c = counter; c != NULL; c = c->parent) {
> spin_lock(&c->lock);
> - if (was_soft_limit_excess)
> - *was_soft_limit_excess =
> - !res_counter_soft_limit_check_locked(c);
> res_counter_uncharge_locked(c, val);
> spin_unlock(&c->lock);
> }
> Index: mmotm-2.6.31-Aug27/include/linux/res_counter.h
> ===================================================================
> --- mmotm-2.6.31-Aug27.orig/include/linux/res_counter.h
> +++ mmotm-2.6.31-Aug27/include/linux/res_counter.h
> @@ -114,8 +114,7 @@ void res_counter_init(struct res_counter
> int __must_check res_counter_charge_locked(struct res_counter *counter,
> unsigned long val);
> int __must_check res_counter_charge(struct res_counter *counter,
> - unsigned long val, struct res_counter **limit_fail_at,
> - struct res_counter **soft_limit_at);
> + unsigned long val, struct res_counter **limit_fail_at);
>
> /*
> * uncharge - tell that some portion of the resource is released
> @@ -128,8 +127,7 @@ int __must_check res_counter_charge(stru
> */
>
> void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
> -void res_counter_uncharge(struct res_counter *counter, unsigned long val,
> - bool *was_soft_limit_excess);
> +void res_counter_uncharge(struct res_counter *counter, unsigned long val);
>
> static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
> {
> Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Aug27/mm/memcontrol.c
> @@ -353,16 +353,6 @@ __mem_cgroup_remove_exceeded(struct mem_
> }
>
> static void
> -mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
> - struct mem_cgroup_per_zone *mz,
> - struct mem_cgroup_tree_per_zone *mctz)
> -{
> - spin_lock(&mctz->lock);
> - __mem_cgroup_insert_exceeded(mem, mz, mctz);
> - spin_unlock(&mctz->lock);
> -}
> -
> -static void
> mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
> struct mem_cgroup_per_zone *mz,
> struct mem_cgroup_tree_per_zone *mctz)
> @@ -392,34 +382,40 @@ static bool mem_cgroup_soft_limit_check(
>
> static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
> {
> - unsigned long long prev_usage_in_excess, new_usage_in_excess;
> - bool updated_tree = false;
> + unsigned long long new_usage_in_excess;
> struct mem_cgroup_per_zone *mz;
> struct mem_cgroup_tree_per_zone *mctz;
> -
> - mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
> + int nid = page_to_nid(page);
> + int zid = page_zonenum(page);
> mctz = soft_limit_tree_from_page(page);
>
> /*
> - * We do updates in lazy mode, mem's are removed
> - * lazily from the per-zone, per-node rb tree
> + * Necessary to update all ancestors when hierarchy is used.
> + * because their event counter is not touched.
> */
> - prev_usage_in_excess = mz->usage_in_excess;
> -
> - new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
> - if (prev_usage_in_excess) {
> - mem_cgroup_remove_exceeded(mem, mz, mctz);
> - updated_tree = true;
> - }
> - if (!new_usage_in_excess)
> - goto done;
> - mem_cgroup_insert_exceeded(mem, mz, mctz);
> -
> -done:
> - if (updated_tree) {
> - spin_lock(&mctz->lock);
> - mz->usage_in_excess = new_usage_in_excess;
> - spin_unlock(&mctz->lock);
> + for (; mem; mem = parent_mem_cgroup(mem)) {
> + mz = mem_cgroup_zoneinfo(mem, nid, zid);
> + new_usage_in_excess =
> + res_counter_soft_limit_excess(&mem->res);
> + /*
> + * We have to update the tree if mz is on RB-tree or
> + * mem is over its softlimit.
> + */
> + if (new_usage_in_excess || mz->on_tree) {
> + spin_lock(&mctz->lock);
> + /* if on-tree, remove it */
> + if (mz->on_tree)
> + __mem_cgroup_remove_exceeded(mem, mz, mctz);
> + /*
> + * if over soft limit, insert again. mz->usage_in_excess
> + * will be updated properly.
> + */
> + if (new_usage_in_excess)
> + __mem_cgroup_insert_exceeded(mem, mz, mctz);
> + else
> + mz->usage_in_excess = 0;
> + spin_unlock(&mctz->lock);
> + }
> }
> }
>
> @@ -1266,9 +1262,9 @@ static int __mem_cgroup_try_charge(struc
> gfp_t gfp_mask, struct mem_cgroup **memcg,
> bool oom, struct page *page)
> {
> - struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
> + struct mem_cgroup *mem, *mem_over_limit;
> int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
> - struct res_counter *fail_res, *soft_fail_res = NULL;
> + struct res_counter *fail_res;
>
> if (unlikely(test_thread_flag(TIF_MEMDIE))) {
> /* Don't account this! */
> @@ -1300,17 +1296,16 @@ static int __mem_cgroup_try_charge(struc
>
> if (mem_cgroup_is_root(mem))
> goto done;
> - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
> - &soft_fail_res);
> + ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
> if (likely(!ret)) {
> if (!do_swap_account)
> break;
> ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
> - &fail_res, NULL);
> + &fail_res);
> if (likely(!ret))
> break;
> /* mem+swap counter fails */
> - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
> + res_counter_uncharge(&mem->res, PAGE_SIZE);
> flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> mem_over_limit = mem_cgroup_from_res_counter(fail_res,
> memsw);
> @@ -1349,16 +1344,11 @@ static int __mem_cgroup_try_charge(struc
> }
> }
> /*
> - * Insert just the ancestor, we should trickle down to the correct
> - * cgroup for reclaim, since the other nodes will be below their
> - * soft limit
> - */
> - if (soft_fail_res) {
> - mem_over_soft_limit =
> - mem_cgroup_from_res_counter(soft_fail_res, res);
> - if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
> - mem_cgroup_update_tree(mem_over_soft_limit, page);
> - }
> + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> + * if they exceeds softlimit.
> + */
> + if (mem_cgroup_soft_limit_check(mem))
> + mem_cgroup_update_tree(mem, page);
> done:
> return 0;
> nomem:
> @@ -1433,10 +1423,9 @@ static void __mem_cgroup_commit_charge(s
> if (unlikely(PageCgroupUsed(pc))) {
> unlock_page_cgroup(pc);
> if (!mem_cgroup_is_root(mem)) {
> - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
> + res_counter_uncharge(&mem->res, PAGE_SIZE);
> if (do_swap_account)
> - res_counter_uncharge(&mem->memsw, PAGE_SIZE,
> - NULL);
> + res_counter_uncharge(&mem->memsw, PAGE_SIZE);
> }
> css_put(&mem->css);
> return;
> @@ -1515,7 +1504,7 @@ static int mem_cgroup_move_account(struc
> goto out;
>
> if (!mem_cgroup_is_root(from))
> - res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
> + res_counter_uncharge(&from->res, PAGE_SIZE);
> mem_cgroup_charge_statistics(from, pc, false);
>
> page = pc->page;
> @@ -1535,7 +1524,7 @@ static int mem_cgroup_move_account(struc
> }
>
> if (do_swap_account && !mem_cgroup_is_root(from))
> - res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
> + res_counter_uncharge(&from->memsw, PAGE_SIZE);
> css_put(&from->css);
>
> css_get(&to->css);
> @@ -1606,9 +1595,9 @@ uncharge:
> css_put(&parent->css);
> /* uncharge if move fails */
> if (!mem_cgroup_is_root(parent)) {
> - res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
> + res_counter_uncharge(&parent->res, PAGE_SIZE);
> if (do_swap_account)
> - res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
> + res_counter_uncharge(&parent->memsw, PAGE_SIZE);
> }
> return ret;
> }
> @@ -1799,8 +1788,7 @@ __mem_cgroup_commit_charge_swapin(struct
> * calling css_tryget
> */
> if (!mem_cgroup_is_root(memcg))
> - res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
> - NULL);
> + res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> mem_cgroup_swap_statistics(memcg, false);
> mem_cgroup_put(memcg);
> }
> @@ -1827,9 +1815,9 @@ void mem_cgroup_cancel_charge_swapin(str
> if (!mem)
> return;
> if (!mem_cgroup_is_root(mem)) {
> - res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
> + res_counter_uncharge(&mem->res, PAGE_SIZE);
> if (do_swap_account)
> - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
> + res_counter_uncharge(&mem->memsw, PAGE_SIZE);
> }
> css_put(&mem->css);
> }
> @@ -1844,7 +1832,6 @@ __mem_cgroup_uncharge_common(struct page
> struct page_cgroup *pc;
> struct mem_cgroup *mem = NULL;
> struct mem_cgroup_per_zone *mz;
> - bool soft_limit_excess = false;
>
> if (mem_cgroup_disabled())
> return NULL;
> @@ -1884,10 +1871,10 @@ __mem_cgroup_uncharge_common(struct page
> }
>
> if (!mem_cgroup_is_root(mem)) {
> - res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
> + res_counter_uncharge(&mem->res, PAGE_SIZE);
> if (do_swap_account &&
> (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
> - res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
> + res_counter_uncharge(&mem->memsw, PAGE_SIZE);
> }
> if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> mem_cgroup_swap_statistics(mem, true);
> @@ -1904,7 +1891,7 @@ __mem_cgroup_uncharge_common(struct page
> mz = page_cgroup_zoneinfo(pc);
> unlock_page_cgroup(pc);
>
> - if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
> + if (mem_cgroup_soft_limit_check(mem))
> mem_cgroup_update_tree(mem, page);
> /* at swapout, this memcg will be accessed to record to swap */
> if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
> @@ -1982,7 +1969,7 @@ void mem_cgroup_uncharge_swap(swp_entry_
> * This memcg can be obsolete one. We avoid calling css_tryget
> */
> if (!mem_cgroup_is_root(memcg))
> - res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
> + res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
> mem_cgroup_swap_statistics(memcg, false);
> mem_cgroup_put(memcg);
> }
>
Based on mmotm + the softlimit clean-up patches I posted today.
I think this version is much easier to read and cleaner than the previous one.
But it's -rc8 time now; I can wait until the end of the next merge window.
Below is lock_stat output taken after a "make -j8" kernel build.
==
lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
&counter->lock: 3074759 3086077 0.94 14372.27 1978508.38 68039606 80763359 0.45 34243.51 37892635.86
--------------
&counter->lock 2970383 [<ffffffff8109921d>] res_counter_charge+0x4d
&counter->lock 112516 [<ffffffff81099175>] res_counter_uncharge+0x35
&counter->lock 3178 [<ffffffff81111e2a>] mem_cgroup_update_tree+0x11a/0x1c0
--------------
&counter->lock 2973905 [<ffffffff8109921d>] res_counter_charge+0x4d
&counter->lock 109581 [<ffffffff81099175>] res_counter_uncharge+0x35
&counter->lock 2591 [<ffffffff81111e2a>] mem_cgroup_update_tree+0x11a/0x1c0
==
After this patch, lock contention in the uncharge path is reduced.
(And I know that if I can also reduce lock contention in the charge path,
this will drop dramatically.)
==
From: KAMEZAWA Hiroyuki <[email protected]>
In a massively parallel environment, res_counter can be a performance bottleneck.
This patch is a trial at reducing that lock contention.
One strong technique to reduce lock contention is to reduce the number of calls
by batching several calls into one.
Considering the charge/uncharge characteristics:
- charge is done one by one via demand paging.
- uncharge is done
  - in chunks at munmap, truncate, exit, execve...
  - one by one via vmscan/paging.
It seems we have a chance to batch uncharges.
This patch is a base patch for batched uncharge. To avoid scattering memcg's
structure, this patch adds memcg batch-uncharge information to the task;
please see the start/end usage in the next patch.
Because it will always be used at exit(), it's not very costly.
(And it gives us much simpler code than using percpu_xxx.)
The degree of coalescing depends on the caller:
- at invalidate/truncate... the pagevec size
- at unmap            ... ZAP_BLOCK_SIZE
The pages themselves are freed in batches of the same size
(a usage sketch follows below).
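The calling convention, condensed here for readability from the mm/truncate.c
hunk below (the real change is in the diff), is just a bracket around one
bounded batch of page frees:

	mem_cgroup_uncharge_batch_start();
	for (i = 0; i < pagevec_count(&pvec); i++) {
		struct page *page = pvec.pages[i];

		/* ... truncate/free the page; its memcg uncharge only
		 *     accumulates into current->memcg_batch ... */
		unlock_page(page);
	}
	pagevec_release(&pvec);
	/* one res_counter_uncharge() per counter for the whole pagevec */
	mem_cgroup_uncharge_batch_end();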
Changelog:
- unified patch for callers.
- added comments.
- made ->do_batch a bool.
- removed css_get() et al.; we don't need it.
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
include/linux/memcontrol.h | 12 ++++++
include/linux/sched.h | 7 +++
mm/memcontrol.c | 90 ++++++++++++++++++++++++++++++++++++++++++---
mm/memory.c | 2 +
mm/truncate.c | 6 +++
5 files changed, 111 insertions(+), 6 deletions(-)
Index: mmotm-2.6.31-Aug27/include/linux/memcontrol.h
===================================================================
--- mmotm-2.6.31-Aug27.orig/include/linux/memcontrol.h
+++ mmotm-2.6.31-Aug27/include/linux/memcontrol.h
@@ -54,6 +54,10 @@ extern void mem_cgroup_rotate_lru_list(s
extern void mem_cgroup_del_lru(struct page *page);
extern void mem_cgroup_move_lists(struct page *page,
enum lru_list from, enum lru_list to);
+
+extern void mem_cgroup_uncharge_batch_start(void);
+extern void mem_cgroup_uncharge_batch_end(void);
+
extern void mem_cgroup_uncharge_page(struct page *page);
extern void mem_cgroup_uncharge_cache_page(struct page *page);
extern int mem_cgroup_shmem_charge_fallback(struct page *page,
@@ -151,6 +155,14 @@ static inline void mem_cgroup_cancel_cha
{
}
+static inline void mem_cgroup_uncharge_batch_start(void)
+{
+}
+
+static inline void mem_cgroup_uncharge_batch_end(void)
+{
+}
+
static inline void mem_cgroup_uncharge_page(struct page *page)
{
}
Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
===================================================================
--- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
+++ mmotm-2.6.31-Aug27/mm/memcontrol.c
@@ -1821,6 +1821,48 @@ void mem_cgroup_cancel_charge_swapin(str
css_put(&mem->css);
}
+static void
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+{
+ struct memcg_batch_info *batch = NULL;
+ bool uncharge_memsw = true;
+ /* If swapout, usage of swap doesn't decrease */
+ if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+ uncharge_memsw = false;
+ /*
+ * do_batch == true when unmapping pages or inode invalidate/truncate.
+ * In those cases, all pages freed continuously can be expected to be in
+ * the same cgroup and we have chance to coalesce uncharges.
+ */
+ if (!current->memcg_batch.do_batch)
+ goto direct_uncharge;
+
+ batch = &current->memcg_batch;
+ /*
+ * Usually, we do css_get() when we remember a memcg pointer.
+ * But in this case, we keep res->usage until end of a series of
+ * uncharges. Then, it's ok to ignore memcg's refcnt.
+ */
+ if (!batch->memcg)
+ batch->memcg = mem;
+ /*
+ * In the typical case, batch->memcg == mem. This means we can
+ * merge a series of uncharges into one res_counter uncharge.
+ * If not, we uncharge the res_counter one by one.
+ */
+ if (batch->memcg != mem)
+ goto direct_uncharge;
+ /* remember freed charge and uncharge it later */
+ batch->pages += PAGE_SIZE;
+ if (uncharge_memsw)
+ batch->memsw += PAGE_SIZE;
+ return;
+direct_uncharge:
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ if (uncharge_memsw)
+ res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+ return;
+}
/*
* uncharge if !page_mapped(page)
@@ -1869,12 +1911,8 @@ __mem_cgroup_uncharge_common(struct page
break;
}
- if (!mem_cgroup_is_root(mem)) {
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- if (do_swap_account &&
- (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
- res_counter_uncharge(&mem->memsw, PAGE_SIZE);
- }
+ if (!mem_cgroup_is_root(mem))
+ __do_uncharge(mem, ctype);
if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
mem_cgroup_swap_statistics(mem, true);
mem_cgroup_charge_statistics(mem, pc, false);
@@ -1920,6 +1958,46 @@ void mem_cgroup_uncharge_cache_page(stru
__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
}
+/*
+ * batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
+ * In those cases, pages are freed continuously and we can expect the pages
+ * to be in the same memcg. Each of these callers itself limits the number
+ * of pages freed at once, so uncharge_start/end() is called at a proper
+ * granularity.
+ */
+
+void mem_cgroup_uncharge_batch_start(void)
+{
+ VM_BUG_ON(current->memcg_batch.do_batch);
+ /* avoid batch if killed by OOM */
+ if (test_thread_flag(TIF_MEMDIE))
+ return;
+ current->memcg_batch.memcg = NULL;
+ current->memcg_batch.pages = 0;
+ current->memcg_batch.memsw = 0;
+ current->memcg_batch.do_batch = true;
+}
+
+void mem_cgroup_uncharge_batch_end(void)
+{
+ struct mem_cgroup *mem;
+
+ if (!current->memcg_batch.do_batch)
+ return;
+
+ current->memcg_batch.do_batch = false;
+
+ mem = current->memcg_batch.memcg;
+ if (!mem)
+ return;
+ /* This "mem" is valid bacause we hide charges behind us. */
+ if (current->memcg_batch.pages)
+ res_counter_uncharge(&mem->res, current->memcg_batch.pages);
+ if (current->memcg_batch.memsw)
+ res_counter_uncharge(&mem->memsw, current->memcg_batch.memsw);
+ /* Not strictly necessary, but forget this pointer anyway */
+ current->memcg_batch.memcg = NULL;
+}
+
#ifdef CONFIG_SWAP
/*
* called after __delete_from_swap_cache() and drop "page" account.
Index: mmotm-2.6.31-Aug27/include/linux/sched.h
===================================================================
--- mmotm-2.6.31-Aug27.orig/include/linux/sched.h
+++ mmotm-2.6.31-Aug27/include/linux/sched.h
@@ -1540,6 +1540,13 @@ struct task_struct {
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
unsigned long stack_start;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
+ struct memcg_batch_info {
+ bool do_batch;
+ struct mem_cgroup *memcg;
+ long pages, memsw;
+ } memcg_batch;
+#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
Index: mmotm-2.6.31-Aug27/mm/memory.c
===================================================================
--- mmotm-2.6.31-Aug27.orig/mm/memory.c
+++ mmotm-2.6.31-Aug27/mm/memory.c
@@ -909,6 +909,7 @@ static unsigned long unmap_page_range(st
details = NULL;
BUG_ON(addr >= end);
+ mem_cgroup_uncharge_batch_start();
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
do {
@@ -921,6 +922,7 @@ static unsigned long unmap_page_range(st
zap_work, details);
} while (pgd++, addr = next, (addr != end && *zap_work > 0));
tlb_end_vma(tlb, vma);
+ mem_cgroup_uncharge_batch_end();
return addr;
}
Index: mmotm-2.6.31-Aug27/mm/truncate.c
===================================================================
--- mmotm-2.6.31-Aug27.orig/mm/truncate.c
+++ mmotm-2.6.31-Aug27/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct a
pagevec_release(&pvec);
break;
}
+ mem_cgroup_uncharge_batch_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct a
unlock_page(page);
}
pagevec_release(&pvec);
+ mem_cgroup_uncharge_batch_end();
}
}
EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(s
pagevec_init(&pvec, 0);
while (next <= end &&
pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ mem_cgroup_uncharge_batch_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(s
break;
}
pagevec_release(&pvec);
+ mem_cgroup_uncharge_batch_end();
cond_resched();
}
return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct
while (next <= end && !wrapped &&
pagevec_lookup(&pvec, mapping, next,
min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+ mem_cgroup_uncharge_batch_start();
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct
unlock_page(page);
}
pagevec_release(&pvec);
+ mem_cgroup_uncharge_batch_end();
cond_resched();
}
return ret;
On Wed, 2 Sep 2009 09:35:51 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> In charge path, usage_in_excess is calculated repeatedly and
> it takes res_counter's spin_lock every time.
>
Hmm, mem_cgroup_update_tree() is called in both the charge and uncharge paths.
So this patch has an effect on both paths, doesn't it?
> This patch removes unnecessary calls for res_count_soft_limit_excess.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> mm/memcontrol.c | 31 +++++++++++++++----------------
> 1 file changed, 15 insertions(+), 16 deletions(-)
>
> Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Aug27/mm/memcontrol.c
> @@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *p
> static void
> __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
> struct mem_cgroup_per_zone *mz,
> - struct mem_cgroup_tree_per_zone *mctz)
> + struct mem_cgroup_tree_per_zone *mctz,
> + unsigned long new_usage_in_excess)
It might be a nitpick, but shouldn't it be unsigned long long?
Otherwise, it looks good to me.
Reviewed-by: Daisuke Nishimura <[email protected]>
Thanks,
Daisuke Nishimura.
> {
> struct rb_node **p = &mctz->rb_root.rb_node;
> struct rb_node *parent = NULL;
> @@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_
> if (mz->on_tree)
> return;
>
> - mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
> + mz->usage_in_excess = new_usage_in_excess;
> + if (!mz->usage_in_excess)
> + return;
> while (*p) {
> parent = *p;
> mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
> @@ -382,7 +385,7 @@ static bool mem_cgroup_soft_limit_check(
>
> static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
> {
> - unsigned long long new_usage_in_excess;
> + unsigned long long excess;
> struct mem_cgroup_per_zone *mz;
> struct mem_cgroup_tree_per_zone *mctz;
> int nid = page_to_nid(page);
> @@ -395,25 +398,21 @@ static void mem_cgroup_update_tree(struc
> */
> for (; mem; mem = parent_mem_cgroup(mem)) {
> mz = mem_cgroup_zoneinfo(mem, nid, zid);
> - new_usage_in_excess =
> - res_counter_soft_limit_excess(&mem->res);
> + excess = res_counter_soft_limit_excess(&mem->res);
> /*
> * We have to update the tree if mz is on RB-tree or
> * mem is over its softlimit.
> */
> - if (new_usage_in_excess || mz->on_tree) {
> + if (excess || mz->on_tree) {
> spin_lock(&mctz->lock);
> /* if on-tree, remove it */
> if (mz->on_tree)
> __mem_cgroup_remove_exceeded(mem, mz, mctz);
> /*
> - * if over soft limit, insert again. mz->usage_in_excess
> - * will be updated properly.
> + * Insert again. mz->usage_in_excess will be updated.
> + * If excess is 0, no tree ops.
> */
> - if (new_usage_in_excess)
> - __mem_cgroup_insert_exceeded(mem, mz, mctz);
> - else
> - mz->usage_in_excess = 0;
> + __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
> spin_unlock(&mctz->lock);
> }
> }
> @@ -2216,6 +2215,7 @@ unsigned long mem_cgroup_soft_limit_recl
> unsigned long reclaimed;
> int loop = 0;
> struct mem_cgroup_tree_per_zone *mctz;
> + unsigned long long excess;
>
> if (order > 0)
> return 0;
> @@ -2260,9 +2260,8 @@ unsigned long mem_cgroup_soft_limit_recl
> __mem_cgroup_largest_soft_limit_node(mctz);
> } while (next_mz == mz);
> }
> - mz->usage_in_excess =
> - res_counter_soft_limit_excess(&mz->mem->res);
> __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
> + excess = res_counter_soft_limit_excess(&mz->mem->res);
> /*
> * One school of thought says that we should not add
> * back the node to the tree if reclaim returns 0.
> @@ -2271,8 +2270,8 @@ unsigned long mem_cgroup_soft_limit_recl
> * memory to reclaim from. Consider this as a longer
> * term TODO.
> */
> - if (mz->usage_in_excess)
> - __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
> + /* If excess == 0, no tree ops */
> + __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
> spin_unlock(&mctz->lock);
> css_put(&mz->mem->css);
> loop++;
>
On Wed, 2 Sep 2009 14:16:39 +0900
Daisuke Nishimura <[email protected]> wrote:
> On Wed, 2 Sep 2009 09:35:51 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > In charge path, usage_in_excess is calculated repeatedly and
> > it takes res_counter's spin_lock every time.
> >
> Hmm, mem_cgroup_update_tree() is called in both charge and uncharge path.
> So, this patch have effect on both path, doesn't it ?
>
> > This patch removes unnecessary calls for res_count_soft_limit_excess.
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > ---
> > mm/memcontrol.c | 31 +++++++++++++++----------------
> > 1 file changed, 15 insertions(+), 16 deletions(-)
> >
> > Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
> > +++ mmotm-2.6.31-Aug27/mm/memcontrol.c
> > @@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *p
> > static void
> > __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
> > struct mem_cgroup_per_zone *mz,
> > - struct mem_cgroup_tree_per_zone *mctz)
> > + struct mem_cgroup_tree_per_zone *mctz,
> > + unsigned long new_usage_in_excess)
> It might be a nitpick, shouldn't it be unsigned long long ?
>
Ouch, yes. I'll post a fixed one today.
Thank you for pointing it out.
-Kame
> Otherwise, it looks good to me.
>
> Reviewed-by: Daisuke Nishimura <[email protected]>
>
> Thanks,
> Daisuke Nishimura.
>
> > {
> > struct rb_node **p = &mctz->rb_root.rb_node;
> > struct rb_node *parent = NULL;
> > @@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_
> > if (mz->on_tree)
> > return;
> >
> > - mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
> > + mz->usage_in_excess = new_usage_in_excess;
> > + if (!mz->usage_in_excess)
> > + return;
> > while (*p) {
> > parent = *p;
> > mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
> > @@ -382,7 +385,7 @@ static bool mem_cgroup_soft_limit_check(
> >
> > static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
> > {
> > - unsigned long long new_usage_in_excess;
> > + unsigned long long excess;
> > struct mem_cgroup_per_zone *mz;
> > struct mem_cgroup_tree_per_zone *mctz;
> > int nid = page_to_nid(page);
> > @@ -395,25 +398,21 @@ static void mem_cgroup_update_tree(struc
> > */
> > for (; mem; mem = parent_mem_cgroup(mem)) {
> > mz = mem_cgroup_zoneinfo(mem, nid, zid);
> > - new_usage_in_excess =
> > - res_counter_soft_limit_excess(&mem->res);
> > + excess = res_counter_soft_limit_excess(&mem->res);
> > /*
> > * We have to update the tree if mz is on RB-tree or
> > * mem is over its softlimit.
> > */
> > - if (new_usage_in_excess || mz->on_tree) {
> > + if (excess || mz->on_tree) {
> > spin_lock(&mctz->lock);
> > /* if on-tree, remove it */
> > if (mz->on_tree)
> > __mem_cgroup_remove_exceeded(mem, mz, mctz);
> > /*
> > - * if over soft limit, insert again. mz->usage_in_excess
> > - * will be updated properly.
> > + * Insert again. mz->usage_in_excess will be updated.
> > + * If excess is 0, no tree ops.
> > */
> > - if (new_usage_in_excess)
> > - __mem_cgroup_insert_exceeded(mem, mz, mctz);
> > - else
> > - mz->usage_in_excess = 0;
> > + __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
> > spin_unlock(&mctz->lock);
> > }
> > }
> > @@ -2216,6 +2215,7 @@ unsigned long mem_cgroup_soft_limit_recl
> > unsigned long reclaimed;
> > int loop = 0;
> > struct mem_cgroup_tree_per_zone *mctz;
> > + unsigned long long excess;
> >
> > if (order > 0)
> > return 0;
> > @@ -2260,9 +2260,8 @@ unsigned long mem_cgroup_soft_limit_recl
> > __mem_cgroup_largest_soft_limit_node(mctz);
> > } while (next_mz == mz);
> > }
> > - mz->usage_in_excess =
> > - res_counter_soft_limit_excess(&mz->mem->res);
> > __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
> > + excess = res_counter_soft_limit_excess(&mz->mem->res);
> > /*
> > * One school of thought says that we should not add
> > * back the node to the tree if reclaim returns 0.
> > @@ -2271,8 +2270,8 @@ unsigned long mem_cgroup_soft_limit_recl
> > * memory to reclaim from. Consider this as a longer
> > * term TODO.
> > */
> > - if (mz->usage_in_excess)
> > - __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
> > + /* If excess == 0, no tree ops */
> > + __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
> > spin_unlock(&mctz->lock);
> > css_put(&mz->mem->css);
> > loop++;
> >
>
In the charge/uncharge/reclaim paths, usage_in_excess is calculated repeatedly,
and each calculation takes res_counter's spin_lock.
This patch removes the unnecessary calls to res_counter_soft_limit_excess().
Changelog:
- fixed description.
- fixed unsigned long to be unsigned long long (Thanks, Nishimura)
Reviewed-by: Daisuke Nishimura <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 31 +++++++++++++++----------------
1 file changed, 15 insertions(+), 16 deletions(-)
Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
===================================================================
--- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
+++ mmotm-2.6.31-Aug27/mm/memcontrol.c
@@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *p
static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
struct mem_cgroup_per_zone *mz,
- struct mem_cgroup_tree_per_zone *mctz)
+ struct mem_cgroup_tree_per_zone *mctz,
+ unsigned long long new_usage_in_excess)
{
struct rb_node **p = &mctz->rb_root.rb_node;
struct rb_node *parent = NULL;
@@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_
if (mz->on_tree)
return;
- mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+ mz->usage_in_excess = new_usage_in_excess;
+ if (!mz->usage_in_excess)
+ return;
while (*p) {
parent = *p;
mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -382,7 +385,7 @@ static bool mem_cgroup_soft_limit_check(
static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
{
- unsigned long long new_usage_in_excess;
+ unsigned long long excess;
struct mem_cgroup_per_zone *mz;
struct mem_cgroup_tree_per_zone *mctz;
int nid = page_to_nid(page);
@@ -395,25 +398,21 @@ static void mem_cgroup_update_tree(struc
*/
for (; mem; mem = parent_mem_cgroup(mem)) {
mz = mem_cgroup_zoneinfo(mem, nid, zid);
- new_usage_in_excess =
- res_counter_soft_limit_excess(&mem->res);
+ excess = res_counter_soft_limit_excess(&mem->res);
/*
* We have to update the tree if mz is on RB-tree or
* mem is over its softlimit.
*/
- if (new_usage_in_excess || mz->on_tree) {
+ if (excess || mz->on_tree) {
spin_lock(&mctz->lock);
/* if on-tree, remove it */
if (mz->on_tree)
__mem_cgroup_remove_exceeded(mem, mz, mctz);
/*
- * if over soft limit, insert again. mz->usage_in_excess
- * will be updated properly.
+ * Insert again. mz->usage_in_excess will be updated.
+ * If excess is 0, no tree ops.
*/
- if (new_usage_in_excess)
- __mem_cgroup_insert_exceeded(mem, mz, mctz);
- else
- mz->usage_in_excess = 0;
+ __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
spin_unlock(&mctz->lock);
}
}
@@ -2216,6 +2215,7 @@ unsigned long mem_cgroup_soft_limit_recl
unsigned long reclaimed;
int loop = 0;
struct mem_cgroup_tree_per_zone *mctz;
+ unsigned long long excess;
if (order > 0)
return 0;
@@ -2260,9 +2260,8 @@ unsigned long mem_cgroup_soft_limit_recl
__mem_cgroup_largest_soft_limit_node(mctz);
} while (next_mz == mz);
}
- mz->usage_in_excess =
- res_counter_soft_limit_excess(&mz->mem->res);
__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+ excess = res_counter_soft_limit_excess(&mz->mem->res);
/*
* One school of thought says that we should not add
* back the node to the tree if reclaim returns 0.
@@ -2271,8 +2270,8 @@ unsigned long mem_cgroup_soft_limit_recl
* memory to reclaim from. Consider this as a longer
* term TODO.
*/
- if (mz->usage_in_excess)
- __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+ /* If excess == 0, no tree ops */
+ __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
spin_unlock(&mctz->lock);
css_put(&mz->mem->css);
loop++;
On Wed, 2 Sep 2009 13:41:14 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> #ifdef CONFIG_SWAP
> /*
> * called after __delete_from_swap_cache() and drop "page" account.
> Index: mmotm-2.6.31-Aug27/include/linux/sched.h
> ===================================================================
> --- mmotm-2.6.31-Aug27.orig/include/linux/sched.h
> +++ mmotm-2.6.31-Aug27/include/linux/sched.h
> @@ -1540,6 +1540,13 @@ struct task_struct {
> unsigned long trace_recursion;
> #endif /* CONFIG_TRACING */
> unsigned long stack_start;
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
> + struct memcg_batch_info {
> + bool do_batch;
> + struct mem_cgroup *memcg;
> + long pages, memsw;
> + } memcg_batch;
> +#endif
I wonder whether I should make this a pointer,
	struct memcg_batch_info *
and allocate it dynamically at attach. Hmm, maybe that would be useful for
other purposes, to track per-task memcg behavior.
Regards,
-Kame
I'm sorry that I'll be absent tomorrow. This is a dump of the current code.
IMHO, this version is simple enough.
My next target is css's refcnt per page. I think we don't really need it...
=
This is code for batched charging using a percpu cache.
At charge, memcg charges 64 pages at once and remembers the surplus in a
percpu cache. Because it is a cache, it is drained/flushed when necessary.
This version uses a public percpu area, not a per-memcg percpu area.
Two benefits of the public percpu area:
1. The sum of stocked charges in the system is limited by the number of cpus,
not by the number of memcgs. This gives better synchronization.
2. The drain code for flush/cpu-hotplug is very easy (and quick).
The most important point of this patch is that we never touch the res_counter
in the fast path. The res_counter is a system-wide shared counter which is
modified very frequently; we should avoid touching it as far as we can, to
avoid false sharing.
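Condensed from the __mem_cgroup_try_charge() hunk below (for readability only;
the real change is in the diff), the fast path becomes:

	if (consume_stock(mem))		/* percpu hit: res_counter untouched */
		goto charged;
	/* percpu miss: charge CHARGE_SIZE (64 pages) to the res_counter once */
	ret = res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res);
	...
	/* keep the surplus on this cpu for the following page faults */
	refill_stock(mem, CHARGE_SIZE - PAGE_SIZE);
charged: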
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 88 insertions(+), 5 deletions(-)
Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
===================================================================
--- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
+++ mmotm-2.6.31-Aug27/mm/memcontrol.c
@@ -275,6 +275,7 @@ enum charge_type {
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+static void drain_all_stock(void);
static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -1136,6 +1137,8 @@ static int mem_cgroup_hierarchical_recla
victim = mem_cgroup_select_victim(root_mem);
if (victim == root_mem) {
loop++;
+ if (loop >= 1)
+ drain_all_stock();
if (loop >= 2) {
/*
* If we have not been able to reclaim
@@ -1253,6 +1256,79 @@ done:
unlock_page_cgroup(pc);
}
+#define CHARGE_SIZE (64 * PAGE_SIZE)
+struct memcg_stock_pcp {
+ struct mem_cgroup *from;
+ int charge;
+};
+DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+
+static bool consume_stock(struct mem_cgroup *mem)
+{
+ struct memcg_stock_pcp *stock;
+ bool ret = true;
+
+ stock = &get_cpu_var(memcg_stock);
+ if (mem == stock->from && stock->charge)
+ stock->charge -= PAGE_SIZE;
+ else
+ ret = false;
+ put_cpu_var(memcg_stock);
+ return ret;
+}
+
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+ struct mem_cgroup *old = stock->from;
+
+ if (stock->charge) {
+ res_counter_uncharge(&old->res, stock->charge);
+ if (do_swap_account)
+ res_counter_uncharge(&old->memsw, stock->charge);
+ }
+ stock->from = NULL;
+}
+
+static void drain_local_stock(struct work_struct *dummy)
+{
+ struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+ drain_stock(stock);
+ put_cpu_var(memcg_stock);
+}
+
+static void refill_stock(struct mem_cgroup *mem, int val)
+{
+ struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+
+ if (stock->from != mem) {
+ drain_stock(stock);
+ stock->from = mem;
+ }
+ stock->charge = val;
+ put_cpu_var(memcg_stock);
+}
+
+static void drain_all_stock(void)
+{
+ schedule_on_each_cpu(drain_local_stock);
+}
+
+static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ int cpu = (unsigned long)hcpu;
+ struct memcg_stock_pcp *stock;
+
+ if (action != CPU_DEAD)
+ return NOTIFY_OK;
+ stock = &per_cpu(memcg_stock, cpu);
+ drain_stock(stock);
+#endif
+ return NOTIFY_OK;
+}
+
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
@@ -1288,23 +1364,25 @@ static int __mem_cgroup_try_charge(struc
return 0;
VM_BUG_ON(css_is_removed(&mem->css));
+ if (mem_cgroup_is_root(mem))
+ goto done;
+ if (consume_stock(mem))
+ goto charged;
while (1) {
int ret = 0;
unsigned long flags = 0;
- if (mem_cgroup_is_root(mem))
- goto done;
- ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+ ret = res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res);
if (likely(!ret)) {
if (!do_swap_account)
break;
- ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
+ ret = res_counter_charge(&mem->memsw, CHARGE_SIZE,
&fail_res);
if (likely(!ret))
break;
/* mem+swap counter fails */
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ res_counter_uncharge(&mem->res, CHARGE_SIZE);
flags |= MEM_CGROUP_RECLAIM_NOSWAP;
mem_over_limit = mem_cgroup_from_res_counter(fail_res,
memsw);
@@ -1342,6 +1420,9 @@ static int __mem_cgroup_try_charge(struc
goto nomem;
}
}
+ refill_stock(mem, CHARGE_SIZE - PAGE_SIZE);
+
+charged:
/*
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
@@ -2448,6 +2529,7 @@ move_account:
goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
+ drain_all_stock();
ret = 0;
for_each_node_state(node, N_HIGH_MEMORY) {
for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -3166,6 +3248,7 @@ mem_cgroup_create(struct cgroup_subsys *
root_mem_cgroup = mem;
if (mem_cgroup_soft_limit_tree_init())
goto free_out;
+ hotcpu_notifier(memcg_stock_cpu_callback, 0);
} else {
parent = mem_cgroup_from_cont(cont->parent);
On Wed, Sep 2, 2009 at 2:59 PM, KAMEZAWA Hiroyuki <[email protected]> wrote:
> I'm sorry that I'll be absent tomorrow. This is dump of current code.
> IMHO, this version is enough simple.
>
> My next target is css's refcnt per page. I think we never need it...
Is this against 27th August 2009 mmotm?
Balbir Singh
On Wed, Sep 2, 2009 at 11:26 AM, KAMEZAWA Hiroyuki <[email protected]> wrote:
> In charge/uncharge/reclaim path, usage_in_excess is calculated repeatedly and
> it takes res_counter's spin_lock every time.
>
I think the changelog needs to mention some of the refactoring you've done
below as well, like the rename of new_usage_in_excess to excess.
> This patch removes unnecessary calls for res_count_soft_limit_excess.
>
> Changelog:
>  - fixed description.
>  - fixed unsigned long to be unsigned long long (Thanks, Nishimura)
>
> Reviewed-by: Daisuke Nishimura <[email protected]>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
>  mm/memcontrol.c |   31 +++++++++++++++----------------
>  1 file changed, 15 insertions(+), 16 deletions(-)
>
> Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Aug27/mm/memcontrol.c
> @@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *p
>  static void
>  __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
>                                       struct mem_cgroup_per_zone *mz,
> -                                     struct mem_cgroup_tree_per_zone *mctz)
> +                                     struct mem_cgroup_tree_per_zone *mctz,
> +                                     unsigned long long new_usage_in_excess)
>  {
>        struct rb_node **p = &mctz->rb_root.rb_node;
>        struct rb_node *parent = NULL;
> @@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_
>        if (mz->on_tree)
>                return;
>
> -       mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
> +       mz->usage_in_excess = new_usage_in_excess;
> +       if (!mz->usage_in_excess)
> +               return;
>        while (*p) {
>                parent = *p;
>                mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
> @@ -382,7 +385,7 @@ static bool mem_cgroup_soft_limit_check(
>
>  static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
>  {
> -       unsigned long long new_usage_in_excess;
> +       unsigned long long excess;
>        struct mem_cgroup_per_zone *mz;
>        struct mem_cgroup_tree_per_zone *mctz;
>        int nid = page_to_nid(page);
> @@ -395,25 +398,21 @@ static void mem_cgroup_update_tree(struc
>         */
>        for (; mem; mem = parent_mem_cgroup(mem)) {
>                mz = mem_cgroup_zoneinfo(mem, nid, zid);
> -               new_usage_in_excess =
> -                       res_counter_soft_limit_excess(&mem->res);
> +               excess = res_counter_soft_limit_excess(&mem->res);
>                /*
>                 * We have to update the tree if mz is on RB-tree or
>                 * mem is over its softlimit.
>                 */
> -               if (new_usage_in_excess || mz->on_tree) {
> +               if (excess || mz->on_tree) {
>                        spin_lock(&mctz->lock);
>                        /* if on-tree, remove it */
>                        if (mz->on_tree)
>                                __mem_cgroup_remove_exceeded(mem, mz, mctz);
>                        /*
> -                        * if over soft limit, insert again. mz->usage_in_excess
> -                        * will be updated properly.
> +                        * Insert again. mz->usage_in_excess will be updated.
> +                        * If excess is 0, no tree ops.
>                         */
> -                       if (new_usage_in_excess)
> -                               __mem_cgroup_insert_exceeded(mem, mz, mctz);
> -                       else
> -                               mz->usage_in_excess = 0;
> +                       __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
>                        spin_unlock(&mctz->lock);
>                }
>        }
> @@ -2216,6 +2215,7 @@ unsigned long mem_cgroup_soft_limit_recl
>        unsigned long reclaimed;
>        int loop = 0;
>        struct mem_cgroup_tree_per_zone *mctz;
> +       unsigned long long excess;
>
>        if (order > 0)
>                return 0;
> @@ -2260,9 +2260,8 @@ unsigned long mem_cgroup_soft_limit_recl
>                                __mem_cgroup_largest_soft_limit_node(mctz);
>                        } while (next_mz == mz);
>                }
> -               mz->usage_in_excess =
> -                       res_counter_soft_limit_excess(&mz->mem->res);
>                __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
> +               excess = res_counter_soft_limit_excess(&mz->mem->res);
>                /*
>                 * One school of thought says that we should not add
>                 * back the node to the tree if reclaim returns 0.
> @@ -2271,8 +2270,8 @@ unsigned long mem_cgroup_soft_limit_recl
>                 * memory to reclaim from. Consider this as a longer
>                 * term TODO.
>                 */
> -               if (mz->usage_in_excess)
> -                       __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
> +               /* If excess == 0, no tree ops */
> +               __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
>                spin_unlock(&mctz->lock);
>                css_put(&mz->mem->css);
>                loop++;
OK.. so every time we call __mem_cgroup_insert_exceeded we save one
res_counter operation.
Looks good
Acked-by: Balbir Singh <[email protected]>
Balbir Singh
Balbir Singh wrote:
> On Wed, Sep 2, 2009 at 2:59 PM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
>> I'm sorry that I'll be absent tomorrow. This is dump of current code.
>> IMHO, this version is enough simple.
>>
>> My next target is css's refcnt per page. I think we never need it...
>
> Is this against 27th August 2009 mmotm?
>
It's against mmotm-Aug27 plus all the patches I've sent. The res_counter argument
changes will probably make some hunks fail to apply, at least.
Thanks,
-Kame
> Balbir Singh
>
Balbir Singh wrote:
> On Wed, Sep 2, 2009 at 11:26 AM, KAMEZAWA
> Hiroyuki<[email protected]> wrote:
>> In charge/uncharge/reclaim path, usage_in_excess is calculated
>> repeatedly and
>> it takes res_counter's spin_lock every time.
>>
>
> I think the changelog needs to mention some of the refactoring you've done
> below as well, like the rename of new_usage_in_excess to excess.
>
Will do when I send out v3. (I'll have to do it anyway.)
Bye,
-Kame
>
>
>> This patch removes unnecessary calls for res_count_soft_limit_excess.
>>
>> Changelog:
>>  - fixed description.
>>  - fixed unsigned long to be unsigned long long (Thanks, Nishimura)
>>
>> Reviewed-by: Daisuke Nishimura <[email protected]>
>> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>> ---
>>  mm/memcontrol.c |   31 +++++++++++++++----------------
>>  1 file changed, 15 insertions(+), 16 deletions(-)
>>
>> Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
>> ===================================================================
>> --- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
>> +++ mmotm-2.6.31-Aug27/mm/memcontrol.c
>> @@ -313,7 +313,8 @@ soft_limit_tree_from_page(struct page *p
>>  static void
>>  __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
>>                                      struct mem_cgroup_per_zone *mz,
>> -                                    struct mem_cgroup_tree_per_zone *mctz)
>> +                                    struct mem_cgroup_tree_per_zone *mctz,
>> +                                    unsigned long long new_usage_in_excess)
>>  {
>>        struct rb_node **p = &mctz->rb_root.rb_node;
>>        struct rb_node *parent = NULL;
>> @@ -322,7 +323,9 @@ __mem_cgroup_insert_exceeded(struct mem_
>>        if (mz->on_tree)
>>                return;
>>
>> -       mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
>> +       mz->usage_in_excess = new_usage_in_excess;
>> +       if (!mz->usage_in_excess)
>> +               return;
>>        while (*p) {
>>                parent = *p;
>>                mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
>> @@ -382,7 +385,7 @@ static bool mem_cgroup_soft_limit_check(
>>
>>  static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
>>  {
>> -       unsigned long long new_usage_in_excess;
>> +       unsigned long long excess;
>>        struct mem_cgroup_per_zone *mz;
>>        struct mem_cgroup_tree_per_zone *mctz;
>>        int nid = page_to_nid(page);
>> @@ -395,25 +398,21 @@ static void mem_cgroup_update_tree(struc
>>         */
>>        for (; mem; mem = parent_mem_cgroup(mem)) {
>>                mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> -               new_usage_in_excess =
>> -                       res_counter_soft_limit_excess(&mem->res);
>> +               excess = res_counter_soft_limit_excess(&mem->res);
>>                /*
>>                 * We have to update the tree if mz is on RB-tree or
>>                 * mem is over its softlimit.
>>                 */
>> -               if (new_usage_in_excess || mz->on_tree) {
>> +               if (excess || mz->on_tree) {
>>                        spin_lock(&mctz->lock);
>>                        /* if on-tree, remove it */
>>                        if (mz->on_tree)
>>                                __mem_cgroup_remove_exceeded(mem, mz, mctz);
>>                        /*
>> -                        * if over soft limit, insert again. mz->usage_in_excess
>> -                        * will be updated properly.
>> +                        * Insert again. mz->usage_in_excess will be updated.
>> +                        * If excess is 0, no tree ops.
>>                         */
>> -                       if (new_usage_in_excess)
>> -                               __mem_cgroup_insert_exceeded(mem, mz, mctz);
>> -                       else
>> -                               mz->usage_in_excess = 0;
>> +                       __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
>>                        spin_unlock(&mctz->lock);
>>                }
>>        }
>> @@ -2216,6 +2215,7 @@ unsigned long mem_cgroup_soft_limit_recl
>>        unsigned long reclaimed;
>>        int loop = 0;
>>        struct mem_cgroup_tree_per_zone *mctz;
>> +       unsigned long long excess;
>>
>>        if (order > 0)
>>                return 0;
>> @@ -2260,9 +2260,8 @@ unsigned long mem_cgroup_soft_limit_recl
>>                                __mem_cgroup_largest_soft_limit_node(mctz);
>>                        } while (next_mz == mz);
>>                }
>> -               mz->usage_in_excess =
>> -                       res_counter_soft_limit_excess(&mz->mem->res);
>>                __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
>> +               excess = res_counter_soft_limit_excess(&mz->mem->res);
>>                /*
>>                 * One school of thought says that we should not add
>>                 * back the node to the tree if reclaim returns 0.
>> @@ -2271,8 +2270,8 @@ unsigned long mem_cgroup_soft_limit_recl
>>                 * memory to reclaim from. Consider this as a longer
>>                 * term TODO.
>>                 */
>> -               if (mz->usage_in_excess)
>> -                       __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
>> +               /* If excess == 0, no tree ops */
>> +               __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
>>                spin_unlock(&mctz->lock);
>>                css_put(&mz->mem->css);
>>                loop++;
>
> OK.. so everytime we call __mem_cgroup_insert_exceeded we save one
> res_counter operation.
>
> Looks good
>
> Acked-by: Balbir Singh <[email protected]>
>
> Balbir Singh
>
On Wed, 2 Sep 2009 18:29:23 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> I'm sorry that I'll be absent tomorrow. This is dump of current code.
> IMHO, this version is enough simple.
>
> My next target is css's refcnt per page. I think we never need it...
>
hmm, maybe.
> =
> This is a code for batched charging using percpu cache.
> At charge, memcg charges 64pages and remember it in percpu cache.
> Because it's cache, drain/flushed if necessary.
>
> This version uses public percpu area , not per-memcg percpu area.
> 2 benefits of public percpu area.
> 1. Sum of stocked charge in the system is limited to # of cpus
> not to the number of memcg. This shows better synchonization.
> 2. drain code for flush/cpuhotplug is very easy (and quick)
>
> The most important point of this patch is that we never touch res_counter
> in fast path. The res_counter is system-wide shared counter which is modified
> very frequently. We shouldn't touch it as far as we can for avoid false sharing.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
It looks basically good. I'll do some tests with all patches applied.
I have some comments inlined.
> ---
> mm/memcontrol.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 88 insertions(+), 5 deletions(-)
>
> Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Aug27/mm/memcontrol.c
> @@ -275,6 +275,7 @@ enum charge_type {
> static void mem_cgroup_get(struct mem_cgroup *mem);
> static void mem_cgroup_put(struct mem_cgroup *mem);
> static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
> +static void drain_all_stock(void);
>
> static struct mem_cgroup_per_zone *
> mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
> @@ -1136,6 +1137,8 @@ static int mem_cgroup_hierarchical_recla
> victim = mem_cgroup_select_victim(root_mem);
> if (victim == root_mem) {
> loop++;
> + if (loop >= 1)
> + drain_all_stock();
> if (loop >= 2) {
> /*
> * If we have not been able to reclaim
> @@ -1253,6 +1256,79 @@ done:
> unlock_page_cgroup(pc);
> }
>
> +#define CHARGE_SIZE (64 * PAGE_SIZE)
> +struct memcg_stock_pcp {
> + struct mem_cgroup *from;
> + int charge;
> +};
> +DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
> +
It might be better to add "static".
> +static bool consume_stock(struct mem_cgroup *mem)
> +{
> + struct memcg_stock_pcp *stock;
> + bool ret = true;
> +
> + stock = &get_cpu_var(memcg_stock);
> + if (mem == stock->from && stock->charge)
> + stock->charge -= PAGE_SIZE;
> + else
> + ret = false;
> + put_cpu_var(memcg_stock);
> + return ret;
> +}
> +
> +static void drain_stock(struct memcg_stock_pcp *stock)
> +{
> + struct mem_cgroup *old = stock->from;
> +
> + if (stock->charge) {
> + res_counter_uncharge(&old->res, stock->charge);
> + if (do_swap_account)
> + res_counter_uncharge(&old->memsw, stock->charge);
> + }
> + stock->from = NULL;
We must clear stock->charge too.
> +}
> +
> +static void drain_local_stock(struct work_struct *dummy)
> +{
> + struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
> + drain_stock(stock);
> + put_cpu_var(memcg_stock);
> +}
> +
> +static void refill_stock(struct mem_cgroup *mem, int val)
> +{
> + struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
> +
> + if (stock->from != mem) {
> + drain_stock(stock);
> + stock->from = mem;
> + }
> + stock->charge = val;
> + put_cpu_var(memcg_stock);
> +}
> +
> +static void drain_all_stock(void)
> +{
> + schedule_on_each_cpu(drain_local_stock);
> +}
> +
> +static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
> + unsigned long action,
> + void *hcpu)
> +{
> +#ifdef CONFIG_HOTPLUG_CPU
> + int cpu = (unsigned long)*hcpu;
'*' isn't needed.
> + struct memcg_stock_pcp *stock;
> +
> + if (action != CPU_DEAD)
> + return NOTIFY_OK;
> + stock = per_cpu(memcg_stock, cpu);
'&' is needed.
> + drain_stock(stock);
> +#endif
> + return NOTIFY_OK;
> +}
> +
> /*
> * Unlike exported interface, "oom" parameter is added. if oom==true,
> * oom-killer can be invoked.
> @@ -1288,23 +1364,25 @@ static int __mem_cgroup_try_charge(struc
> return 0;
>
> VM_BUG_ON(css_is_removed(&mem->css));
> + if (mem_cgroup_is_root(mem))
> + goto done;
> + if (consume_stock(mem))
> + goto charged;
>
> while (1) {
> int ret = 0;
> unsigned long flags = 0;
>
> - if (mem_cgroup_is_root(mem))
> - goto done;
> - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
> + ret = res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res);
> if (likely(!ret)) {
> if (!do_swap_account)
> break;
> - ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
> + ret = res_counter_charge(&mem->memsw, CHARGE_SIZE,
> &fail_res);
> if (likely(!ret))
> break;
> /* mem+swap counter fails */
> - res_counter_uncharge(&mem->res, PAGE_SIZE);
> + res_counter_uncharge(&mem->res, CHARGE_SIZE);
> flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> mem_over_limit = mem_cgroup_from_res_counter(fail_res,
> memsw);
> @@ -1342,6 +1420,9 @@ static int __mem_cgroup_try_charge(struc
> goto nomem;
> }
> }
> + refill_stock(mem, CHARGE_SIZE - PAGE_SIZE);
> +
> +charged:
> /*
> * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> * if they exceeds softlimit.
> @@ -2448,6 +2529,7 @@ move_account:
> goto out;
> /* This is for making all *used* pages to be on LRU. */
> lru_add_drain_all();
> + drain_all_stock();
> ret = 0;
> for_each_node_state(node, N_HIGH_MEMORY) {
> for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
> @@ -3166,6 +3248,7 @@ mem_cgroup_create(struct cgroup_subsys *
> root_mem_cgroup = mem;
> if (mem_cgroup_soft_limit_tree_init())
> goto free_out;
> + hotcpu_notifier(memcg_stock_cpu_callback, 0);
>
We should include cpu.h to use hotcpu_notifier().
> } else {
> parent = mem_cgroup_from_cont(cont->parent);
>
Thanks,
Daisuke Nishimura.
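
(Putting the review points above together — clear stock->charge in drain_stock(), cast hcpu without dereferencing it, take the address of the percpu variable with '&', and include <linux/cpu.h> for hotcpu_notifier() — a corrected version of the two helpers might look like the sketch below. This is only an illustrative sketch against the posted code, not the final patch.)

#include <linux/cpu.h>	/* for hotcpu_notifier() */

/* Return the cached charge to the res_counter and reset the stock. */
static void drain_stock(struct memcg_stock_pcp *stock)
{
	struct mem_cgroup *old = stock->from;

	if (stock->charge) {
		res_counter_uncharge(&old->res, stock->charge);
		if (do_swap_account)
			res_counter_uncharge(&old->memsw, stock->charge);
	}
	/* clear the cached charge as well, not only the owner */
	stock->charge = 0;
	stock->from = NULL;
}

static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
					unsigned long action,
					void *hcpu)
{
#ifdef CONFIG_HOTPLUG_CPU
	/* hcpu carries the cpu number itself, so no dereference */
	int cpu = (unsigned long)hcpu;
	struct memcg_stock_pcp *stock;

	if (action != CPU_DEAD)
		return NOTIFY_OK;
	/* per_cpu() yields the per-cpu variable; take its address */
	stock = &per_cpu(memcg_stock, cpu);
	drain_stock(stock);
#endif
	return NOTIFY_OK;
}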
On Thu, 3 Sep 2009 14:17:27 +0900
Daisuke Nishimura <[email protected]> wrote:
> > =
> > This is a code for batched charging using percpu cache.
> > At charge, memcg charges 64pages and remember it in percpu cache.
> > Because it's cache, drain/flushed if necessary.
> >
> > This version uses public percpu area , not per-memcg percpu area.
> > 2 benefits of public percpu area.
> > 1. Sum of stocked charge in the system is limited to # of cpus
> > not to the number of memcg. This shows better synchonization.
> > 2. drain code for flush/cpuhotplug is very easy (and quick)
> >
> > The most important point of this patch is that we never touch res_counter
> > in fast path. The res_counter is system-wide shared counter which is modified
> > very frequently. We shouldn't touch it as far as we can for avoid false sharing.
> >
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> It looks basically good. I'll do some tests with all patches applied.
>
thanks.
> I have some comments inlined.
> > ---
> > mm/memcontrol.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
> > 1 file changed, 88 insertions(+), 5 deletions(-)
> >
> > Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
> > ===================================================================
> > --- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
> > +++ mmotm-2.6.31-Aug27/mm/memcontrol.c
> > @@ -275,6 +275,7 @@ enum charge_type {
> > static void mem_cgroup_get(struct mem_cgroup *mem);
> > static void mem_cgroup_put(struct mem_cgroup *mem);
> > static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
> > +static void drain_all_stock(void);
> >
> > static struct mem_cgroup_per_zone *
> > mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
> > @@ -1136,6 +1137,8 @@ static int mem_cgroup_hierarchical_recla
> > victim = mem_cgroup_select_victim(root_mem);
> > if (victim == root_mem) {
> > loop++;
> > + if (loop >= 1)
> > + drain_all_stock();
> > if (loop >= 2) {
> > /*
> > * If we have not been able to reclaim
> > @@ -1253,6 +1256,79 @@ done:
> > unlock_page_cgroup(pc);
> > }
> >
> > +#define CHARGE_SIZE (64 * PAGE_SIZE)
> > +struct memcg_stock_pcp {
> > + struct mem_cgroup *from;
> > + int charge;
> > +};
> > +DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
> > +
> It might be better to add "static".
>
ok.
> > +static bool consume_stock(struct mem_cgroup *mem)
> > +{
> > + struct memcg_stock_pcp *stock;
> > + bool ret = true;
> > +
> > + stock = &get_cpu_var(memcg_stock);
> > + if (mem == stock->from && stock->charge)
> > + stock->charge -= PAGE_SIZE;
> > + else
> > + ret = false;
> > + put_cpu_var(memcg_stock);
> > + return ret;
> > +}
> > +
> > +static void drain_stock(struct memcg_stock_pcp *stock)
> > +{
> > + struct mem_cgroup *old = stock->from;
> > +
> > + if (stock->charge) {
> > + res_counter_uncharge(&old->res, stock->charge);
> > + if (do_swap_account)
> > + res_counter_uncharge(&old->memsw, stock->charge);
> > + }
> > + stock->from = NULL;
> We must clear stock->charge too.
>
ok.
> > +}
> > +
> > +static void drain_local_stock(struct work_struct *dummy)
> > +{
> > + struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
> > + drain_stock(stock);
> > + put_cpu_var(memcg_stock);
> > +}
> > +
> > +static void refill_stock(struct mem_cgroup *mem, int val)
> > +{
> > + struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
> > +
> > + if (stock->from != mem) {
> > + drain_stock(stock);
> > + stock->from = mem;
> > + }
> > + stock->charge = val;
> > + put_cpu_var(memcg_stock);
> > +}
> > +
> > +static void drain_all_stock(void)
> > +{
> > + schedule_on_each_cpu(drain_local_stock);
> > +}
> > +
> > +static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
> > + unsigned long action,
> > + void *hcpu)
> > +{
> > +#ifdef CONFIG_HOTPLUG_CPU
> > + int cpu = (unsigned long)*hcpu;
> '*' isn't needed.
>
Hmm, ouch.
> > + struct memcg_stock_pcp *stock;
> > +
> > + if (action != CPU_DEAD)
> > + return NOTIFY_OK;
> > + stock = per_cpu(memcg_stock, cpu);
> '&' is needed.
>
yes..
> > + drain_stock(stock);
> > +#endif
> > + return NOTIFY_OK;
> > +}
> > +
> > /*
> > * Unlike exported interface, "oom" parameter is added. if oom==true,
> > * oom-killer can be invoked.
> > @@ -1288,23 +1364,25 @@ static int __mem_cgroup_try_charge(struc
> > return 0;
> >
> > VM_BUG_ON(css_is_removed(&mem->css));
> > + if (mem_cgroup_is_root(mem))
> > + goto done;
> > + if (consume_stock(mem))
> > + goto charged;
> >
> > while (1) {
> > int ret = 0;
> > unsigned long flags = 0;
> >
> > - if (mem_cgroup_is_root(mem))
> > - goto done;
> > - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
> > + ret = res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res);
> > if (likely(!ret)) {
> > if (!do_swap_account)
> > break;
> > - ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
> > + ret = res_counter_charge(&mem->memsw, CHARGE_SIZE,
> > &fail_res);
> > if (likely(!ret))
> > break;
> > /* mem+swap counter fails */
> > - res_counter_uncharge(&mem->res, PAGE_SIZE);
> > + res_counter_uncharge(&mem->res, CHARGE_SIZE);
> > flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> > mem_over_limit = mem_cgroup_from_res_counter(fail_res,
> > memsw);
> > @@ -1342,6 +1420,9 @@ static int __mem_cgroup_try_charge(struc
> > goto nomem;
> > }
> > }
> > + refill_stock(mem, CHARGE_SIZE - PAGE_SIZE);
> > +
> > +charged:
> > /*
> > * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> > * if they exceeds softlimit.
> > @@ -2448,6 +2529,7 @@ move_account:
> > goto out;
> > /* This is for making all *used* pages to be on LRU. */
> > lru_add_drain_all();
> > + drain_all_stock();
> > ret = 0;
> > for_each_node_state(node, N_HIGH_MEMORY) {
> > for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
> > @@ -3166,6 +3248,7 @@ mem_cgroup_create(struct cgroup_subsys *
> > root_mem_cgroup = mem;
> > if (mem_cgroup_soft_limit_tree_init())
> > goto free_out;
> > + hotcpu_notifier(memcg_stock_cpu_callback, 0);
> >
> We should include cpu.h to use hotcpu_notifier().
>
Ah, I'll check my .config again..
Thanks,
-Kame
> > } else {
> > parent = mem_cgroup_from_cont(cont->parent);
> >
>
>
> Thanks,
> Daisuke Nishimura.
>
A few more comments.
On Fri, 4 Sep 2009 13:18:35 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> On Thu, 3 Sep 2009 14:17:27 +0900
> Daisuke Nishimura <[email protected]> wrote:
> > > =
> > > This is a code for batched charging using percpu cache.
> > > At charge, memcg charges 64pages and remember it in percpu cache.
> > > Because it's cache, drain/flushed if necessary.
> > >
> > > This version uses public percpu area , not per-memcg percpu area.
> > > 2 benefits of public percpu area.
> > > 1. Sum of stocked charge in the system is limited to # of cpus
> > > not to the number of memcg. This shows better synchonization.
> > > 2. drain code for flush/cpuhotplug is very easy (and quick)
> > >
> > > The most important point of this patch is that we never touch res_counter
> > > in fast path. The res_counter is system-wide shared counter which is modified
> > > very frequently. We shouldn't touch it as far as we can for avoid false sharing.
> > >
> > > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > It looks basically good. I'll do some tests with all patches applied.
> >
> thanks.
>
it seems that these patches make rmdir stall again...
This batched charge patch seems not to be the (only) suspect, though.
> > > @@ -1288,23 +1364,25 @@ static int __mem_cgroup_try_charge(struc
> > > return 0;
> > >
> > > VM_BUG_ON(css_is_removed(&mem->css));
> > > + if (mem_cgroup_is_root(mem))
> > > + goto done;
> > > + if (consume_stock(mem))
> > > + goto charged;
> > >
IMHO, it would be better to check consume_stock() every time in the while loop below,
because someone might have already refilled the stock while the current context
sleeps in reclaiming memory.
> > > while (1) {
> > > int ret = 0;
> > > unsigned long flags = 0;
> > >
> > > - if (mem_cgroup_is_root(mem))
> > > - goto done;
> > > - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
> > > + ret = res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res);
> > > if (likely(!ret)) {
> > > if (!do_swap_account)
> > > break;
> > > - ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
> > > + ret = res_counter_charge(&mem->memsw, CHARGE_SIZE,
> > > &fail_res);
> > > if (likely(!ret))
> > > break;
> > > /* mem+swap counter fails */
> > > - res_counter_uncharge(&mem->res, PAGE_SIZE);
> > > + res_counter_uncharge(&mem->res, CHARGE_SIZE);
> > > flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> > > mem_over_limit = mem_cgroup_from_res_counter(fail_res,
> > > memsw);
How about changing the pre-charge size according to the loop count?
IMHO, it would be better to disable pre-charge at least in the nr_retries==0 case,
i.e. when we are about to cause an OOM.
P.S. I will not be so active next week.
Thanks,
Daisuke Nishimura.
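
(A rough sketch of how the two suggestions above could fit into __mem_cgroup_try_charge(): re-check the stock on every iteration of the retry loop, and fall back to a single-page charge once the retry budget is nearly exhausted. The csize local and its placement are assumptions for illustration only; they are not part of the posted patch.)

	int csize = CHARGE_SIZE;	/* assumed: batch size, shrunk under pressure */

	while (1) {
		int ret = 0;
		unsigned long flags = 0;

		/* someone may have refilled the stock while we slept in reclaim */
		if (consume_stock(mem))
			goto charged;
		/* about to invoke the OOM killer: stop pre-charging */
		if (!nr_retries)
			csize = PAGE_SIZE;

		ret = res_counter_charge(&mem->res, csize, &fail_res);
		if (likely(!ret)) {
			if (!do_swap_account)
				break;
			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
			if (likely(!ret))
				break;
			/* mem+swap counter fails */
			res_counter_uncharge(&mem->res, csize);
			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
									memsw);
		}
		/* ... reclaim from mem_over_limit and retry, as in the original loop ... */
	}
	if (csize > PAGE_SIZE)
		refill_stock(mem, csize - PAGE_SIZE);
charged: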
On Fri, 4 Sep 2009 14:11:57 +0900
Daisuke Nishimura <[email protected]> wrote:
> > > > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > > It looks basically good. I'll do some tests with all patches applied.
> > >
> > thanks.
> >
> it seems that these patches make rmdir stall again...
> This batched charge patch seems not to be the (only) suspect, though.
>
Ouch, no problem with the latest mmotm? I think this charge-uncharge-offload
patch set doesn't use css_set()/get()...
Hm, softlimit related parts?
> > > > @@ -1288,23 +1364,25 @@ static int __mem_cgroup_try_charge(struc
> > > > return 0;
> > > >
> > > > VM_BUG_ON(css_is_removed(&mem->css));
> > > > + if (mem_cgroup_is_root(mem))
> > > > + goto done;
> > > > + if (consume_stock(mem))
> > > > + goto charged;
> > > >
> IMHO, it would be better to check consume_stock() every time in the while loop below,
> because someone might have already refilled the stock while the current context
> sleeps in reclaiming memory.
>
Hm, makes sense. I'll add it.
> > > > while (1) {
> > > > int ret = 0;
> > > > unsigned long flags = 0;
> > > >
> > > > - if (mem_cgroup_is_root(mem))
> > > > - goto done;
> > > > - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
> > > > + ret = res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res);
> > > > if (likely(!ret)) {
> > > > if (!do_swap_account)
> > > > break;
> > > > - ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
> > > > + ret = res_counter_charge(&mem->memsw, CHARGE_SIZE,
> > > > &fail_res);
> > > > if (likely(!ret))
> > > > break;
> > > > /* mem+swap counter fails */
> > > > - res_counter_uncharge(&mem->res, PAGE_SIZE);
> > > > + res_counter_uncharge(&mem->res, CHARGE_SIZE);
> > > > flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> > > > mem_over_limit = mem_cgroup_from_res_counter(fail_res,
> > > > memsw);
> How about changing pre-charge size according to the loop count ?
> IMHO, it would be better to disable pre-charge at least in nr_retries==0 case,
> i.e. it is about to causing oom.
Yeah, I wonder if I should do that, but it increases complexity if the coding is bad.
Let me try.
Thanks,
-Kame
>
>
> P.S. I will not be so active next week.
>
> Thanks,
> Daisuke Nishimura.
>
On Fri, 4 Sep 2009 14:21:43 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> On Fri, 4 Sep 2009 14:11:57 +0900
> Daisuke Nishimura <[email protected]> wrote:
>
> > > > > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > > > It looks basically good. I'll do some tests with all patches applied.
> > > >
> > > thanks.
> > >
> > it seems that these patches make rmdir stall again...
> > This batched charge patch seems not to be the (only) suspect, though.
> >
> Ouch, no probelm with the latest mmotm ? I think this charge-uncharge-offload
> patch set doesn't use css_set()/get()...
> Hm, softlimit related parts ?
>
Ah, one more question. What does memory.usage_in_bytes show in that case?
If it is not zero, charge/uncharge coalescing is guilty.
Thanks,
-Kame
>
> > > > > @@ -1288,23 +1364,25 @@ static int __mem_cgroup_try_charge(struc
> > > > > return 0;
> > > > >
> > > > > VM_BUG_ON(css_is_removed(&mem->css));
> > > > > + if (mem_cgroup_is_root(mem))
> > > > > + goto done;
> > > > > + if (consume_stock(mem))
> > > > > + goto charged;
> > > > >
> > IMHO, it would be better to check consume_stock() every time in the while loop below,
> > because someone might have already refilled the stock while the current context
> > sleeps in reclaiming memory.
> >
> Hm, make sense. I'll add it.
>
>
> > > > > while (1) {
> > > > > int ret = 0;
> > > > > unsigned long flags = 0;
> > > > >
> > > > > - if (mem_cgroup_is_root(mem))
> > > > > - goto done;
> > > > > - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
> > > > > + ret = res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res);
> > > > > if (likely(!ret)) {
> > > > > if (!do_swap_account)
> > > > > break;
> > > > > - ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
> > > > > + ret = res_counter_charge(&mem->memsw, CHARGE_SIZE,
> > > > > &fail_res);
> > > > > if (likely(!ret))
> > > > > break;
> > > > > /* mem+swap counter fails */
> > > > > - res_counter_uncharge(&mem->res, PAGE_SIZE);
> > > > > + res_counter_uncharge(&mem->res, CHARGE_SIZE);
> > > > > flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> > > > > mem_over_limit = mem_cgroup_from_res_counter(fail_res,
> > > > > memsw);
> > How about changing pre-charge size according to the loop count ?
> > IMHO, it would be better to disable pre-charge at least in nr_retries==0 case,
> > i.e. it is about to causing oom.
>
> ya, I wonder I should do that. but it increases complexity if in bad conding.
> let me try.
>
> Thanks,
> -Kame
>
> >
> >
> > P.S. I will not be so active next week.
> >
> > Thanks,
> > Daisuke Nishimura.
> >
>
On Fri, 4 Sep 2009 14:11:57 +0900
Daisuke Nishimura <[email protected]> wrote:
> A few more comments.
>
> On Fri, 4 Sep 2009 13:18:35 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > On Thu, 3 Sep 2009 14:17:27 +0900
> > Daisuke Nishimura <[email protected]> wrote:
> > > > =
> > > > This is a code for batched charging using percpu cache.
> > > > At charge, memcg charges 64pages and remember it in percpu cache.
> > > > Because it's cache, drain/flushed if necessary.
> > > >
> > > > This version uses public percpu area , not per-memcg percpu area.
> > > > 2 benefits of public percpu area.
> > > > 1. Sum of stocked charge in the system is limited to # of cpus
> > > > not to the number of memcg. This shows better synchonization.
> > > > 2. drain code for flush/cpuhotplug is very easy (and quick)
> > > >
> > > > The most important point of this patch is that we never touch res_counter
> > > > in fast path. The res_counter is system-wide shared counter which is modified
> > > > very frequently. We shouldn't touch it as far as we can for avoid false sharing.
> > > >
> > > > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > > It looks basically good. I'll do some tests with all patches applied.
> > >
> > thanks.
> >
> it seems that these patches make rmdir stall again...
> This batched charge patch seems not to be the (only) suspect, though.
>
I reproduced it. I suspect the charge/uncharge patch is bad... I'll check it.
(And it may happen very easily ;(
Thanks,
-Kame
> > > > @@ -1288,23 +1364,25 @@ static int __mem_cgroup_try_charge(struc
> > > > return 0;
> > > >
> > > > VM_BUG_ON(css_is_removed(&mem->css));
> > > > + if (mem_cgroup_is_root(mem))
> > > > + goto done;
> > > > + if (consume_stock(mem))
> > > > + goto charged;
> > > >
> IMHO, it would be better to check consume_stock() every time in the while loop below,
> because someone might have already refilled the stock while the current context
> sleeps in reclaiming memory.
>
> > > > while (1) {
> > > > int ret = 0;
> > > > unsigned long flags = 0;
> > > >
> > > > - if (mem_cgroup_is_root(mem))
> > > > - goto done;
> > > > - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
> > > > + ret = res_counter_charge(&mem->res, CHARGE_SIZE, &fail_res);
> > > > if (likely(!ret)) {
> > > > if (!do_swap_account)
> > > > break;
> > > > - ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
> > > > + ret = res_counter_charge(&mem->memsw, CHARGE_SIZE,
> > > > &fail_res);
> > > > if (likely(!ret))
> > > > break;
> > > > /* mem+swap counter fails */
> > > > - res_counter_uncharge(&mem->res, PAGE_SIZE);
> > > > + res_counter_uncharge(&mem->res, CHARGE_SIZE);
> > > > flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> > > > mem_over_limit = mem_cgroup_from_res_counter(fail_res,
> > > > memsw);
> How about changing pre-charge size according to the loop count ?
> IMHO, it would be better to disable pre-charge at least in nr_retries==0 case,
> i.e. it is about to causing oom.
>
>
> P.S. I will not be so active next week.
>
> Thanks,
> Daisuke Nishimura.
On Fri, 4 Sep 2009 14:26:54 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> On Fri, 4 Sep 2009 14:21:43 +0900
> KAMEZAWA Hiroyuki <[email protected]> wrote:
>
> > On Fri, 4 Sep 2009 14:11:57 +0900
> > Daisuke Nishimura <[email protected]> wrote:
> >
> > > > > > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > > > > It looks basically good. I'll do some tests with all patches applied.
> > > > >
> > > > thanks.
> > > >
> > > it seems that these patches make rmdir stall again...
> > > This batched charge patch seems not to be the (only) suspect, though.
> > >
> > Ouch, no probelm with the latest mmotm ? I think this charge-uncharge-offload
> > patch set doesn't use css_set()/get()...
> > Hm, softlimit related parts ?
> >
hmm, these patches (including the softlimit cleanup) seem not to be the culprit.
The current mmotm (I'm using mmotm-2009-08-27-16-51) seems to be broken with regard to memcg's rmdir.
I must admit I've not tested mmotm for several months because I have been working
on stabilizing mainline for a long time...
> Ah, one more question. What memory.usage_in_bytes shows in that case ?
> If not zero, charge/uncharge coalescing is guilty.
>
usage_in_bytes is 0.
I've confirmed with the crash command that the mem_cgroup has extra ref counts.
I'll dig more..
Thanks,
Daisuke Nishimura.
On Fri, 4 Sep 2009 15:40:50 +0900
Daisuke Nishimura <[email protected]> wrote:
> On Fri, 4 Sep 2009 14:26:54 +0900, KAMEZAWA Hiroyuki <[email protected]> wrote:
> > On Fri, 4 Sep 2009 14:21:43 +0900
> > KAMEZAWA Hiroyuki <[email protected]> wrote:
> >
> > > On Fri, 4 Sep 2009 14:11:57 +0900
> > > Daisuke Nishimura <[email protected]> wrote:
> > >
> > > > > > > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > > > > > It looks basically good. I'll do some tests with all patches applied.
> > > > > >
> > > > > thanks.
> > > > >
> > > > it seems that these patches make rmdir stall again...
> > > > This batched charge patch seems not to be the (only) suspect, though.
> > > >
> > > Ouch, no probelm with the latest mmotm ? I think this charge-uncharge-offload
> > > patch set doesn't use css_set()/get()...
> > > Hm, softlimit related parts ?
> > >
> hmm, these patches(including softlimit cleanup) seems not to be guilt.
> Current(I'm using mmotm-2009-08-27-16-51) mmotm seems to be broken about memcg's rmdir.
>
> I must admit I've not tested mmotm for several months because I have been working
> on stabilizing mainline for a long time...
>
> > Ah, one more question. What memory.usage_in_bytes shows in that case ?
> > If not zero, charge/uncharge coalescing is guilty.
> >
> usage_in_bytes is 0.
> I've confirmed by crash command that the mem_cgroup has extra ref counts.
>
> I'll dig more..
>
Ok, then, you and I are seeing different problems...
Hmm, a css refcnt leak. I'll dig, too. I hope we can catch it before it sneaks
into mainline.
Thanks,
-Kame
>
> Thanks,
> Daisuke Nishimura.
>
On Fri, 4 Sep 2009 15:40:50 +0900
Daisuke Nishimura <[email protected]> wrote:
> > Ah, one more question. What memory.usage_in_bytes shows in that case ?
> > If not zero, charge/uncharge coalescing is guilty.
> >
> usage_in_bytes is 0.
> I've confirmed by crash command that the mem_cgroup has extra ref counts.
>
> I'll dig more..
>
BTW, do you use softlimit ? I found this but...Hmm
==
The SoftLimit tree "find next one" loop uses next_mz to remember the
next one to be visited when reclaimed==0.
But the css refcnt handling for next_mz is not sufficient, and it leaks
css->refcnt.
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
mm/memcontrol.c | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
===================================================================
--- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
+++ mmotm-2.6.31-Aug27/mm/memcontrol.c
@@ -2261,6 +2261,8 @@ unsigned long mem_cgroup_soft_limit_recl
if (!reclaimed) {
do {
/*
+ * Loop until we find yet another one.
+ *
* By the time we get the soft_limit lock
* again, someone might have aded the
* group back on the RB tree. Iterate to
@@ -2271,7 +2273,12 @@ unsigned long mem_cgroup_soft_limit_recl
*/
next_mz =
__mem_cgroup_largest_soft_limit_node(mctz);
- } while (next_mz == mz);
+ if (next_mz == mz) {
+ css_put(&next_mz->mem->css);
+ next_mz = NULL;
+ } else /* next_mz == NULL or other memcg */
+ break;
+ } while (1);
}
mz->usage_in_excess =
res_counter_soft_limit_excess(&mz->mem->res);
@@ -2299,6 +2306,8 @@ unsigned long mem_cgroup_soft_limit_recl
loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
break;
} while (!nr_reclaimed);
+ if (next_mz)
+ css_put(&next_mz->mem->css);
return nr_reclaimed;
}
On Fri, 4 Sep 2009 16:37:58 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> ==
> SoftLimit tree 'find next one' loop uses next_mz to remember
> next one to be visited if reclaimd==0.
> But css'refcnt handling for next_mz is not enough and it makes
> css->refcnt leak.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
maybe this patch can be applied after
memory-controller-soft-limit-reclaim-on-contention-v9-fix.patch
Thanks,
-Kame
>
> ---
> mm/memcontrol.c | 11 ++++++++++-
> 1 file changed, 10 insertions(+), 1 deletion(-)
>
> Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Aug27/mm/memcontrol.c
> @@ -2261,6 +2261,8 @@ unsigned long mem_cgroup_soft_limit_recl
> if (!reclaimed) {
> do {
> /*
> + * Loop until we find yet another one.
> + *
> * By the time we get the soft_limit lock
> * again, someone might have aded the
> * group back on the RB tree. Iterate to
> @@ -2271,7 +2273,12 @@ unsigned long mem_cgroup_soft_limit_recl
> */
> next_mz =
> __mem_cgroup_largest_soft_limit_node(mctz);
> - } while (next_mz == mz);
> + if (next_mz == mz) {
> + css_put(&next_mz->mem->css);
> + next_mz = NULL;
> + } else /* next_mz == NULL or other memcg */
> + break;
> + } while (1);
> }
> mz->usage_in_excess =
> res_counter_soft_limit_excess(&mz->mem->res);
> @@ -2299,6 +2306,8 @@ unsigned long mem_cgroup_soft_limit_recl
> loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
> break;
> } while (!nr_reclaimed);
> + if (next_mz)
> + css_put(&next_mz->mem->css);
> return nr_reclaimed;
> }
>
>
On Fri, 4 Sep 2009 16:37:58 +0900
KAMEZAWA Hiroyuki <[email protected]> wrote:
> On Fri, 4 Sep 2009 15:40:50 +0900
> Daisuke Nishimura <[email protected]> wrote:
> > > Ah, one more question. What memory.usage_in_bytes shows in that case ?
> > > If not zero, charge/uncharge coalescing is guilty.
> > >
> > usage_in_bytes is 0.
> > I've confirmed by crash command that the mem_cgroup has extra ref counts.
> >
> > I'll dig more..
> >
> BTW, do you use softlimit ? I found this but...Hmm
>
No.
I'm sorry, I can't access my machine, so I can't test this.
But I think this patch itself is needed and looks good.
Reviewed-by: Daisuke Nishimura <[email protected]>
Thanks,
Daisuke Nishimura.
> ==
> SoftLimit tree 'find next one' loop uses next_mz to remember
> next one to be visited if reclaimd==0.
> But css'refcnt handling for next_mz is not enough and it makes
> css->refcnt leak.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
>
>
> ---
> mm/memcontrol.c | 11 ++++++++++-
> 1 file changed, 10 insertions(+), 1 deletion(-)
>
> Index: mmotm-2.6.31-Aug27/mm/memcontrol.c
> ===================================================================
> --- mmotm-2.6.31-Aug27.orig/mm/memcontrol.c
> +++ mmotm-2.6.31-Aug27/mm/memcontrol.c
> @@ -2261,6 +2261,8 @@ unsigned long mem_cgroup_soft_limit_recl
> if (!reclaimed) {
> do {
> /*
> + * Loop until we find yet another one.
> + *
> * By the time we get the soft_limit lock
> * again, someone might have aded the
> * group back on the RB tree. Iterate to
> @@ -2271,7 +2273,12 @@ unsigned long mem_cgroup_soft_limit_recl
> */
> next_mz =
> __mem_cgroup_largest_soft_limit_node(mctz);
> - } while (next_mz == mz);
> + if (next_mz == mz) {
> + css_put(&next_mz->mem->css);
> + next_mz = NULL;
> + } else /* next_mz == NULL or other memcg */
> + break;
> + } while (1);
> }
> mz->usage_in_excess =
> res_counter_soft_limit_excess(&mz->mem->res);
> @@ -2299,6 +2306,8 @@ unsigned long mem_cgroup_soft_limit_recl
> loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
> break;
> } while (!nr_reclaimed);
> + if (next_mz)
> + css_put(&next_mz->mem->css);
> return nr_reclaimed;
> }
>
>
On Fri, 4 Sep 2009 19:07:26 +0900
Daisuke Nishimura <[email protected]> wrote:
> On Fri, 4 Sep 2009 16:37:58 +0900
> KAMEZAWA Hiroyuki <[email protected]> wrote:
>
> > On Fri, 4 Sep 2009 15:40:50 +0900
> > Daisuke Nishimura <[email protected]> wrote:
> > > > Ah, one more question. What memory.usage_in_bytes shows in that case ?
> > > > If not zero, charge/uncharge coalescing is guilty.
> > > >
> > > usage_in_bytes is 0.
> > > I've confirmed by crash command that the mem_cgroup has extra ref counts.
> > >
> > > I'll dig more..
> > >
> > BTW, do you use softlimit ? I found this but...Hmm
> >
> No.
> I'm sorry I can't access my machine, so can't test this.
>
>
> But I think this patch itself is needed and looks good.
>
I've found the cause of the issue.
Andrew, could you add this one after KAMEZAWA-san's
memory-controller-soft-limit-reclaim-on-contention-v9-fix-softlimit-css-refcnt-handling.patch ?
===
From: Daisuke Nishimura <[email protected]>
refcount of the "victim" should be decremented before exiting the loop.
Signed-off-by: Daisuke Nishimura <[email protected]>
---
mm/memcontrol.c | 8 ++++++--
1 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac51294..011aba6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1133,8 +1133,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
* anything, it might because there are
* no reclaimable pages under this hierarchy
*/
- if (!check_soft || !total)
+ if (!check_soft || !total) {
+ css_put(&victim->css);
break;
+ }
/*
* We want to do more targetted reclaim.
* excess >> 2 is not to excessive so as to
@@ -1142,8 +1144,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
* coming back to reclaim from this cgroup
*/
if (total >= (excess >> 2) ||
- (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
+ (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+ css_put(&victim->css);
break;
+ }
}
}
if (!mem_cgroup_local_usage(&victim->stat)) {
On Mon, 7 Sep 2009 08:04:03 +0900
Daisuke Nishimura <[email protected]> wrote:
> On Fri, 4 Sep 2009 19:07:26 +0900
> Daisuke Nishimura <[email protected]> wrote:
>
> > On Fri, 4 Sep 2009 16:37:58 +0900
> > KAMEZAWA Hiroyuki <[email protected]> wrote:
> >
> > > On Fri, 4 Sep 2009 15:40:50 +0900
> > > Daisuke Nishimura <[email protected]> wrote:
> > > > > Ah, one more question. What memory.usage_in_bytes shows in that case ?
> > > > > If not zero, charge/uncharge coalescing is guilty.
> > > > >
> > > > usage_in_bytes is 0.
> > > > I've confirmed by crash command that the mem_cgroup has extra ref counts.
> > > >
> > > > I'll dig more..
> > > >
> > > BTW, do you use softlimit ? I found this but...Hmm
> > >
> > No.
> > I'm sorry I can't access my machine, so can't test this.
> >
> >
> > But I think this patch itself is needed and looks good.
> >
> I've found the cause of the issue.
>
> Andrew, could you add this one after KAMEZAWA-san's
> memory-controller-soft-limit-reclaim-on-contention-v9-fix-softlimit-css-refcnt-handling.patch ?
>
> ===
> From: Daisuke Nishimura <[email protected]>
>
> refcount of the "victim" should be decremented before exiting the loop.
>
> Signed-off-by: Daisuke Nishimura <[email protected]>
Nice!
Acked-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> mm/memcontrol.c | 8 ++++++--
> 1 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index ac51294..011aba6 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1133,8 +1133,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
> * anything, it might because there are
> * no reclaimable pages under this hierarchy
> */
> - if (!check_soft || !total)
> + if (!check_soft || !total) {
> + css_put(&victim->css);
> break;
> + }
> /*
> * We want to do more targetted reclaim.
> * excess >> 2 is not to excessive so as to
> @@ -1142,8 +1144,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
> * coming back to reclaim from this cgroup
> */
> if (total >= (excess >> 2) ||
> - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
> + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
> + css_put(&victim->css);
> break;
> + }
> }
> }
> if (!mem_cgroup_local_usage(&victim->stat)) {
>
* KAMEZAWA Hiroyuki <[email protected]> [2009-09-07 09:49:12]:
> > From: Daisuke Nishimura <[email protected]>
> >
> > refcount of the "victim" should be decremented before exiting the loop.
> >
> > Signed-off-by: Daisuke Nishimura <[email protected]>
> Nice!
>
> Acked-by: KAMEZAWA Hiroyuki <[email protected]>
>
> > ---
> > mm/memcontrol.c | 8 ++++++--
> > 1 files changed, 6 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index ac51294..011aba6 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1133,8 +1133,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
> > * anything, it might because there are
> > * no reclaimable pages under this hierarchy
> > */
> > - if (!check_soft || !total)
> > + if (!check_soft || !total) {
> > + css_put(&victim->css);
> > break;
> > + }
> > /*
> > * We want to do more targetted reclaim.
> > * excess >> 2 is not to excessive so as to
> > @@ -1142,8 +1144,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
> > * coming back to reclaim from this cgroup
> > */
> > if (total >= (excess >> 2) ||
> > - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
> > + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
> > + css_put(&victim->css);
> > break;
> > + }
> > }
> > }
> > if (!mem_cgroup_local_usage(&victim->stat)) {
Good catch! Sorry for the late response, I've been away.
Acked-by: Balbir Singh <[email protected]>
--
Balbir