2012-02-29 05:25:40

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH 3.3] memcg: fix deadlock by inverting lrucare nesting

We have forgotten the rules of lock nesting: the irq-safe ones must be
taken inside the non-irq-safe ones, otherwise we are open to deadlock:

CPU0 CPU1
---- ----
lock(&(&pc->lock)->rlock);
local_irq_disable();
lock(&(&zone->lru_lock)->rlock);
lock(&(&pc->lock)->rlock);
<Interrupt>
lock(&(&zone->lru_lock)->rlock);

To check a different locking issue, I happened to add a spin_lock to
memcg's bit_spin_lock in lock_page_cgroup(), and lockdep very quickly
complained about __mem_cgroup_commit_charge_lrucare() (on CPU1 above).

So delete __mem_cgroup_commit_charge_lrucare(), passing a bool lrucare
to __mem_cgroup_commit_charge() instead, taking zone->lru_lock under
lock_page_cgroup() in the lrucare case.

The original was using spin_lock_irqsave, but we'd be in more trouble
if it were ever called at interrupt time: unconditional _irq is enough.
And ClearPageLRU before del from lru, SetPageLRU before add to lru: no
strong reason, but that is the ordering used consistently elsewhere.

Signed-off-by: Hugh Dickins <[email protected]>
---
Not needed for -stable: this issue came into 3.3-rc1.

mm/memcontrol.c | 72 +++++++++++++++++++++++-----------------------
1 file changed, 37 insertions(+), 35 deletions(-)

--- 3.3-rc5/mm/memcontrol.c 2012-02-25 13:02:05.165830574 -0800
+++ linux/mm/memcontrol.c 2012-02-27 23:24:54.676160755 -0800
@@ -2408,8 +2408,12 @@ static void __mem_cgroup_commit_charge(s
struct page *page,
unsigned int nr_pages,
struct page_cgroup *pc,
- enum charge_type ctype)
+ enum charge_type ctype,
+ bool lrucare)
{
+ struct zone *uninitialized_var(zone);
+ bool was_on_lru = false;
+
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
@@ -2420,6 +2424,21 @@ static void __mem_cgroup_commit_charge(s
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
*/
+
+ /*
+ * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
+ * may already be on some other mem_cgroup's LRU. Take care of it.
+ */
+ if (lrucare) {
+ zone = page_zone(page);
+ spin_lock_irq(&zone->lru_lock);
+ if (PageLRU(page)) {
+ ClearPageLRU(page);
+ del_page_from_lru_list(zone, page, page_lru(page));
+ was_on_lru = true;
+ }
+ }
+
pc->mem_cgroup = memcg;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2443,9 +2462,18 @@ static void __mem_cgroup_commit_charge(s
break;
}

+ if (lrucare) {
+ if (was_on_lru) {
+ VM_BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ add_page_to_lru_list(zone, page, page_lru(page));
+ }
+ spin_unlock_irq(&zone->lru_lock);
+ }
+
mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
unlock_page_cgroup(pc);
- WARN_ON_ONCE(PageLRU(page));
+
/*
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -2643,7 +2671,7 @@ static int mem_cgroup_charge_common(stru
ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
if (ret == -ENOMEM)
return ret;
- __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
+ __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
return 0;
}

@@ -2663,35 +2691,6 @@ static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
enum charge_type ctype);

-static void
-__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
- enum charge_type ctype)
-{
- struct page_cgroup *pc = lookup_page_cgroup(page);
- struct zone *zone = page_zone(page);
- unsigned long flags;
- bool removed = false;
-
- /*
- * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
- * is already on LRU. It means the page may on some other page_cgroup's
- * LRU. Take care of it.
- */
- spin_lock_irqsave(&zone->lru_lock, flags);
- if (PageLRU(page)) {
- del_page_from_lru_list(zone, page, page_lru(page));
- ClearPageLRU(page);
- removed = true;
- }
- __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
- if (removed) {
- add_page_to_lru_list(zone, page, page_lru(page));
- SetPageLRU(page);
- }
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- return;
-}
-
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
@@ -2769,13 +2768,16 @@ static void
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
enum charge_type ctype)
{
+ struct page_cgroup *pc;
+
if (mem_cgroup_disabled())
return;
if (!memcg)
return;
cgroup_exclude_rmdir(&memcg->css);

- __mem_cgroup_commit_charge_lrucare(page, memcg, ctype);
+ pc = lookup_page_cgroup(page);
+ __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
@@ -3248,7 +3250,7 @@ int mem_cgroup_prepare_migration(struct
ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
else
ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
- __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype);
+ __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
return ret;
}

@@ -3332,7 +3334,7 @@ void mem_cgroup_replace_page_cache(struc
* the newpage may be on LRU(or pagevec for LRU) already. We lock
* LRU while we overwrite pc->mem_cgroup.
*/
- __mem_cgroup_commit_charge_lrucare(newpage, memcg, type);
+ __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
}

#ifdef CONFIG_DEBUG_VM


2012-02-29 05:27:25

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH next] memcg: fix deadlock by avoiding stat lock when anon

Fix deadlock in "memcg: use new logic for page stat accounting".

page_remove_rmap() first calls mem_cgroup_begin_update_page_stat(),
which may take move_lock_mem_cgroup(), unlocked at the end of
page_remove_rmap() by mem_cgroup_end_update_page_stat().

The PageAnon case never needs to mem_cgroup_dec_page_stat(page,
MEMCG_NR_FILE_MAPPED); but it often needs to mem_cgroup_uncharge_page(),
which does lock_page_cgroup(), while holding that move_lock_mem_cgroup().
Whereas mem_cgroup_move_account() calls move_lock_mem_cgroup() while
holding lock_page_cgroup().

Since mem_cgroup_begin and end are unnecessary here for PageAnon,
simply avoid the deadlock and wasted calls in that case.

Signed-off-by: Hugh Dickins <[email protected]>
---

mm/rmap.c | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)

--- 3.3-rc5-next/mm/rmap.c 2012-02-26 23:51:46.506050210 -0800
+++ linux/mm/rmap.c 2012-02-27 20:25:56.423324211 -0800
@@ -1166,10 +1166,12 @@ void page_add_file_rmap(struct page *pag
*/
void page_remove_rmap(struct page *page)
{
+ bool anon = PageAnon(page);
bool locked;
unsigned long flags;

- mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ if (!anon)
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
@@ -1181,7 +1183,7 @@ void page_remove_rmap(struct page *page)
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
*/
- if ((!PageAnon(page) || PageSwapCache(page)) &&
+ if ((!anon || PageSwapCache(page)) &&
page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
/*
@@ -1190,7 +1192,7 @@ void page_remove_rmap(struct page *page)
*/
if (unlikely(PageHuge(page)))
goto out;
- if (PageAnon(page)) {
+ if (anon) {
mem_cgroup_uncharge_page(page);
if (!PageTransHuge(page))
__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1211,7 +1213,8 @@ void page_remove_rmap(struct page *page)
* faster for those pages still in swapcache.
*/
out:
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
+ if (!anon)
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}

/*

2012-02-29 05:29:10

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH next] memcg: remove PCG_FILE_MAPPED fix cosmetic fix

mem_cgroup_move_account() begins with "anon = PageAnon(page)", and
then anon is used thereafter: testing PageAnon(page) in the middle
makes the reader wonder if there's some race to guard against - no.

Signed-off-by: Hugh Dickins <[email protected]>
---

mm/memcontrol.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- 3.3-rc5-next/mm/memcontrol.c 2012-02-27 09:56:59.072001463 -0800
+++ linux/mm/memcontrol.c 2012-02-28 20:45:43.488100423 -0800
@@ -2560,7 +2560,7 @@ static int mem_cgroup_move_account(struc

move_lock_mem_cgroup(from, &flags);

- if (!PageAnon(page) && page_mapped(page)) {
+ if (!anon && page_mapped(page)) {
/* Update mapped_file data for mem_cgroup */
preempt_disable();
__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);

2012-02-29 05:30:48

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH next] memcg: remove PCG_CACHE page_cgroup flag fix

Swapping tmpfs loads show absurd wrapped rss and wrong cache in memcg's
memory.stat statistics: __mem_cgroup_uncharge_common() is failing to
distinguish the anon and tmpfs cases.

Mostly we can decide between them by PageAnon, which is reliable once
it has been set; but there are several callers who need to uncharge a
MEM_CGROUP_CHARGE_TYPE_MAPPED page before it was fully initialized,
so allow that case to override the PageAnon decision.

Signed-off-by: Hugh Dickins <[email protected]>
---

mm/memcontrol.c | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

--- 3.3-rc5-next/mm/memcontrol.c 2012-02-25 10:06:52.496035568 -0800
+++ linux/mm/memcontrol.c 2012-02-26 10:44:32.146365398 -0800
@@ -2944,13 +2944,16 @@ __mem_cgroup_uncharge_common(struct page
if (!PageCgroupUsed(pc))
goto unlock_out;

+ anon = PageAnon(page);
+
switch (ctype) {
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+ anon = true;
+ /* fallthrough */
case MEM_CGROUP_CHARGE_TYPE_DROP:
/* See mem_cgroup_prepare_migration() */
if (page_mapped(page) || PageCgroupMigration(pc))
goto unlock_out;
- anon = true;
break;
case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
if (!PageAnon(page)) { /* Shared memory */
@@ -2958,10 +2961,8 @@ __mem_cgroup_uncharge_common(struct page
goto unlock_out;
} else if (page_mapped(page)) /* Anon */
goto unlock_out;
- anon = true;
break;
default:
- anon = false;
break;
}

2012-02-29 05:41:35

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH 3.3] memcg: fix deadlock by inverting lrucare nesting

On Tue, 28 Feb 2012 21:25:02 -0800 (PST)
Hugh Dickins <[email protected]> wrote:

> We have forgotten the rules of lock nesting: the irq-safe ones must be
> taken inside the non-irq-safe ones, otherwise we are open to deadlock:
>
> CPU0 CPU1
> ---- ----
> lock(&(&pc->lock)->rlock);
> local_irq_disable();
> lock(&(&zone->lru_lock)->rlock);
> lock(&(&pc->lock)->rlock);
> <Interrupt>
> lock(&(&zone->lru_lock)->rlock);
>
> To check a different locking issue, I happened to add a spin_lock to
> memcg's bit_spin_lock in lock_page_cgroup(), and lockdep very quickly
> complained about __mem_cgroup_commit_charge_lrucare() (on CPU1 above).
>
> So delete __mem_cgroup_commit_charge_lrucare(), passing a bool lrucare
> to __mem_cgroup_commit_charge() instead, taking zone->lru_lock under
> lock_page_cgroup() in the lrucare case.
>
> The original was using spin_lock_irqsave, but we'd be in more trouble
> if it were ever called at interrupt time: unconditional _irq is enough.
> And ClearPageLRU before del from lru, SetPageLRU before add to lru: no
> strong reason, but that is the ordering used consistently elsewhere.
>
> Signed-off-by: Hugh Dickins <[email protected]>

Thank you. And I'm sorry for adding new bug :(

This is a fix for "memcg: simplify corner case handling of LRU"
as commit 36b62ad539498d00c2d280a151abad5f7630fa73 in upstream.

Acked-by: KAMEZAWA Hiroyuki <[email protected]>


> ---
> Not needed for -stable: this issue came into 3.3-rc1.
>
> mm/memcontrol.c | 72 +++++++++++++++++++++++-----------------------
> 1 file changed, 37 insertions(+), 35 deletions(-)
>
> --- 3.3-rc5/mm/memcontrol.c 2012-02-25 13:02:05.165830574 -0800
> +++ linux/mm/memcontrol.c 2012-02-27 23:24:54.676160755 -0800
> @@ -2408,8 +2408,12 @@ static void __mem_cgroup_commit_charge(s
> struct page *page,
> unsigned int nr_pages,
> struct page_cgroup *pc,
> - enum charge_type ctype)
> + enum charge_type ctype,
> + bool lrucare)
> {
> + struct zone *uninitialized_var(zone);
> + bool was_on_lru = false;
> +
> lock_page_cgroup(pc);
> if (unlikely(PageCgroupUsed(pc))) {
> unlock_page_cgroup(pc);
> @@ -2420,6 +2424,21 @@ static void __mem_cgroup_commit_charge(s
> * we don't need page_cgroup_lock about tail pages, becase they are not
> * accessed by any other context at this point.
> */
> +
> + /*
> + * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
> + * may already be on some other mem_cgroup's LRU. Take care of it.
> + */
> + if (lrucare) {
> + zone = page_zone(page);
> + spin_lock_irq(&zone->lru_lock);
> + if (PageLRU(page)) {
> + ClearPageLRU(page);
> + del_page_from_lru_list(zone, page, page_lru(page));
> + was_on_lru = true;
> + }
> + }
> +
> pc->mem_cgroup = memcg;
> /*
> * We access a page_cgroup asynchronously without lock_page_cgroup().
> @@ -2443,9 +2462,18 @@ static void __mem_cgroup_commit_charge(s
> break;
> }
>
> + if (lrucare) {
> + if (was_on_lru) {
> + VM_BUG_ON(PageLRU(page));
> + SetPageLRU(page);
> + add_page_to_lru_list(zone, page, page_lru(page));
> + }
> + spin_unlock_irq(&zone->lru_lock);
> + }
> +
> mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
> unlock_page_cgroup(pc);
> - WARN_ON_ONCE(PageLRU(page));
> +
> /*
> * "charge_statistics" updated event counter. Then, check it.
> * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> @@ -2643,7 +2671,7 @@ static int mem_cgroup_charge_common(stru
> ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
> if (ret == -ENOMEM)
> return ret;
> - __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
> + __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false);
> return 0;
> }
>
> @@ -2663,35 +2691,6 @@ static void
> __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
> enum charge_type ctype);
>
> -static void
> -__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
> - enum charge_type ctype)
> -{
> - struct page_cgroup *pc = lookup_page_cgroup(page);
> - struct zone *zone = page_zone(page);
> - unsigned long flags;
> - bool removed = false;
> -
> - /*
> - * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
> - * is already on LRU. It means the page may on some other page_cgroup's
> - * LRU. Take care of it.
> - */
> - spin_lock_irqsave(&zone->lru_lock, flags);
> - if (PageLRU(page)) {
> - del_page_from_lru_list(zone, page, page_lru(page));
> - ClearPageLRU(page);
> - removed = true;
> - }
> - __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
> - if (removed) {
> - add_page_to_lru_list(zone, page, page_lru(page));
> - SetPageLRU(page);
> - }
> - spin_unlock_irqrestore(&zone->lru_lock, flags);
> - return;
> -}
> -
> int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
> gfp_t gfp_mask)
> {
> @@ -2769,13 +2768,16 @@ static void
> __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
> enum charge_type ctype)
> {
> + struct page_cgroup *pc;
> +
> if (mem_cgroup_disabled())
> return;
> if (!memcg)
> return;
> cgroup_exclude_rmdir(&memcg->css);
>
> - __mem_cgroup_commit_charge_lrucare(page, memcg, ctype);
> + pc = lookup_page_cgroup(page);
> + __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
> /*
> * Now swap is on-memory. This means this page may be
> * counted both as mem and swap....double count.
> @@ -3248,7 +3250,7 @@ int mem_cgroup_prepare_migration(struct
> ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
> else
> ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
> - __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype);
> + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false);
> return ret;
> }
>
> @@ -3332,7 +3334,7 @@ void mem_cgroup_replace_page_cache(struc
> * the newpage may be on LRU(or pagevec for LRU) already. We lock
> * LRU while we overwrite pc->mem_cgroup.
> */
> - __mem_cgroup_commit_charge_lrucare(newpage, memcg, type);
> + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true);
> }
>
> #ifdef CONFIG_DEBUG_VM
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>
>

2012-02-29 05:42:10

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH next] memcg: remove PCG_FILE_MAPPED fix cosmetic fix

On Tue, 28 Feb 2012 21:28:40 -0800 (PST)
Hugh Dickins <[email protected]> wrote:

> mem_cgroup_move_account() begins with "anon = PageAnon(page)", and
> then anon is used thereafter: testing PageAnon(page) in the middle
> makes the reader wonder if there's some race to guard against - no.
>
> Signed-off-by: Hugh Dickins <[email protected]>

Thanks.

Acked-by: KAMEZAWA Hiroyuki <[email protected]>

> ---
>
> mm/memcontrol.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> --- 3.3-rc5-next/mm/memcontrol.c 2012-02-27 09:56:59.072001463 -0800
> +++ linux/mm/memcontrol.c 2012-02-28 20:45:43.488100423 -0800
> @@ -2560,7 +2560,7 @@ static int mem_cgroup_move_account(struc
>
> move_lock_mem_cgroup(from, &flags);
>
> - if (!PageAnon(page) && page_mapped(page)) {
> + if (!anon && page_mapped(page)) {
> /* Update mapped_file data for mem_cgroup */
> preempt_disable();
> __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
>

2012-02-29 05:56:00

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH next] memcg: remove PCG_CACHE page_cgroup flag fix

On Tue, 28 Feb 2012 21:30:17 -0800 (PST)
Hugh Dickins <[email protected]> wrote:

> Swapping tmpfs loads show absurd wrapped rss and wrong cache in memcg's
> memory.stat statistics: __mem_cgroup_uncharge_common() is failing to
> distinguish the anon and tmpfs cases.
>
I thought I tested this..maybe my test was wrong, sorry.


> Mostly we can decide between them by PageAnon, which is reliable once
> it has been set; but there are several callers who need to uncharge a
> MEM_CGROUP_CHARGE_TYPE_MAPPED page before it was fully initialized,
> so allow that case to override the PageAnon decision.
>
> Signed-off-by: Hugh Dickins <[email protected]>
> ---

Acked-by: KAMEZAWA Hiroyuki <[email protected]>

It seems I should revisit these and consinder some cleanup...

_OFF TOPIC_
To be honest, I don't like to have anon/rss counter in memory.stat because
we have LRU statistics. It seems enough.. If shmem counter is required,
I think we should have shmem counter rather than anon/rss.


>
> mm/memcontrol.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> --- 3.3-rc5-next/mm/memcontrol.c 2012-02-25 10:06:52.496035568 -0800
> +++ linux/mm/memcontrol.c 2012-02-26 10:44:32.146365398 -0800
> @@ -2944,13 +2944,16 @@ __mem_cgroup_uncharge_common(struct page
> if (!PageCgroupUsed(pc))
> goto unlock_out;
>
> + anon = PageAnon(page);
> +
> switch (ctype) {
> case MEM_CGROUP_CHARGE_TYPE_MAPPED:
> + anon = true;
> + /* fallthrough */
> case MEM_CGROUP_CHARGE_TYPE_DROP:
> /* See mem_cgroup_prepare_migration() */
> if (page_mapped(page) || PageCgroupMigration(pc))
> goto unlock_out;
> - anon = true;
> break;
> case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
> if (!PageAnon(page)) { /* Shared memory */
> @@ -2958,10 +2961,8 @@ __mem_cgroup_uncharge_common(struct page
> goto unlock_out;
> } else if (page_mapped(page)) /* Anon */
> goto unlock_out;
> - anon = true;
> break;
> default:
> - anon = false;
> break;
> }
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2012-02-29 19:00:58

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH 3.3] memcg: fix deadlock by inverting lrucare nesting

On Tue, Feb 28, 2012 at 09:25:02PM -0800, Hugh Dickins wrote:
> We have forgotten the rules of lock nesting: the irq-safe ones must be
> taken inside the non-irq-safe ones, otherwise we are open to deadlock:
>
> CPU0 CPU1
> ---- ----
> lock(&(&pc->lock)->rlock);
> local_irq_disable();
> lock(&(&zone->lru_lock)->rlock);
> lock(&(&pc->lock)->rlock);
> <Interrupt>
> lock(&(&zone->lru_lock)->rlock);
>
> To check a different locking issue, I happened to add a spin_lock to
> memcg's bit_spin_lock in lock_page_cgroup(), and lockdep very quickly
> complained about __mem_cgroup_commit_charge_lrucare() (on CPU1 above).
>
> So delete __mem_cgroup_commit_charge_lrucare(), passing a bool lrucare
> to __mem_cgroup_commit_charge() instead, taking zone->lru_lock under
> lock_page_cgroup() in the lrucare case.
>
> The original was using spin_lock_irqsave, but we'd be in more trouble
> if it were ever called at interrupt time: unconditional _irq is enough.
> And ClearPageLRU before del from lru, SetPageLRU before add to lru: no
> strong reason, but that is the ordering used consistently elsewhere.
>
> Signed-off-by: Hugh Dickins <[email protected]>

Acked-by: Johannes Weiner <[email protected]>

2012-02-29 19:35:31

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH next] memcg: fix deadlock by avoiding stat lock when anon

On Tue, Feb 28, 2012 at 09:26:56PM -0800, Hugh Dickins wrote:
> Fix deadlock in "memcg: use new logic for page stat accounting".
>
> page_remove_rmap() first calls mem_cgroup_begin_update_page_stat(),
> which may take move_lock_mem_cgroup(), unlocked at the end of
> page_remove_rmap() by mem_cgroup_end_update_page_stat().
>
> The PageAnon case never needs to mem_cgroup_dec_page_stat(page,
> MEMCG_NR_FILE_MAPPED); but it often needs to mem_cgroup_uncharge_page(),
> which does lock_page_cgroup(), while holding that move_lock_mem_cgroup().
> Whereas mem_cgroup_move_account() calls move_lock_mem_cgroup() while
> holding lock_page_cgroup().
>
> Since mem_cgroup_begin and end are unnecessary here for PageAnon,
> simply avoid the deadlock and wasted calls in that case.
>
> Signed-off-by: Hugh Dickins <[email protected]>

Eek.

Saving the begin/end_update_page_stat() calls for the anon case where
we know in advance we don't need them is one thing, but this also
hides a dependencies that even eludes lockdep behind what looks like a
minor optimization of the anon case.

Wouldn't this be more robust if we turned the ordering inside out in
move_account instead?

> ---
>
> mm/rmap.c | 11 +++++++----
> 1 file changed, 7 insertions(+), 4 deletions(-)
>
> --- 3.3-rc5-next/mm/rmap.c 2012-02-26 23:51:46.506050210 -0800
> +++ linux/mm/rmap.c 2012-02-27 20:25:56.423324211 -0800
> @@ -1166,10 +1166,12 @@ void page_add_file_rmap(struct page *pag
> */
> void page_remove_rmap(struct page *page)
> {
> + bool anon = PageAnon(page);
> bool locked;
> unsigned long flags;
>
> - mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> + if (!anon)
> + mem_cgroup_begin_update_page_stat(page, &locked, &flags);
> /* page still mapped by someone else? */
> if (!atomic_add_negative(-1, &page->_mapcount))
> goto out;
> @@ -1181,7 +1183,7 @@ void page_remove_rmap(struct page *page)
> * not if it's in swapcache - there might be another pte slot
> * containing the swap entry, but page not yet written to swap.
> */
> - if ((!PageAnon(page) || PageSwapCache(page)) &&
> + if ((!anon || PageSwapCache(page)) &&
> page_test_and_clear_dirty(page_to_pfn(page), 1))
> set_page_dirty(page);
> /*
> @@ -1190,7 +1192,7 @@ void page_remove_rmap(struct page *page)
> */
> if (unlikely(PageHuge(page)))
> goto out;
> - if (PageAnon(page)) {
> + if (anon) {
> mem_cgroup_uncharge_page(page);
> if (!PageTransHuge(page))
> __dec_zone_page_state(page, NR_ANON_PAGES);
> @@ -1211,7 +1213,8 @@ void page_remove_rmap(struct page *page)
> * faster for those pages still in swapcache.
> */
> out:
> - mem_cgroup_end_update_page_stat(page, &locked, &flags);
> + if (!anon)
> + mem_cgroup_end_update_page_stat(page, &locked, &flags);
> }
>
> /*

2012-02-29 19:36:03

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH next] memcg: remove PCG_FILE_MAPPED fix cosmetic fix

On Tue, Feb 28, 2012 at 09:28:40PM -0800, Hugh Dickins wrote:
> mem_cgroup_move_account() begins with "anon = PageAnon(page)", and
> then anon is used thereafter: testing PageAnon(page) in the middle
> makes the reader wonder if there's some race to guard against - no.
>
> Signed-off-by: Hugh Dickins <[email protected]>

Acked-by: Johannes Weiner <[email protected]>

2012-02-29 19:43:11

by Johannes Weiner

[permalink] [raw]
Subject: Re: [PATCH next] memcg: remove PCG_CACHE page_cgroup flag fix

On Tue, Feb 28, 2012 at 09:30:17PM -0800, Hugh Dickins wrote:
> Swapping tmpfs loads show absurd wrapped rss and wrong cache in memcg's
> memory.stat statistics: __mem_cgroup_uncharge_common() is failing to
> distinguish the anon and tmpfs cases.
>
> Mostly we can decide between them by PageAnon, which is reliable once
> it has been set; but there are several callers who need to uncharge a
> MEM_CGROUP_CHARGE_TYPE_MAPPED page before it was fully initialized,
> so allow that case to override the PageAnon decision.
>
> Signed-off-by: Hugh Dickins <[email protected]>
> ---
>
> mm/memcontrol.c | 7 ++++---
> 1 file changed, 4 insertions(+), 3 deletions(-)
>
> --- 3.3-rc5-next/mm/memcontrol.c 2012-02-25 10:06:52.496035568 -0800
> +++ linux/mm/memcontrol.c 2012-02-26 10:44:32.146365398 -0800
> @@ -2944,13 +2944,16 @@ __mem_cgroup_uncharge_common(struct page
> if (!PageCgroupUsed(pc))
> goto unlock_out;
>
> + anon = PageAnon(page);
> +
> switch (ctype) {
> case MEM_CGROUP_CHARGE_TYPE_MAPPED:
> + anon = true;
> + /* fallthrough */

If you don't mind, could you add a small comment on why this is the
exception where we don't trust page->mapping?

Other than that,
Acked-by: Johannes Weiner <[email protected]>

> case MEM_CGROUP_CHARGE_TYPE_DROP:
> /* See mem_cgroup_prepare_migration() */
> if (page_mapped(page) || PageCgroupMigration(pc))
> goto unlock_out;
> - anon = true;
> break;
> case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
> if (!PageAnon(page)) { /* Shared memory */
> @@ -2958,10 +2961,8 @@ __mem_cgroup_uncharge_common(struct page
> goto unlock_out;
> } else if (page_mapped(page)) /* Anon */
> goto unlock_out;
> - anon = true;
> break;
> default:
> - anon = false;
> break;
> }

2012-02-29 22:05:02

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 3.3] memcg: fix deadlock by inverting lrucare nesting

On Tue, 28 Feb 2012 21:25:02 -0800 (PST)
Hugh Dickins <[email protected]> wrote:

> We have forgotten the rules of lock nesting: the irq-safe ones must be
> taken inside the non-irq-safe ones, otherwise we are open to deadlock:
>
> CPU0 CPU1
> ---- ----
> lock(&(&pc->lock)->rlock);
> local_irq_disable();
> lock(&(&zone->lru_lock)->rlock);
> lock(&(&pc->lock)->rlock);
> <Interrupt>
> lock(&(&zone->lru_lock)->rlock);
>
> To check a different locking issue, I happened to add a spin_lock to
> memcg's bit_spin_lock in lock_page_cgroup(), and lockdep very quickly
> complained about __mem_cgroup_commit_charge_lrucare() (on CPU1 above).
>
> So delete __mem_cgroup_commit_charge_lrucare(), passing a bool lrucare
> to __mem_cgroup_commit_charge() instead, taking zone->lru_lock under
> lock_page_cgroup() in the lrucare case.
>
> The original was using spin_lock_irqsave, but we'd be in more trouble
> if it were ever called at interrupt time: unconditional _irq is enough.
> And ClearPageLRU before del from lru, SetPageLRU before add to lru: no
> strong reason, but that is the ordering used consistently elsewhere.

This patch makes rather a mess of "memcg: remove PCG_CACHE page_cgroup
flag".

--- mm/memcontrol.c~memcg-remove-pcg_cache-page_cgroup-flag
+++ mm/memcontrol.c
@@ -2410,6 +2414,8 @@
struct page_cgroup *pc,
enum charge_type ctype)
{
+ bool anon;
+
lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
@@ -2429,21 +2435,14 @@
* See mem_cgroup_add_lru_list(), etc.
*/
smp_wmb();
- switch (ctype) {
- case MEM_CGROUP_CHARGE_TYPE_CACHE:
- case MEM_CGROUP_CHARGE_TYPE_SHMEM:
- SetPageCgroupCache(pc);
- SetPageCgroupUsed(pc);
- break;
- case MEM_CGROUP_CHARGE_TYPE_MAPPED:
- ClearPageCgroupCache(pc);
- SetPageCgroupUsed(pc);
- break;
- default:
- break;
- }

- mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
+ SetPageCgroupUsed(pc);
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
+ anon = true;
+ else
+ anon = false;
+
+ mem_cgroup_charge_statistics(memcg, anon, nr_pages);
unlock_page_cgroup(pc);
WARN_ON_ONCE(PageLRU(page));
/*

I did it this way:

static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
struct page *page,
unsigned int nr_pages,
struct page_cgroup *pc,
enum charge_type ctype,
bool lrucare)
{
struct zone *uninitialized_var(zone);
bool was_on_lru = false;
bool anon;

lock_page_cgroup(pc);
if (unlikely(PageCgroupUsed(pc))) {
unlock_page_cgroup(pc);
__mem_cgroup_cancel_charge(memcg, nr_pages);
return;
}
/*
* we don't need page_cgroup_lock about tail pages, becase they are not
* accessed by any other context at this point.
*/

/*
* In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
* may already be on some other mem_cgroup's LRU. Take care of it.
*/
if (lrucare) {
zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
if (PageLRU(page)) {
ClearPageLRU(page);
del_page_from_lru_list(zone, page, page_lru(page));
was_on_lru = true;
}
}

pc->mem_cgroup = memcg;
/*
* We access a page_cgroup asynchronously without lock_page_cgroup().
* Especially when a page_cgroup is taken from a page, pc->mem_cgroup
* is accessed after testing USED bit. To make pc->mem_cgroup visible
* before USED bit, we need memory barrier here.
* See mem_cgroup_add_lru_list(), etc.
*/
smp_wmb();
SetPageCgroupUsed(pc);

if (lrucare) {
if (was_on_lru) {
VM_BUG_ON(PageLRU(page));
SetPageLRU(page);
add_page_to_lru_list(zone, page, page_lru(page));
}
spin_unlock_irq(&zone->lru_lock);
}

if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
anon = true;
else
anon = false;

mem_cgroup_charge_statistics(memcg, anon, nr_pages);
unlock_page_cgroup(pc);

/*
* "charge_statistics" updated event counter. Then, check it.
* Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
* if they exceeds softlimit.
*/
memcg_check_events(memcg, page);
}

2012-03-01 00:43:46

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH 3.3] memcg: fix deadlock by inverting lrucare nesting

On Wed, 29 Feb 2012, Andrew Morton wrote:
> On Tue, 28 Feb 2012 21:25:02 -0800 (PST)
> Hugh Dickins <[email protected]> wrote:
>
> > We have forgotten the rules of lock nesting: the irq-safe ones must be
> > taken inside the non-irq-safe ones, otherwise we are open to deadlock:
>
> This patch makes rather a mess of "memcg: remove PCG_CACHE page_cgroup
> flag".

Sorry about that.

>
> I did it this way:

Exactly right, thank you. In my tree I end up with a blank line
in between the smp_wmb() and the SetPageCgroupUsed(pc), but I
prefer the way you have grouped it.

Hugh

>
> static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
> struct page *page,
> unsigned int nr_pages,
> struct page_cgroup *pc,
> enum charge_type ctype,
> bool lrucare)
> {
> struct zone *uninitialized_var(zone);
> bool was_on_lru = false;
> bool anon;
>
> lock_page_cgroup(pc);
> if (unlikely(PageCgroupUsed(pc))) {
> unlock_page_cgroup(pc);
> __mem_cgroup_cancel_charge(memcg, nr_pages);
> return;
> }
> /*
> * we don't need page_cgroup_lock about tail pages, becase they are not
> * accessed by any other context at this point.
> */
>
> /*
> * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
> * may already be on some other mem_cgroup's LRU. Take care of it.
> */
> if (lrucare) {
> zone = page_zone(page);
> spin_lock_irq(&zone->lru_lock);
> if (PageLRU(page)) {
> ClearPageLRU(page);
> del_page_from_lru_list(zone, page, page_lru(page));
> was_on_lru = true;
> }
> }
>
> pc->mem_cgroup = memcg;
> /*
> * We access a page_cgroup asynchronously without lock_page_cgroup().
> * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
> * is accessed after testing USED bit. To make pc->mem_cgroup visible
> * before USED bit, we need memory barrier here.
> * See mem_cgroup_add_lru_list(), etc.
> */
> smp_wmb();
> SetPageCgroupUsed(pc);
>
> if (lrucare) {
> if (was_on_lru) {
> VM_BUG_ON(PageLRU(page));
> SetPageLRU(page);
> add_page_to_lru_list(zone, page, page_lru(page));
> }
> spin_unlock_irq(&zone->lru_lock);
> }
>
> if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
> anon = true;
> else
> anon = false;
>
> mem_cgroup_charge_statistics(memcg, anon, nr_pages);
> unlock_page_cgroup(pc);
>
> /*
> * "charge_statistics" updated event counter. Then, check it.
> * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
> * if they exceeds softlimit.
> */
> memcg_check_events(memcg, page);
> }
>
>

2012-03-01 01:18:40

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH next] memcg: fix deadlock by avoiding stat lock when anon

On Wed, 29 Feb 2012, Johannes Weiner wrote:
>
> Saving the begin/end_update_page_stat() calls for the anon case where
> we know in advance we don't need them is one thing, but this also
> hides a dependencies that even eludes lockdep behind what looks like a
> minor optimization of the anon case.

Sounds like you'd appreciate a comment there: akpm has not put
this version in yet, so I'll send an updated version shortly.

>
> Wouldn't this be more robust if we turned the ordering inside out in
> move_account instead?

I think we need more than the one user of this infrastructure before
that can be decided.

But I didn't actually consider that at all: perhaps prejudiced by the
way I had solved the race Konstantin pointed out in my patchset of 10
last week, by using the lruvec lock for move_lock_mem_cgroup too,
which fits with it being inside the page_cgroup lock.

Hmm, I notice move_lock_mem_cgroup is likewise spin_lock_irqsave:
if it needs to be (and I guess the idea is that it doesn't need to be
today, but for generality later on had better be), then it has to be
inside page_cgroup lock.

(If FILE_MAPPED were to be the only user of the infrastructure, I'd
actually prefer to remove the begin/end, and make move_account raise
the file page's mapcount temporarily, doing its own page_remove_rmap
after, to solve these races. There's probably one or two VM_BUG_ONs
elsewhere that would need deleting to make that completely safe.
But I understand there may be more users to come - and mapcount
games might not fit your desire for robustness.)

Hugh

2012-03-01 01:21:48

by Hugh Dickins

[permalink] [raw]
Subject: Re: [PATCH next] memcg: remove PCG_CACHE page_cgroup flag fix

On Wed, 29 Feb 2012, Johannes Weiner wrote:
> On Tue, Feb 28, 2012 at 09:30:17PM -0800, Hugh Dickins wrote:
> >
> > + anon = PageAnon(page);
> > +
> > switch (ctype) {
> > case MEM_CGROUP_CHARGE_TYPE_MAPPED:
> > + anon = true;
> > + /* fallthrough */
>
> If you don't mind, could you add a small comment on why this is the
> exception where we don't trust page->mapping?

Right, I'll send an incremental fix for that.

>
> Other than that,
> Acked-by: Johannes Weiner <[email protected]>

Thanks,
Hugh

2012-03-01 02:43:32

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH next] memcg: remove PCG_CACHE page_cgroup flag fix2

Add comment to MEM_CGROUP_CHARGE_TYPE_MAPPED case in
__mem_cgroup_uncharge_common().

Signed-off-by: Hugh Dickins <[email protected]>
---
This one incremental to patch already in mm-commits.

mm/memcontrol.c | 5 +++++
1 file changed, 5 insertions(+)

--- mm-commits/mm/memcontrol.c 2012-02-28 20:45:43.488100423 -0800
+++ linux/mm/memcontrol.c 2012-02-29 18:21:49.144702180 -0800
@@ -2953,6 +2953,11 @@ __mem_cgroup_uncharge_common(struct page

switch (ctype) {
case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+ /*
+ * Generally PageAnon tells if it's the anon statistics to be
+ * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
+ * used before page reached the stage of being marked PageAnon.
+ */
anon = true;
/* fallthrough */
case MEM_CGROUP_CHARGE_TYPE_DROP:

2012-03-01 02:45:28

by Hugh Dickins

[permalink] [raw]
Subject: [PATCH v2 next] memcg: fix deadlock by avoiding stat lock when anon

Fix deadlock in "memcg: use new logic for page stat accounting".

page_remove_rmap() first calls mem_cgroup_begin_update_page_stat(),
which may take move_lock_mem_cgroup(), unlocked at the end of
page_remove_rmap() by mem_cgroup_end_update_page_stat().

The PageAnon case never needs to mem_cgroup_dec_page_stat(page,
MEMCG_NR_FILE_MAPPED); but it often needs to mem_cgroup_uncharge_page(),
which does lock_page_cgroup(), while holding that move_lock_mem_cgroup().
Whereas mem_cgroup_move_account() calls move_lock_mem_cgroup() while
holding lock_page_cgroup().

Since mem_cgroup_begin and end are unnecessary here for PageAnon,
simply avoid the deadlock and wasted calls in that case.

Signed-off-by: Hugh Dickins <[email protected]>
---
v2: added comment in the code so it's not thought just an optimization.

mm/rmap.c | 17 +++++++++++++----
1 file changed, 13 insertions(+), 4 deletions(-)

--- 3.3-rc5-next/mm/rmap.c 2012-02-26 23:51:46.506050210 -0800
+++ linux/mm/rmap.c 2012-02-29 17:55:42.868665736 -0800
@@ -1166,10 +1166,18 @@ void page_add_file_rmap(struct page *pag
*/
void page_remove_rmap(struct page *page)
{
+ bool anon = PageAnon(page);
bool locked;
unsigned long flags;

- mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+ /*
+ * The anon case has no mem_cgroup page_stat to update; but may
+ * uncharge_page() below, where the lock ordering can deadlock if
+ * we hold the lock against page_stat move: so avoid it on anon.
+ */
+ if (!anon)
+ mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
@@ -1181,7 +1189,7 @@ void page_remove_rmap(struct page *page)
* not if it's in swapcache - there might be another pte slot
* containing the swap entry, but page not yet written to swap.
*/
- if ((!PageAnon(page) || PageSwapCache(page)) &&
+ if ((!anon || PageSwapCache(page)) &&
page_test_and_clear_dirty(page_to_pfn(page), 1))
set_page_dirty(page);
/*
@@ -1190,7 +1198,7 @@ void page_remove_rmap(struct page *page)
*/
if (unlikely(PageHuge(page)))
goto out;
- if (PageAnon(page)) {
+ if (anon) {
mem_cgroup_uncharge_page(page);
if (!PageTransHuge(page))
__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1211,7 +1219,8 @@ void page_remove_rmap(struct page *page)
* faster for those pages still in swapcache.
*/
out:
- mem_cgroup_end_update_page_stat(page, &locked, &flags);
+ if (!anon)
+ mem_cgroup_end_update_page_stat(page, &locked, &flags);
}

/*