Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S934198Ab0KPKU4 (ORCPT ); Tue, 16 Nov 2010 05:20:56 -0500 Received: from TYO202.gate.nec.co.jp ([202.32.8.206]:34222 "EHLO tyo202.gate.nec.co.jp" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S932571Ab0KPKUy (ORCPT ); Tue, 16 Nov 2010 05:20:54 -0500 Date: Tue, 16 Nov 2010 19:17:48 +0900 From: Daisuke Nishimura To: Andrew Morton Cc: LKML , linux-mm , Balbir Singh , KAMEZAWA Hiroyuki , Daisuke Nishimura Subject: [BUGFIX] memcg: avoid deadlock between move charge and try_charge() Message-Id: <20101116191748.d6645376.nishimura@mxp.nes.nec.co.jp> Organization: NEC Soft, Ltd. X-Mailer: Sylpheed 3.0.3 (GTK+ 2.10.14; i686-pc-mingw32) Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 6069 Lines: 187 From: Daisuke Nishimura __mem_cgroup_try_charge() can be called under down_write(&mmap_sem)(e.g. mlock does it). This means it can cause deadlock if it races with move charge: Ex.1) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | down_write(&mmap_sem) mc.moving_task = current | .. mem_cgroup_precharge_mc() | __mem_cgroup_try_charge() mem_cgroup_count_precharge() | prepare_to_wait() down_read(&mmap_sem) | if (mc.moving_task) -> cannot aquire the lock | -> true | schedule() Ex.2) move charge | try charge --------------------------------------+------------------------------ mem_cgroup_can_attach() | mc.moving_task = current | mem_cgroup_precharge_mc() | mem_cgroup_count_precharge() | down_read(&mmap_sem) | .. | up_read(&mmap_sem) | | down_write(&mmap_sem) mem_cgroup_move_task() | .. mem_cgroup_move_charge() | __mem_cgroup_try_charge() down_read(&mmap_sem) | prepare_to_wait() -> cannot aquire the lock | if (mc.moving_task) | -> true | schedule() To avoid this deadlock, we do all the move charge works (both can_attach() and attach()) under one mmap_sem section. And after this patch, we set/clear mc.moving_task outside mc.lock, because we use the lock only to check mc.from/to. Signed-off-by: Daisuke Nishimura Cc: --- mm/memcontrol.c | 43 ++++++++++++++++++++++++++----------------- 1 files changed, 26 insertions(+), 17 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2efa8ea..0255505 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -278,13 +278,14 @@ enum move_type { /* "mc" and its members are protected by cgroup_mutex */ static struct move_charge_struct { - spinlock_t lock; /* for from, to, moving_task */ + spinlock_t lock; /* for from, to */ struct mem_cgroup *from; struct mem_cgroup *to; unsigned long precharge; unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ + struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -4631,7 +4632,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4643,7 +4644,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } - up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4694,11 +4694,16 @@ static void mem_cgroup_clear_mc(void) mc.moved_swap = 0; } + if (mc.mm) { + up_read(&mc.mm->mmap_sem); + mmput(mc.mm); + } spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; - mc.moving_task = NULL; spin_unlock(&mc.lock); + mc.moving_task = NULL; + mc.mm = NULL; mem_cgroup_end_move(from); memcg_oom_recover(from); memcg_oom_recover(to); @@ -4724,12 +4729,21 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { + /* + * We do all the move charge works under one mmap_sem to + * avoid deadlock with down_write(&mmap_sem) + * -> try_charge() -> if (mc.moving_task) -> sleep. + */ + down_read(&mm->mmap_sem); + VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); VM_BUG_ON(mc.moving_task); + VM_BUG_ON(mc.mm); + mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; @@ -4737,14 +4751,16 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, mc.precharge = 0; mc.moved_charge = 0; mc.moved_swap = 0; - mc.moving_task = current; spin_unlock(&mc.lock); + mc.moving_task = current; + mc.mm = mm; ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - } - mmput(mm); + /* We call up_read() and mmput() in clear_mc(). */ + } else + mmput(mm); } return ret; } @@ -4832,7 +4848,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); - down_read(&mm->mmap_sem); + /* We've already held the mmap_sem */ for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4851,7 +4867,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } - up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4860,17 +4875,11 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - struct mm_struct *mm; - - if (!mc.to) + if (!mc.mm) /* no need to move charge */ return; - mm = get_task_mm(p); - if (mm) { - mem_cgroup_move_charge(mm); - mmput(mm); - } + mem_cgroup_move_charge(mc.mm); mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/