Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751564Ab0BOFDM (ORCPT ); Mon, 15 Feb 2010 00:03:12 -0500 Received: from fgwmail7.fujitsu.co.jp ([192.51.44.37]:40112 "EHLO fgwmail7.fujitsu.co.jp" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750875Ab0BOFDL (ORCPT ); Mon, 15 Feb 2010 00:03:11 -0500 X-SecurityPolicyCheck-FJ: OK by FujitsuOutboundMailChecker v1.3.1 From: KOSAKI Motohiro To: David Rientjes Subject: Re: [patch 3/7 -mm] oom: select task from tasklist for mempolicy ooms Cc: kosaki.motohiro@jp.fujitsu.com, Andrew Morton , Rik van Riel , KAMEZAWA Hiroyuki , Nick Piggin , Andrea Arcangeli , Balbir Singh , Lubos Lunak , linux-kernel@vger.kernel.org, linux-mm@kvack.org In-Reply-To: References: Message-Id: <20100215120924.7281.A69D9226@jp.fujitsu.com> MIME-Version: 1.0 Content-Type: text/plain; charset="US-ASCII" Content-Transfer-Encoding: 7bit X-Mailer: Becky! ver. 2.50.07 [ja] Date: Mon, 15 Feb 2010 14:03:06 +0900 (JST) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 9396 Lines: 283 > The oom killer presently kills current whenever there is no more memory > free or reclaimable on its mempolicy's nodes. There is no guarantee that > current is a memory-hogging task or that killing it will free any > substantial amount of memory, however. > > In such situations, it is better to scan the tasklist for nodes that are > allowed to allocate on current's set of nodes and kill the task with the > highest badness() score. This ensures that the most memory-hogging task, > or the one configured by the user with /proc/pid/oom_adj, is always > selected in such scenarios. 
> > Signed-off-by: David Rientjes > --- > include/linux/mempolicy.h | 13 +++++++- > mm/mempolicy.c | 39 +++++++++++++++++++++++ > mm/oom_kill.c | 77 +++++++++++++++++++++++++++----------------- > 3 files changed, 98 insertions(+), 31 deletions(-) > > diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h > --- a/include/linux/mempolicy.h > +++ b/include/linux/mempolicy.h > @@ -202,6 +202,8 @@ extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, > unsigned long addr, gfp_t gfp_flags, > struct mempolicy **mpol, nodemask_t **nodemask); > extern bool init_nodemask_of_mempolicy(nodemask_t *mask); > +extern bool mempolicy_nodemask_intersects(struct task_struct *tsk, > + const nodemask_t *mask); > extern unsigned slab_node(struct mempolicy *policy); > > extern enum zone_type policy_zone; > @@ -329,7 +331,16 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, > return node_zonelist(0, gfp_flags); > } > > -static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } > +static inline bool init_nodemask_of_mempolicy(nodemask_t *m) > +{ > + return false; > +} > + > +static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk, > + const nodemask_t *mask) > +{ > + return false; > +} > > static inline int do_migrate_pages(struct mm_struct *mm, > const nodemask_t *from_nodes, > diff --git a/mm/mempolicy.c b/mm/mempolicy.c > --- a/mm/mempolicy.c > +++ b/mm/mempolicy.c > @@ -1638,6 +1638,45 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) > } > #endif > > +/* > + * mempolicy_nodemask_intersects > + * > + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default > + * policy. Otherwise, check for intersection between mask and the policy > + * nodemask for 'bind' or 'interleave' policy, or mask to contain the single > + * node for 'preferred' or 'local' policy. 
> + */ > +bool mempolicy_nodemask_intersects(struct task_struct *tsk, > + const nodemask_t *mask) > +{ > + struct mempolicy *mempolicy; > + bool ret = true; > + > + mempolicy = tsk->mempolicy; > + mpol_get(mempolicy); Why is this refcount increment necessary? The mempolicy is held by tsk; IOW, it can never be freed within this function. > + if (!mask || !mempolicy) > + goto out; > + > + switch (mempolicy->mode) { > + case MPOL_PREFERRED: > + if (mempolicy->flags & MPOL_F_LOCAL) > + ret = node_isset(numa_node_id(), *mask); Um? Is this a good heuristic? The task can migrate across various cpus, so "node_isset(numa_node_id(), *mask) == 0" doesn't mean the task doesn't consume *mask's memory. > + else > + ret = node_isset(mempolicy->v.preferred_node, > + *mask); > + break; > + case MPOL_BIND: > + case MPOL_INTERLEAVE: > + ret = nodes_intersects(mempolicy->v.nodes, *mask); > + break; > + default: > + BUG(); > + } > +out: > + mpol_put(mempolicy); > + return ret; > +} > + > /* Allocate a page in interleaved policy. > Own path because it needs to do special accounting. */ > static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, > diff --git a/mm/oom_kill.c b/mm/oom_kill.c > --- a/mm/oom_kill.c > +++ b/mm/oom_kill.c > @@ -26,6 +26,7 @@ > #include > #include > #include > +#include > #include > > int sysctl_panic_on_oom; > @@ -36,19 +37,35 @@ static DEFINE_SPINLOCK(zone_scan_lock); > > /* > * Do all threads of the target process overlap our allowed nodes? 
> + * @tsk: task struct of which task to consider > + * @mask: nodemask passed to page allocator for mempolicy ooms > */ > -static int has_intersects_mems_allowed(struct task_struct *tsk) > +static bool has_intersects_mems_allowed(struct task_struct *tsk, > + const nodemask_t *mask) > { > - struct task_struct *t; > + struct task_struct *start = tsk; > > - t = tsk; > do { > - if (cpuset_mems_allowed_intersects(current, t)) > - return 1; > - t = next_thread(t); > - } while (t != tsk); > - > - return 0; > + if (mask) { > + /* > + * If this is a mempolicy constrained oom, tsk's > + * cpuset is irrelevant. Only return true if its > + * mempolicy intersects current, otherwise it may be > + * needlessly killed. > + */ > + if (mempolicy_nodemask_intersects(tsk, mask)) > + return true; > + } else { > + /* > + * This is not a mempolicy constrained oom, so only > + * check the mems of tsk's cpuset. > + */ > + if (cpuset_mems_allowed_intersects(current, tsk)) > + return true; > + } > + tsk = next_thread(tsk); > + } while (tsk != start); > + return false; > } > > /** > @@ -236,7 +253,8 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, > * (not docbooked, we don't want this one cluttering up the manual) > */ > static struct task_struct *select_bad_process(unsigned long *ppoints, > - struct mem_cgroup *mem) > + struct mem_cgroup *mem, enum oom_constraint constraint, > + const nodemask_t *mask) > { > struct task_struct *p; > struct task_struct *chosen = NULL; > @@ -258,7 +276,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, > continue; > if (mem && !task_in_mem_cgroup(p, mem)) > continue; > - if (!has_intersects_mems_allowed(p)) > + if (!has_intersects_mems_allowed(p, > + constraint == CONSTRAINT_MEMORY_POLICY ? 
mask : > + NULL)) > continue; > > /* > @@ -478,7 +498,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) > > read_lock(&tasklist_lock); > retry: > - p = select_bad_process(&points, mem); > + p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL); > if (PTR_ERR(p) == -1UL) > goto out; > > @@ -560,7 +580,8 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) > /* > * Must be called with tasklist_lock held for read. > */ > -static void __out_of_memory(gfp_t gfp_mask, int order) > +static void __out_of_memory(gfp_t gfp_mask, int order, > + enum oom_constraint constraint, const nodemask_t *mask) > { > struct task_struct *p; > unsigned long points; > @@ -574,7 +595,7 @@ retry: > * Rambo mode: Shoot down a process and hope it solves whatever > * issues we may have. > */ > - p = select_bad_process(&points, NULL); > + p = select_bad_process(&points, NULL, constraint, mask); > > if (PTR_ERR(p) == -1UL) > return; > @@ -615,7 +636,8 @@ void pagefault_out_of_memory(void) > panic("out of memory from page fault. 
panic_on_oom is selected.\n"); > > read_lock(&tasklist_lock); > - __out_of_memory(0, 0); /* unknown gfp_mask and order */ > + /* unknown gfp_mask and order */ > + __out_of_memory(0, 0, CONSTRAINT_NONE, NULL); > read_unlock(&tasklist_lock); > > /* > @@ -632,6 +654,7 @@ rest_and_return: > * @zonelist: zonelist pointer > * @gfp_mask: memory allocation flags > * @order: amount of memory being requested as a power of 2 > + * @nodemask: nodemask passed to page allocator > * > * If we run out of memory, we have the choice between either > * killing a random task (bad), letting the system crash (worse) > @@ -660,24 +683,18 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, > */ > constraint = constrained_alloc(zonelist, gfp_mask, nodemask); > read_lock(&tasklist_lock); > - > - switch (constraint) { > - case CONSTRAINT_MEMORY_POLICY: > - oom_kill_process(current, gfp_mask, order, 0, NULL, > - "No available memory (MPOL_BIND)"); > - break; > - > - case CONSTRAINT_NONE: > - if (sysctl_panic_on_oom) { > + if (unlikely(sysctl_panic_on_oom)) { > + /* > + * panic_on_oom only affects CONSTRAINT_NONE, the kernel > + * should not panic for cpuset or mempolicy induced memory > + * failures. > + */ > + if (constraint == CONSTRAINT_NONE) { > dump_header(NULL, gfp_mask, order, NULL); > - panic("out of memory. panic_on_oom is selected\n"); > + panic("Out of memory: panic_on_oom is enabled\n"); "enabled"? The feature itself is enabled at boot time. How about "triggered" or "fired" instead? > } > - /* Fall-through */ > - case CONSTRAINT_CPUSET: > - __out_of_memory(gfp_mask, order); > - break; > } > - > + __out_of_memory(gfp_mask, order, constraint, nodemask); > read_unlock(&tasklist_lock); > > /* > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@kvack.org. For more info on Linux MM, > see: http://www.linux-mm.org/ . 
> Don't email: email@kvack.org -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/