Below is a 2.6.17-rc4-mm3 patch that fixes a problem where the OOM killer was
unnecessarily killing system daemons in addition to memory-hogging user
processes. The patch fixes things so that the following assertion is
satisfied:
If a failed attempt to allocate memory triggers the OOM killer, then the
failed attempt must have occurred _after_ any process previously shot by
the OOM killer has cleaned out its mm_struct.
Thus we avoid situations where concurrent invocations of the OOM killer cause
more processes to be shot than necessary to resolve the OOM condition.
Signed-off-by: David S. Peterson <[email protected]>
---
Changes in this version of patch:
- restore call to printk_ratelimit() in out_of_memory()
- edit comment above oom_alloc()
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/include/linux/sched.h linux-2.6.17-rc4-mm3-oom/include/linux/sched.h
--- linux-2.6.17-rc4-mm3/include/linux/sched.h 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/include/linux/sched.h 2006-05-22 08:46:17.000000000 -0700
@@ -297,6 +297,9 @@ typedef unsigned long mm_counter_t;
(mm)->hiwater_vm = (mm)->total_vm; \
} while (0)
+/* bit #s for flags in mm_struct->flags... */
+#define MM_FLAG_OOM_NOTIFY 0
+
struct mm_struct {
struct vm_area_struct * mmap; /* list of VMAs */
struct rb_root mm_rb;
@@ -355,6 +358,8 @@ struct mm_struct {
/* aio bits */
rwlock_t ioctx_list_lock;
struct kioctx *ioctx_list;
+
+ unsigned long flags;
};
struct sighand_struct {
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/include/linux/swap.h linux-2.6.17-rc4-mm3-oom/include/linux/swap.h
--- linux-2.6.17-rc4-mm3/include/linux/swap.h 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/include/linux/swap.h 2006-05-22 08:46:17.000000000 -0700
@@ -155,7 +155,27 @@ struct swap_list_t {
#define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
/* linux/mm/oom_kill.c */
-extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
+extern int oom_kill_in_progress;
+
+/* Return 1 if OOM kill in progress. Else return 0. */
+static inline int oom_kill_active(void)
+{
+ return oom_kill_in_progress;
+}
+
+/* Start an OOM kill operation. */
+static inline void oom_kill_start(void)
+{
+ oom_kill_in_progress = 1;
+}
+
+/* Terminate an OOM kill operation. */
+static inline void oom_kill_finish(void)
+{
+ oom_kill_in_progress = 0;
+}
+
+extern int out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
/* linux/mm/memory.c */
extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/kernel/fork.c linux-2.6.17-rc4-mm3-oom/kernel/fork.c
--- linux-2.6.17-rc4-mm3/kernel/fork.c 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/kernel/fork.c 2006-05-22 08:46:17.000000000 -0700
@@ -329,6 +329,7 @@ static struct mm_struct * mm_init(struct
mm->ioctx_list = NULL;
mm->free_area_cache = TASK_UNMAPPED_BASE;
mm->cached_hole_size = ~0UL;
+ mm->flags = 0;
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -382,6 +383,8 @@ void mmput(struct mm_struct *mm)
spin_unlock(&mmlist_lock);
}
put_swap_token(mm);
+ if (unlikely(test_bit(MM_FLAG_OOM_NOTIFY, &mm->flags)))
+ oom_kill_finish(); /* terminate pending OOM kill */
mmdrop(mm);
}
}
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/mm/oom_kill.c linux-2.6.17-rc4-mm3-oom/mm/oom_kill.c
--- linux-2.6.17-rc4-mm3/mm/oom_kill.c 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/mm/oom_kill.c 2006-05-22 16:02:13.000000000 -0700
@@ -25,6 +25,8 @@
int sysctl_panic_on_oom;
/* #define DEBUG */
+int oom_kill_in_progress;
+
/**
* badness - calculate a numeric value for how bad this task has been
* @p: task struct of which task we should calculate
@@ -260,27 +262,31 @@ static int oom_kill_task(task_t *p, cons
struct mm_struct *mm;
task_t * g, * q;
+ task_lock(p);
mm = p->mm;
- /* WARNING: mm may not be dereferenced since we did not obtain its
- * value from get_task_mm(p). This is OK since all we need to do is
- * compare mm to q->mm below.
+ if (mm == NULL || mm == &init_mm) {
+ task_unlock(p);
+ return 1;
+ }
+
+ set_bit(MM_FLAG_OOM_NOTIFY, &mm->flags);
+ task_unlock(p);
+
+ /* WARNING: mm may no longer be dereferenced since we did not obtain
+ * its value from get_task_mm(p). This is OK since all we need to do
+ * is compare mm to q->mm below.
*
* Furthermore, even if mm contains a non-NULL value, p->mm may
- * change to NULL at any time since we do not hold task_lock(p).
+ * change to NULL at any time since we no longer hold task_lock(p).
* However, this is of no concern to us.
*/
- if (mm == NULL || mm == &init_mm)
- return 1;
-
- __oom_kill_task(p, message);
/*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * kill all processes that share the ->mm (i.e. all threads)
*/
do_each_thread(g, q)
- if (q->mm == mm && q->tgid != p->tgid)
+ if (q->mm == mm)
__oom_kill_task(q, message);
while_each_thread(g, q);
@@ -313,11 +319,15 @@ static int oom_kill_process(struct task_
* killing a random task (bad), letting the system crash (worse)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
+ *
+ * Return 0 if we actually shot a process. Else return 1.
*/
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+int out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
task_t *p;
unsigned long points = 0;
+ const char *msg = NULL;
+ int ret = 1;
if (printk_ratelimit()) {
printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -335,19 +345,19 @@ void out_of_memory(struct zonelist *zone
*/
switch (constrained_alloc(zonelist, gfp_mask)) {
case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, points,
- "No available memory (MPOL_BIND)");
+ p = current;
+ msg = "No available memory (MPOL_BIND)";
break;
case CONSTRAINT_CPUSET:
- oom_kill_process(current, points,
- "No available memory in cpuset");
+ p = current;
+ msg = "No available memory in cpuset";
break;
case CONSTRAINT_NONE:
if (sysctl_panic_on_oom)
panic("out of memory. panic_on_oom is selected\n");
-retry:
+
/*
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
@@ -364,20 +374,17 @@ retry:
panic("Out of memory and no killable processes...\n");
}
- if (oom_kill_process(p, points, "Out of memory"))
- goto retry;
-
+ msg = "Out of memory";
break;
+
+ default:
+ BUG();
}
+ ret = oom_kill_process(p, points, msg);
+
out:
read_unlock(&tasklist_lock);
cpuset_unlock();
-
- /*
- * Give "p" a good chance of killing itself before we
- * retry to allocate memory unless "p" is current
- */
- if (!test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
+ return ret;
}
diff -urNp -X dontdiff linux-2.6.17-rc4-mm3/mm/page_alloc.c linux-2.6.17-rc4-mm3-oom/mm/page_alloc.c
--- linux-2.6.17-rc4-mm3/mm/page_alloc.c 2006-05-22 08:44:28.000000000 -0700
+++ linux-2.6.17-rc4-mm3-oom/mm/page_alloc.c 2006-05-22 16:04:28.000000000 -0700
@@ -992,6 +992,62 @@ static inline void set_page_owner(struct
}
#endif /* CONFIG_PAGE_OWNER */
+/* If an OOM kill is not already in progress, try once more to allocate
+ * memory. If allocation fails this time, invoke the OOM killer.
+ */
+static struct page * oom_alloc(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist)
+{
+ static DECLARE_MUTEX(sem);
+ struct page *page;
+
+ down(&sem);
+
+ /* Prevent parallel OOM kill operations. This fixes a problem where
+ * the OOM killer was observed shooting system daemons in addition to
+ * memory-hogging user processes.
+ */
+ if (oom_kill_active()) {
+ up(&sem);
+ goto out_sleep;
+ }
+
+ /* If we get here, we _know_ that any previous OOM killer victim has
+ * cleaned out its mm_struct. Therefore we should pick a victim to
+ * shoot if this allocation fails.
+ */
+ page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
+ zonelist, ALLOC_WMARK_HIGH | ALLOC_CPUSET);
+
+ if (page) {
+ up(&sem);
+ return page;
+ }
+
+ oom_kill_start();
+ up(&sem);
+
+ /* Try to shoot a process. Call oom_kill_finish() only if the OOM
+ * killer did not shoot anything. If the OOM killer shot something,
+ * mmput() will call oom_kill_finish() once the mm_users count of the
+ * victim's mm_struct has reached 0 and the mm_struct has been cleaned
+ * out.
+ */
+ if (out_of_memory(zonelist, gfp_mask, order))
+ oom_kill_finish(); /* cancel OOM kill */
+
+out_sleep:
+ /* Did we get shot by the OOM killer? If not, sleep for a while to
+ * avoid burning lots of CPU cycles looping in the memory allocator.
+ * If the OOM killer shot a process, this gives the victim a good
+ * chance to die before we retry allocation.
+ */
+ if (!test_thread_flag(TIF_MEMDIE))
+ schedule_timeout_uninterruptible(1);
+
+ return NULL;
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -1103,18 +1159,9 @@ rebalance:
if (page)
goto got_pg;
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
- /*
- * Go through the zonelist yet one more time, keep
- * very high watermark here, this is only to catch
- * a parallel oom killing, we must fail if we're still
- * under heavy pressure.
- */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
- zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+ page = oom_alloc(gfp_mask, order, zonelist);
if (page)
goto got_pg;
-
- out_of_memory(zonelist, gfp_mask, order);
goto restart;
}
Dave Peterson wrote:
> Below is a 2.6.17-rc4-mm3 patch that fixes a problem where the OOM killer was
> unnecessarily killing system daemons in addition to memory-hogging user
> processes. The patch fixes things so that the following assertion is
> satisfied:
>
> If a failed attempt to allocate memory triggers the OOM killer, then the
> failed attempt must have occurred _after_ any process previously shot by
> the OOM killer has cleaned out its mm_struct.
>
> Thus we avoid situations where concurrent invocations of the OOM killer cause
> more processes to be shot than necessary to resolve the OOM condition.
Does this fix observed problems on real (or fake) workloads? Can we have
some more information about that?
I still don't quite understand why all this mechanism is needed. Suppose
that we single-thread the oom kill path (which isn't unreasonable, unless
you need really good OOM throughput :P), isn't it enough to find that any
process has TIF_MEMDIE set in order to know that an OOM kill is in progress?
down(&oom_sem);
for each process {
if TIF_MEMDIE
goto oom_in_progress;
else
calculate badness;
}
up(&oom_sem);
I have one other comment, below
> +/* If an OOM kill is not already in progress, try once more to allocate
> + * memory. If allocation fails this time, invoke the OOM killer.
> + */
> +static struct page * oom_alloc(gfp_t gfp_mask, unsigned int order,
> + struct zonelist *zonelist)
> +{
> + static DECLARE_MUTEX(sem);
> + struct page *page;
> +
> + down(&sem);
> +
> + /* Prevent parallel OOM kill operations. This fixes a problem where
> + * the OOM killer was observed shooting system daemons in addition to
> + * memory-hogging user processes.
> + */
> + if (oom_kill_active()) {
> + up(&sem);
> + goto out_sleep;
> + }
> +
> + /* If we get here, we _know_ that any previous OOM killer victim has
> + * cleaned out its mm_struct. Therefore we should pick a victim to
> + * shoot if this allocation fails.
> + */
> + page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
> + zonelist, ALLOC_WMARK_HIGH | ALLOC_CPUSET);
> +
> + if (page) {
> + up(&sem);
> + return page;
> + }
> +
> + oom_kill_start();
> + up(&sem);
> +
> + /* Try to shoot a process. Call oom_kill_finish() only if the OOM
> + * killer did not shoot anything. If the OOM killer shot something,
> + * mmput() will call oom_kill_finish() once the mm_users count of the
> + * victim's mm_struct has reached 0 and the mm_struct has been cleaned
> + * out.
> + */
> + if (out_of_memory(zonelist, gfp_mask, order))
> + oom_kill_finish(); /* cancel OOM kill */
> +
> +out_sleep:
> + /* Did we get shot by the OOM killer? If not, sleep for a while to
> + * avoid burning lots of CPU cycles looping in the memory allocator.
> + * If the OOM killer shot a process, this gives the victim a good
> + * chance to die before we retry allocation.
> + */
> + if (!test_thread_flag(TIF_MEMDIE))
> + schedule_timeout_uninterruptible(1);
> +
> + return NULL;
> +}
Is all this really required? Shouldn't you just have in place the
mechanism to prevent concurrent OOM killings in the OOM code, and
so the page allocator doesn't have to bother with it at all (ie.
it can just call into the OOM killer, which may or may not actually
kill anything).
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
At 10:39 PM 5/22/2006, Nick Piggin wrote:
>Does this fix observed problems on real (or fake) workloads? Can we have
>some more information about that?
The problems were observed when executing the C program shown below on a
machine with swap turned off. Soon we will be deploying diskless clusters
(i.e. clusters with no swap space). Our goal is to get an idea of how well
the machines will recover if users push their memory allocations a bit too
far. This is a rather common occurrence in our environment since our users
run memory-intensive workloads and tend to try to push the machines to
their limits. We are doing tests such as the one below in an effort to
identify and resolve problems before the diskless machines go into
production. The fact that we see the bad behavior with reasonable
frequency even when testing on a single machine suggests to us that we
are likely to see it much more often in production on our 1000+ node
clusters.
On somewhat of a tangent, our motivations for going diskless are as follows:
- Hard drive failure is by far our largest source of equipment failure.
- Hard drives generate extra heat and take up space. Both of these are
substantial drawbacks when dealing with large clusters (i.e. 1000+ nodes).
- Cost savings (hard drives cost money).
>I still don't quite understand why all this mechanism is needed. Suppose
>that we single-thread the oom kill path (which isn't unreasonable, unless
>you need really good OOM throughput :P), isn't it enough to find that any
>process has TIF_MEMDIE set in order to know that an OOM kill is in progress?
>
>down(&oom_sem);
>for each process {
> if TIF_MEMDIE
> goto oom_in_progress;
> else
> calculate badness;
>}
>up(&oom_sem);
That would be another way to do things. It's a tradeoff between either
option A: Each task that enters the OOM code path must loop over all
tasks to determine whether an OOM kill is in progress.
or...
option B: We must declare an oom_kill_in_progress variable and add
the following snippet of code to mmput():
put_swap_token(mm);
+ if (unlikely(test_bit(MM_FLAG_OOM_NOTIFY, &mm->flags)))
+ oom_kill_finish(); /* terminate pending OOM kill */
mmdrop(mm);
I think either option is reasonable (although I have a slight preference
for B since it eliminates substantial looping through the tasklist).
>Is all this really required? Shouldn't you just have in place the
>mechanism to prevent concurrent OOM killings in the OOM code, and
>so the page allocator doesn't have to bother with it at all (ie.
>it can just call into the OOM killer, which may or may not actually
>kill anything).
I agree it's desirable to keep the OOM killing logic as encapsulated
as possible. However unless you are holding the oom kill semaphore
when you make your final attempt to allocate memory it's a bit racy.
Holding the OOM kill semaphore guarantees that our final allocation
failure before invoking the OOM killer occurred _after_ any previous
OOM kill victim freed its memory. Thus we know we are not shooting
another process prematurely (i.e. before the memory-freeing effects
of our previous OOM kill have been felt).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of separately malloc'd pieces the total allocation is split into. */
#define CHUNKS 32

/*
 * usemem: memory-hog test program used to exercise the OOM killer.
 *
 * Usage: usemem megabytes [iterations]
 *
 * Allocates `megabytes` MB of memory in CHUNKS equal pieces, then zeroes
 * every chunk `iterations` times (default 1) so the pages are actually
 * touched and backed by real memory.  Exits 1 on usage error or malloc
 * failure, 0 on success.  Memory is deliberately never freed; process
 * exit reclaims it.
 */
int
main(int argc, char *argv[])
{
	unsigned long mb;
	unsigned long iter = 1;
	unsigned long chunk_bytes;
	char *buf[CHUNKS];
	int i;

	if (argc < 2 || argc > 3) {
		fprintf(stderr, "usage: usemem megabytes [iterations]\n");
		exit(1);
	}

	mb = strtoul(argv[1], NULL, 0);
	if (argc == 3)
		iter = strtoul(argv[2], NULL, 0);

	/* Need at least 1 MB per chunk. */
	if (mb < CHUNKS) {
		fprintf(stderr, "megabytes must be >= %d\n", CHUNKS);
		exit(1);
	}

	/* Size of each chunk in bytes; computed once instead of per use. */
	chunk_bytes = mb / CHUNKS * 1024L * 1024L;

	for (i = 0; i < CHUNKS; i++) {
		fprintf(stderr, "%d: Mallocing %lu megabytes\n",
			i, mb / CHUNKS);
		/* No cast of malloc's return in C; void * converts freely. */
		buf[i] = malloc(chunk_bytes);
		if (!buf[i]) {
			fprintf(stderr, "malloc failure\n");
			exit(1);
		}
	}

	/* Touch (zero) every chunk each iteration to keep the pages hot. */
	while (iter-- > 0) {
		for (i = 0; i < CHUNKS; i++) {
			fprintf(stderr, "%d: Zeroing %lu megabytes at %p\n",
				i, mb / CHUNKS, buf[i]);
			memset(buf[i], 0, chunk_bytes);
		}
	}

	exit(0);
}
Dave Peterson wrote:
> At 10:39 PM 5/22/2006, Nick Piggin wrote:
>
>>Does this fix observed problems on real (or fake) workloads? Can we have
>>some more information about that?
[snip]
OK, thanks.
>>I still don't quite understand why all this mechanism is needed. Suppose
>>that we single-thread the oom kill path (which isn't unreasonable, unless
>>you need really good OOM throughput :P), isn't it enough to find that any
>>process has TIF_MEMDIE set in order to know that an OOM kill is in progress?
>>
>>down(&oom_sem);
>>for each process {
>> if TIF_MEMDIE
>> goto oom_in_progress;
>> else
>> calculate badness;
>>}
>>up(&oom_sem);
>
>
> That would be another way to do things. It's a tradeoff between either
>
> option A: Each task that enters the OOM code path must loop over all
> tasks to determine whether an OOM kill is in progress.
>
> or...
>
> option B: We must declare an oom_kill_in_progress variable and add
> the following snippet of code to mmput():
>
> put_swap_token(mm);
> + if (unlikely(test_bit(MM_FLAG_OOM_NOTIFY, &mm->flags)))
> + oom_kill_finish(); /* terminate pending OOM kill */
> mmdrop(mm);
>
> I think either option is reasonable (although I have a slight preference
> for B since it eliminates substantial looping through the tasklist).
Don't you have to loop through the tasklist anyway? To find a task
to kill?
Either way, at the point of OOM, usually they should have gone through
the LRU lists several times, so a little bit more CPU time shouldn't
hurt.
>
>
>>Is all this really required? Shouldn't you just have in place the
>>mechanism to prevent concurrent OOM killings in the OOM code, and
>>so the page allocator doesn't have to bother with it at all (ie.
>>it can just call into the OOM killer, which may or may not actually
>>kill anything).
>
>
> I agree it's desirable to keep the OOM killing logic as encapsulated
> as possible. However unless you are holding the oom kill semaphore
> when you make your final attempt to allocate memory it's a bit racy.
> Holding the OOM kill semaphore guarantees that our final allocation
> failure before invoking the OOM killer occurred _after_ any previous
> OOM kill victim freed its memory. Thus we know we are not shooting
> another process prematurely (i.e. before the memory-freeing effects
> of our previous OOM kill have been felt).
But there is so much fudge in it that I don't think it matters:
pages could be freed from other sources, some reclaim might happen,
the point at which OOM is declared is pretty arbitrary anyway, etc.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com
At 04:43 PM 5/23/2006, Nick Piggin wrote:
>>That would be another way to do things. It's a tradeoff between either
>> option A: Each task that enters the OOM code path must loop over all
>> tasks to determine whether an OOM kill is in progress.
>> or...
>> option B: We must declare an oom_kill_in_progress variable and add
>> the following snippet of code to mmput():
>> put_swap_token(mm);
>>+ if (unlikely(test_bit(MM_FLAG_OOM_NOTIFY, &mm->flags)))
>>+ oom_kill_finish(); /* terminate pending OOM kill */
>> mmdrop(mm);
>>I think either option is reasonable (although I have a slight preference
>>for B since it eliminates substantial looping through the tasklist).
>
>Don't you have to loop through the tasklist anyway? To find a task
>to kill?
>
>Either way, at the point of OOM, usually they should have gone through
>the LRU lists several times, so a little bit more CPU time shouldn't
>hurt.
ok, I'll change the patch to use option A.
>>>Is all this really required? Shouldn't you just have in place the
>>>mechanism to prevent concurrent OOM killings in the OOM code, and
>>>so the page allocator doesn't have to bother with it at all (ie.
>>>it can just call into the OOM killer, which may or may not actually
>>>kill anything).
>>
>>I agree it's desirable to keep the OOM killing logic as encapsulated
>>as possible. However unless you are holding the oom kill semaphore
>>when you make your final attempt to allocate memory it's a bit racy.
>>Holding the OOM kill semaphore guarantees that our final allocation
>>failure before invoking the OOM killer occurred _after_ any previous
>>OOM kill victim freed its memory. Thus we know we are not shooting
>>another process prematurely (i.e. before the memory-freeing effects
>>of our previous OOM kill have been felt).
>
>But there is so much fudge in it that I don't think it matters:
>pages could be freed from other sources, some reclaim might happen,
>the point at which OOM is declared is pretty arbitrary anyway, etc.
There's definitely some fudge in it. However the main scenario I'm
concerned with is where one big process is hogging most of the memory
(as opposed to a case where the collective memory-hogging effect of
lots of little processes triggers the OOM killer). In the first case
we want to shoot the one big process and leave the little processes
undisturbed.
If the final allocation failure before invoking the OOM killer
occurs when we don't yet hold the OOM kill semaphore then I'd
be concerned about processes queueing up on the OOM kill semaphore
after they fail their memory allocations. If only one of these
ends up getting awakened _after_ the death of the big memory hog,
then that process will enter the OOM killer and shoot a little
process unnecessarily.
Alternately (perhaps less likely), if your kernel is preemptible,
after the memory hog has been shot but not yet expired a process
may get preempted between its final allocation failure and its
choosing an OOM kill victim (with the memory hog expiring before
the preempted process gets rescheduled). Then the preempted
process shoots a little process when rescheduled.
Dave Peterson wrote:
> At 04:43 PM 5/23/2006, Nick Piggin wrote:
>>>I agree it's desirable to keep the OOM killing logic as encapsulated
>>>as possible. However unless you are holding the oom kill semaphore
>>>when you make your final attempt to allocate memory it's a bit racy.
>>>Holding the OOM kill semaphore guarantees that our final allocation
>>>failure before invoking the OOM killer occurred _after_ any previous
>>>OOM kill victim freed its memory. Thus we know we are not shooting
>>>another process prematurely (i.e. before the memory-freeing effects
>>>of our previous OOM kill have been felt).
>>
>>But there is so much fudge in it that I don't think it matters:
>>pages could be freed from other sources, some reclaim might happen,
>>the point at which OOM is declared is pretty arbitrary anyway, etc.
>
>
> There's definitely some fudge in it. However the main scenario I'm
> concerned with is where one big process is hogging most of the memory
> (as opposed to a case where the collective memory-hogging effect of
> lots of little processes triggers the OOM killer). In the first case
> we want to shoot the one big process and leave the little processes
> undisturbed.
>
> If the final allocation failure before invoking the OOM killer
> occurs when we don't yet hold the OOM kill semaphore then I'd
> be concerned about processes queueing up on the OOM kill semaphore
> after they fail their memory allocations. If only one of these
> ends up getting awakened _after_ the death of the big memory hog,
> then that process will enter the OOM killer and shoot a little
> process unnecessarily.
>
> Alternately (perhaps less likely), if your kernel is preemptible,
> after the memory hog has been shot but not yet expired a process
> may get preempted between its final allocation failure and its
> choosing an OOM kill victim (with the memory hog expiring before
> the preempted process gets rescheduled). Then the preempted
> process shoots a little process when rescheduled.
But just call into the oom killer, and let it queue up and/or do
nothing according to whether there is still a task being shot or
not.
page allocation would then just try again.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com