Hi,
This is an improved version of OOM-kill-from-kswapd patch.
I believe triggering the OOM killer from task reclaim context
is broken because the chances that it happens increases as the amount
of tasks inside reclaim increases - and that approach ignores efforts
being done by kswapd, who is the main entity responsible for
freeing pages.
There have been a few problems pointed out by others (Andrea, Nick) on the
last patch - this one solves them.
First, Andrea noted that if progress had been made in the high zone,
the OOM killer would not be triggered. Now it conditions the triggering
on "DMA+Normal" reclaiming success.
Nick noted that the last patch would do wrong on the NUMA case (because
of per-node kswapd) - it's not a problem now because we will only
kill if there is any task not being able to allocate and free pages.
The memory allocation fallback to other nodes will prevent that
from happening.
Another drawback from the last patch was that it disabled the
"all_unreclaimable" logic which attempts to avoid scanning storms -
that way kswapd was able to detect the OOM condition.
What it does now is to disable the all_unreclaimable logic once
5 seconds have passed since it was set. This is enough time for the system
to complete IO of (at least some) pages which have been written-out
for reclaiming purposes.
After that period (which can be sysctl'ed BTW), it then performs
a full scan. In case no progress has been made, and both
DMA and normal zones are below the pages_min watermark, the OOM
killer is triggered.
It looks very reliable in my testing - but I need others to test
it as well (Martin and Thomas especially who have good test cases).
diff -Nur --exclude='*.orig' linux-2.6.10-rc1-mm2.orig/include/linux/mmzone.h linux-2.6.10-rc1-mm2/include/linux/mmzone.h
--- linux-2.6.10-rc1-mm2.orig/include/linux/mmzone.h 2004-11-09 14:56:05.000000000 -0200
+++ linux-2.6.10-rc1-mm2/include/linux/mmzone.h 2004-11-11 09:36:10.512588568 -0200
@@ -146,6 +146,7 @@
unsigned long nr_inactive;
unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */
+ unsigned long all_unreclaimable_set; /* When it was set, jiffies */
/*
* prev_priority holds the scanning priority for this zone. It is
diff -Nur --exclude='*.orig' linux-2.6.10-rc1-mm2.orig/mm/oom_kill.c linux-2.6.10-rc1-mm2/mm/oom_kill.c
--- linux-2.6.10-rc1-mm2.orig/mm/oom_kill.c 2004-11-09 14:56:05.000000000 -0200
+++ linux-2.6.10-rc1-mm2/mm/oom_kill.c 2004-11-05 18:33:29.000000000 -0200
@@ -240,23 +240,23 @@
* If it's been a long time since last failure,
* we're not oom.
*/
- if (since > 5*HZ)
- goto reset;
+ //if (since > 5*HZ)
+ // goto reset;
/*
* If we haven't tried for at least one second,
* we're not really oom.
*/
- since = now - first;
- if (since < HZ)
- goto out_unlock;
+ //since = now - first;
+ //if (since < HZ)
+ // goto out_unlock;
/*
* If we have gotten only a few failures,
* we're not really oom.
*/
- if (++count < 10)
- goto out_unlock;
+// if (++count < 10)
+// goto out_unlock;
/*
* If we just killed a process, wait a while
diff -Nur --exclude='*.orig' linux-2.6.10-rc1-mm2.orig/mm/page_alloc.c linux-2.6.10-rc1-mm2/mm/page_alloc.c
--- linux-2.6.10-rc1-mm2.orig/mm/page_alloc.c 2004-11-09 14:56:05.000000000 -0200
+++ linux-2.6.10-rc1-mm2/mm/page_alloc.c 2004-11-11 09:55:09.587422872 -0200
@@ -305,6 +305,7 @@
base = zone->zone_mem_map;
spin_lock_irqsave(&zone->lock, flags);
zone->all_unreclaimable = 0;
+ zone->all_unreclaimable_set = 0;
zone->pages_scanned = 0;
while (!list_empty(list) && count--) {
page = list_entry(list->prev, struct page, lru);
diff -Nur --exclude='*.orig' linux-2.6.10-rc1-mm2.orig/mm/vmscan.c linux-2.6.10-rc1-mm2/mm/vmscan.c
--- linux-2.6.10-rc1-mm2.orig/mm/vmscan.c 2004-11-09 14:56:05.000000000 -0200
+++ linux-2.6.10-rc1-mm2/mm/vmscan.c 2004-11-11 11:03:54.884282424 -0200
@@ -878,6 +878,8 @@
shrink_zone(zone, sc);
}
}
+
+int task_looping_oom = 0;
/*
* This is the main entry point to direct page reclaim.
@@ -952,8 +954,8 @@
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
- if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
- out_of_memory(gfp_mask);
+ if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
+ task_looping_oom = 1;
out:
for (i = 0; zones[i] != 0; i++) {
struct zone *zone = zones[i];
@@ -963,6 +965,8 @@
zone->prev_priority = zone->temp_priority;
}
+ if (ret || total_reclaimed)
+ task_looping_oom = 0;
return ret;
}
@@ -997,13 +1001,17 @@
int all_zones_ok;
int priority;
int i;
- int total_scanned, total_reclaimed;
+ int total_scanned, total_reclaimed, low_reclaimed;
+ int worked_norm, worked_dma;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc;
+
loop_again:
total_scanned = 0;
total_reclaimed = 0;
+ low_reclaimed = 0;
+ worked_norm = worked_dma = 0;
sc.gfp_mask = GFP_KERNEL;
sc.may_writepage = 0;
sc.nr_mapped = read_page_state(nr_mapped);
@@ -1072,6 +1080,17 @@
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
+ /* if we're scanning dma or normal, and priority
+ * reached zero, set "worked_dma" or "worked_norm"
+ * accordingly.
+ */
+ if (i <= 1 && priority == 0) {
+ if (!i)
+ worked_dma = 1;
+ else
+ worked_norm = 1;
+ }
+
if (nr_pages == 0) { /* Not software suspend */
if (!zone_watermark_ok(zone, order,
zone->pages_high, end_zone, 0, 0))
@@ -1088,11 +1107,17 @@
shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages);
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_reclaimed += sc.nr_reclaimed;
+
+ if (i <= 1)
+ low_reclaimed += sc.nr_reclaimed;
+
if (zone->all_unreclaimable)
continue;
if (zone->pages_scanned >= (zone->nr_active +
- zone->nr_inactive) * 4)
+ zone->nr_inactive) * 4) {
zone->all_unreclaimable = 1;
+ zone->all_unreclaimable_set = jiffies;
+ }
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
@@ -1127,7 +1152,37 @@
struct zone *zone = pgdat->node_zones + i;
zone->prev_priority = zone->temp_priority;
+
+ /* if the zone has been all_unreclaimable for
+ * more than 5 seconds, clear it to proceed
+ * with a full scan.
+ * This way kswapd can detect that the zone is OOM.
+ */
+ if (zone->all_unreclaimable) {
+ if (time_after(jiffies,
+ zone->all_unreclaimable_set + (500*HZ)/100)) {
+ zone->all_unreclaimable = 0;
+ zone->all_unreclaimable_set = 0;
+ zone->pages_scanned = 0;
+ }
+ }
}
+
+ if (!low_reclaimed && worked_dma && worked_norm && task_looping_oom) {
+ /*
+ * Only kill if ZONE_NORMAL/ZONE_DMA are both below
+ * pages_min
+ */
+ for (i = pgdat->nr_zones - 2; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
+
+ if (zone->free_pages > zone->pages_min)
+ return 0;
+ }
+ out_of_memory(GFP_KERNEL);
+ task_looping_oom = 0;
+ }
+
if (!all_zones_ok) {
cond_resched();
goto loop_again;
@@ -1196,7 +1251,7 @@
*/
order = new_order;
} else {
- schedule();
+ schedule_timeout((600*HZ)/100);
order = pgdat->kswapd_max_order;
}
finish_wait(&pgdat->kswapd_wait, &wait);
On Thu, Nov 11, 2004 at 09:29:22AM -0200, Marcelo Tosatti wrote:
> Hi,
>
> This is an improved version of OOM-kill-from-kswapd patch.
>
> I believe triggering the OOM killer from task reclaim context
> is broken because the chances that it happens increases as the amount
> of tasks inside reclaim increases - and that approach ignores efforts
> being done by kswapd, who is the main entity responsible for
> freeing pages.
>
> There have been a few problems pointed out by others (Andrea, Nick) on the
> last patch - this one solves them.
I disagree about the design of killing anything from kswapd. kswapd is
an async helper like pdflush and it has no knowledge on the caller (it
cannot know if the caller is ok with the memory currently available in
the freelists, before triggering the oom). I'm just about to move the
oom killing away from vmscan.c to page_alloc.c which is basically the
opposite of moving the oom invocation from the task context to kswapd.
page_alloc.c in the task context is the only one who can know if
something has to be killed, vmscan.c cannot know. vmscan.c can only know
if something is still freeable, but if something isn't freeable it
doesn't mean that we've to kill anything (for example if a task exited
or some dma or normal-zone or highmem memory was released by another
task while we were paging waiting for I/O). Every allocation is
different and page_alloc.c is the only one who knows what has to be done
for every single allocation.
Hi!
On Thu, Nov 11, 2004 at 04:42:38PM +0100, Andrea Arcangeli wrote:
> On Thu, Nov 11, 2004 at 09:29:22AM -0200, Marcelo Tosatti wrote:
> > Hi,
> >
> > This is an improved version of OOM-kill-from-kswapd patch.
> >
> > I believe triggering the OOM killer from task reclaim context
> > is broken because the chances that it happens increases as the amount
> > of tasks inside reclaim increases - and that approach ignores efforts
> > being done by kswapd, who is the main entity responsible for
> > freeing pages.
> >
> > There have been a few problems pointed out by others (Andrea, Nick) on the
> > last patch - this one solves them.
>
> I disagree about the design of killing anything from kswapd. kswapd is
> an async helper like pdflush and it has no knowledge on the caller (it
> cannot know if the caller is ok with the memory currently available in
> the freelists, before triggering the oom).
If zone_dma / zone_normal are below pages_min no caller is "OK with
memory currently available" except GFP_ATOMIC/realtime callers.
And the system can't make progress with only those callers happy.
> I'm just about to move the
> oom killing away from vmscan.c to page_alloc.c which is basically the
> opposite of moving the oom invocation from the task context to kswapd.
> page_alloc.c in the task context is the only one who can know if
> something has to be killed, vmscan.c cannot know. vmscan.c can only know
> if something is still freeable, but if something isn't freeable it
> doesn't mean that we've to kill anything
Well Andrea, its not about "if something isnt freeable", its about
"the VM is unable to make progress reclaiming pages".
> (for example if a task exited
> or some dma or normal-zone or highmem memory was released by another
> task while we were paging waiting for I/O).
My last patch checks for pages_min before OOM killing, have you read it?
> Every allocation is different and page_alloc.c is the only one who
> knows what has to be done for every single allocation.
OK, what do you propose? Its the third time I ask you this and got no
concrete answer yet.
Sure, allocators should receive -ENOMEM whenever possible, but this
is not the issue here.
Triggering OOM killer on __alloc_pages() failure ?
Show us the code, please :)
Thanks
On Thu, Nov 11, 2004 at 10:38:50AM -0200, Marcelo Tosatti wrote:
>
> Hi!
>
> On Thu, Nov 11, 2004 at 04:42:38PM +0100, Andrea Arcangeli wrote:
> > On Thu, Nov 11, 2004 at 09:29:22AM -0200, Marcelo Tosatti wrote:
> > > Hi,
> > >
> > > This is an improved version of OOM-kill-from-kswapd patch.
> > >
> > > I believe triggering the OOM killer from task reclaim context
> > > is broken because the chances that it happens increases as the amount
> > > of tasks inside reclaim increases - and that approach ignores efforts
> > > being done by kswapd, who is the main entity responsible for
> > > freeing pages.
> > >
> > > There have been a few problems pointed out by others (Andrea, Nick) on the
> > > last patch - this one solves them.
> >
> > I disagree about the design of killing anything from kswapd. kswapd is
> > an async helper like pdflush and it has no knowledge on the caller (it
> > cannot know if the caller is ok with the memory currently available in
> > the freelists, before triggering the oom).
>
> If zone_dma / zone_normal are below pages_min no caller is "OK with
> memory currently available" except GFP_ATOMIC/realtime callers.
If the GFP_DMA zone is filled, and nobody allocates with GFP_DMA,
nothing should be killed and everything should run fine, how can you
get this right from kswapd?
> > I'm just about to move the
> > oom killing away from vmscan.c to page_alloc.c which is basically the
> > opposite of moving the oom invocation from the task context to kswapd.
> > page_alloc.c in the task context is the only one who can know if
> > something has to be killed, vmscan.c cannot know. vmscan.c can only know
> > if something is still freeable, but if something isn't freeable it
> > doesn't mean that we've to kill anything
>
> Well Andrea, its not about "if something isnt freeable", its about
> "the VM is unable to make progress reclaiming pages".
"VM is unable to reclaim pages" == "nothing is freeable"
> > (for example if a task exited
> > or some dma or normal-zone or highmem memory was released by another
> > task while we were paging waiting for I/O).
>
> My last patch checks for pages_min before OOM killing, have you read it?
checking pages_min isn't correct anyways, the lowmem_reserve must be taken
into account or you may not kill tasks when you should really kill
tasks.
Plus you're checking for all zones, but kswapd cannot know that it
doesn't matter if the zone dma is under pages_min, as far as there's no
GFP_DMA.
> > Every allocation is different and page_alloc.c is the only one who
> > knows what has to be done for every single allocation.
>
> OK, what do you propose? Its the third time I ask you this and got no
> concrete answer yet.
I want to move it to page_alloc.c (and up to the caller) and not in
kswapd, I mention this a few times.
> Sure, allocators should receive -ENOMEM whenever possible, but this
> is not the issue here.
it is the issue, because only the context of the task can choose if to
return -ENOMEM or to invoke the oom killer and try again.
> Triggering OOM killer on __alloc_pages() failure ?
yes, ideally I'd put the oom killer _outside_ alloc_pages, but just
moving it into alloc_pages should make things better than they are right
now in vmscan.c.
> Show us the code, please :)
I'm supposedly listening to a meeting right now, then I've a bad kernel
crash to debug with random mem corruption that I just managed to
reproduce deterministically inside uml by emulating numa inside uml and
I'll be busy until next week at the very least. So I doubt I'll be able
to write any oom-related code until next week, sorry.
>> > I disagree about the design of killing anything from kswapd. kswapd is
>> > an async helper like pdflush and it has no knowledge on the caller (it
>> > cannot know if the caller is ok with the memory currently available in
>> > the freelists, before triggering the oom).
>>
>> If zone_dma / zone_normal are below pages_min no caller is "OK with
>> memory currently available" except GFP_ATOMIC/realtime callers.
>
> If the GFP_DMA zone is filled, and nobody allocates with GFP_DMA,
> nothing should be killed and everything should run fine, how can you
> get this right from kswapd?
Technically, that seems correct, but does it really matter much? We're
talking about
"it's full of unreclaimable stuff" vs
"it's full of unreclaimable stuff and someone tried to allocate a page".
So the difference is only ever one page, right? Doesn't really seem
worth worrying about - we'll burn that in code space for the algorithms
to do this ;-)
M.
On Thu, Nov 11, 2004 at 05:50:51PM +0100, Andrea Arcangeli wrote:
> On Thu, Nov 11, 2004 at 10:38:50AM -0200, Marcelo Tosatti wrote:
> >
> > Hi!
> >
> > On Thu, Nov 11, 2004 at 04:42:38PM +0100, Andrea Arcangeli wrote:
> > > On Thu, Nov 11, 2004 at 09:29:22AM -0200, Marcelo Tosatti wrote:
> > > > Hi,
> > > >
> > > > This is an improved version of OOM-kill-from-kswapd patch.
> > > >
> > > > I believe triggering the OOM killer from task reclaim context
> > > > is broken because the chances that it happens increases as the amount
> > > > of tasks inside reclaim increases - and that approach ignores efforts
> > > > being done by kswapd, who is the main entity responsible for
> > > > freeing pages.
> > > >
> > > > There have been a few problems pointed out by others (Andrea, Nick) on the
> > > > last patch - this one solves them.
> > >
> > > I disagree about the design of killing anything from kswapd. kswapd is
> > > an async helper like pdflush and it has no knowledge on the caller (it
> > > cannot know if the caller is ok with the memory currently available in
> > > the freelists, before triggering the oom).
> >
> > If zone_dma / zone_normal are below pages_min no caller is "OK with
> > memory currently available" except GFP_ATOMIC/realtime callers.
>
> If the GFP_DMA zone is filled, and nobody allocates with GFP_DMA,
> nothing should be killed and everything should run fine, how can you
> get this right from kswapd?
It does get it right. It only triggers OOM killer if _both_
GFP_DMA / GFP_KERNEL are full _and_ there is a task failing
to allocate/free memory.
I think you missed the "task_looping_oom" variable in the patch, which is
set as soon as a task is unable to allocate/free memory. This variable
is set where the code used to call the OOM killer.
> > > I'm just about to move the
> > > oom killing away from vmscan.c to page_alloc.c which is basically the
> > > opposite of moving the oom invocation from the task context to kswapd.
> > > page_alloc.c in the task context is the only one who can know if
> > > something has to be killed, vmscan.c cannot know. vmscan.c can only know
> > > if something is still freeable, but if something isn't freeable it
> > > doesn't mean that we've to kill anything
> >
> > Well Andrea, its not about "if something isnt freeable", its about
> > "the VM is unable to make progress reclaiming pages".
>
> "VM is unable to reclaim pages" == "nothing is freeable"
OK, correct, silly me. I noted the gaffe after sending the email.
But still, the main idea is valid here.
I'll say this again just in case: If ZONE_DMA and ZONE_NORMAL reclaiming
efforts are in vain, and there is task which is looping on try_to_free_pages()
unable to succeed, _and_ both DMA/normal are below pages_min, the OOM
killer will be triggered.
(should be pages_min + higher protection).
> > > (for example if a task exited
> > > or some dma or normal-zone or highmem memory was released by another
> > > task while we were paging waiting for I/O).
> >
> > My last patch checks for pages_min before OOM killing, have you read it?
>
> checking pages_min isn't correct anyways, the lowmem_reserve must be taken
> into account or you may not kill tasks when you should really kill
> tasks.
Indeed - this can be improved.
> Plus you're checking for all zones, but kswapd cannot know that it
> doesn't matter if the zone dma is under pages_min, as far as there's no
> GFP_DMA.
You mean boxes with no DMA zone?
If the normal zone is below pages_min+protection, then GFP_KERNEL allocations
will fallback and eat from DMA zone.
I don't follow you?
> > > Every allocation is different and page_alloc.c is the only one who
> > > knows what has to be done for every single allocation.
> >
> > OK, what do you propose? Its the third time I ask you this and got no
> > concrete answer yet.
>
> I want to move it to page_alloc.c (and up to the caller) and not in
> kswapd, I mention this a few times.
>
> > Sure, allocators should receive -ENOMEM whenever possible, but this
> > is not the issue here.
>
> it is the issue, because only the context of the task can choose if to
> return -ENOMEM or to invoke the oom killer and try again.
If the task chooses to return -ENOMEM it won't set the "task_looping_oom" flag.
OK - you are right to say that "only the context of the task can choose
to return -ENOMEM or invoke the oom killer".
This allocator-context-only information is passed to kswapd via
"task_looping_oom".
> > Triggering OOM killer on __alloc_pages() failure ?
>
> yes, ideally I'd put the oom killer _outside_ alloc_pages, but just
> moving it into alloc_pages should make things better than they are right
> now in vmscan.c.
>
> > Show us the code, please :)
>
> I'm supposedly listening to a meeting right now, then I've a bad kernel
> crash to debug with random mem corruption that I just managed to
> reproduce deterministically inside uml by emulating numa inside uml and
> I'll be busy until next week at the very least. So I doubt I'll be able
> to write any oom-related code until next week, sorry.
Good luck!
On Thu, Nov 11, 2004 at 11:56:14AM -0200, Marcelo Tosatti wrote:
> It does get it right. It only triggers OOM killer if _both_
> GFP_DMA / GFP_KERNEL are full _and_ there is a task failing
> to allocate/free memory.
you should check for highmem too before killing the task.
> I think you missed the "task_looping_oom" variable in the patch, which is
> set as soon as a task is unable to allocate/free memory. This variable
> is set where the code used to call the OOM killer.
why do you use a global variable to pass a message from the task context
to kswapd, when you can do the real thing in the task context since the
first place?
> But still, the main idea is valid here.
Putting the oom killer in kswapd is just flawed. I recall somebody
already put it in kswapd in the old days, but that can't work right in
all scenarios.
The single fact you had to implement the task_looping_oom variable,
means you also felt the need to send some bit of info to kswapd from the
task context, then why don't you do the real thing from the task
context where you've a load of additional information to know exactly
what to do in the first place instead of adding global variables?
> I'll say this again just in case: If ZONE_DMA and ZONE_NORMAL reclaiming
> efforts are in vain, and there is task which is looping on try_to_free_pages()
> unable to succeed, _and_ both DMA/normal are below pages_min, the OOM
> killer will be triggered.
the oom killer must be triggered even if only ZONE_DMA is under page_min
if somebody allocates with __GFP_NOFAIL|GFP_DMA. Those special cases
don't make much sense to me. The only one that can know what to do is
the task context, never kswapd. And I don't see the point of using the
task_looping_oom variable when you can invoke the oom killer from
page_alloc _after_ checking the watermarks again with lowmem_reserve
into the equation (something that sure kswapd can't do, unless you pass
more bits of info than just a 0 or 1 via task_looping_oom).
> (should be pages_min + higher protection).
higher protection requires you to define the classzone where the task is
allocating from, only the task context knows it.
> > Plus you're checking for all zones, but kswapd cannot know that it
> > doesn't matter if the zone dma is under pages_min, as far as there's no
> > GFP_DMA.
>
> You mean boxes with no DMA zone?
I mean GFP_DMA as an allocation from the DMA classzone. If nobody
allocates ram passing GFP_DMA to alloc pages, nobody should worry or
notice if the GFP_DMA is under pages_min, because no allocation risk to
fail.
The oom killing must be strictly connected with a classzone allocation
failing (the single zone doesn't matter) and if nobody uses GFP_DMA it
doesn't matter if the dma zone is under pages_min.
2.4 gets all of this perfectly right and for sure it's not even remotely
attempting at killing anything from kswapd (and in fact there's not a
single bugreport outstanding). All that can happen in 2.4 is some wasted
cpu while kswapd tries to do the paging, but exactly because kswapd is
purely a helper (like 2.6 right now too and I don't want to change
this bit, since kswapd in 2.6 looks sane, much saner than the task
context itself which is the real problematic part that needs fixing),
nothing risks going wrong (you can only argue about performance issues
when the dma zone gets pinned and under watermark[].min ;).
> If the task chooses to return -ENOMEM it won't set the "task_looping_oom"
> flag.
>
> OK - you are right to say that "only the context of the task can choose
> to return -ENOMEM or invoke the oom killer".
>
> This allocator-context-only information is passed to kswapd via
> "task_looping_oom".
what's the point of passing info to kswapd? why don't you schedule a
callback instead, why don't you use keventd instead of kswapd? I just
don't get what benefit you get from that except for complexity and
overhead. And to do it right like this you'd need to pass more
than 1 bit of info.
I mean, one could even change the code to send the whole task
information to an userspace daemon, that will open a device driver and
issue an ioctl that eventually calls the oom killer on a certain pid, in
order to invoke the oom killer. I just don't see why to waste any effort
with non trivial message passing when the oom killer itself can be
invoked by page_alloc.c where all the watermark checks are already
implemented, without passing down information to some random kernel
daemon.
> Good luck!
thanks! ;)
On Thu, Nov 11, 2004 at 09:42:17AM -0800, Martin J. Bligh wrote:
> >> > I disagree about the design of killing anything from kswapd. kswapd is
> >> > an async helper like pdflush and it has no knowledge on the caller (it
> >> > cannot know if the caller is ok with the memory currently available in
> >> > the freelists, before triggering the oom).
> >>
> >> If zone_dma / zone_normal are below pages_min no caller is "OK with
> >> memory currently available" except GFP_ATOMIC/realtime callers.
> >
> > If the GFP_DMA zone is filled, and nobody allocates with GFP_DMA,
> > nothing should be killed and everything should run fine, how can you
> > get this right from kswapd?
>
> Technically, that seems correct, but does it really matter much? We're
> talking about
>
> "it's full of unreclaimable stuff" vs
> "it's full of unreclaimable stuff and someone tried to allocate a page".
exactly, that's the difference.
> So the difference is only ever one page, right? Doesn't really seem
there's not a single page of difference.
> worth worrying about - we'll burn that in code space for the algorithms
> to do this ;-)
are you kidding? burnt space in the algorithm? the burnt space is to
move the thing in kswapd, period. that global variable and message
passing protocol between the task context and kswapd is the total waste.
There's no waste at all in moving the oom killer up the stack to
alloc_pages and in the future up outside alloc_pages with some more
higher level API.
Marcelo Tosatti escreveu:
> This is an improved version of OOM-kill-from-kswapd patch.
It seems good. My normal repeatable test of building umlsim on my 64MB
P2 builds fine with this patch. On recent unpatched kernels it's
guaranteed to fail when the oom killer strikes at the linking stage.
Regards,
Chris R.
On Thu, Nov 11, 2004 at 10:45:50PM +0100, Andrea Arcangeli wrote:
> On Thu, Nov 11, 2004 at 11:56:14AM -0200, Marcelo Tosatti wrote:
> > It does get it right. It only triggers OOM killer if _both_
> > GFP_DMA / GFP_KERNEL are full _and_ there is a task failing
> > to allocate/free memory.
>
> you should check for highmem too before killing the task.
How's that? It doesn't really matter for OOM killing purposes if highmem
is full or not because of the fallback scheme, right?
> > I think you missed the "task_looping_oom" variable in the patch, which is
> > set as soon as a task is unable to allocate/free memory. This variable
> > is set where the code used to call the OOM killer.
>
> why do you use a global variable to pass a message from the task context
> to kswapd, when you can do the real thing in the task context since the
> first place?
OK - I'll try to explain my reasoning below.
> > But still, the main idea is valid here.
>
> Putting the oom killer in kswapd is just flawed. I recall somebody
> already put it in kswapd in the old days, but that can't work right in
> all scenarios.
It should work right as long as you provide the necessary information
to kswapd.
> The single fact you had to implement the task_looping_oom variable,
> means you also felt the need to send some bit of info to kswapd from the
> task context, then why don't you do the real thing from the task
> context where you've a load of additional information to know exactly
> what to do in the first place instead of adding global variables?
Because it's not a centralized place, the chances the OOM killer will
get triggered increase as the number of parallel tasks inside
reclaim increases.
And that is the main thing that feels wrong to me in killing from task context.
> > I'll say this again just in case: If ZONE_DMA and ZONE_NORMAL reclaiming
> > efforts are in vain, and there is task which is looping on try_to_free_pages()
> > unable to succeed, _and_ both DMA/normal are below pages_min, the OOM
> > killer will be triggered.
>
> the oom killer must be triggered even if only ZONE_DMA is under page_min
> if somebody allocates with __GFP_NOFAIL|GFP_DMA.
True. Needs fixing.
> Those special cases
> don't make much sense to me. The only one that can know what to do is
> the task context, never kswapd. And I don't see the point of using the
> task_looping_oom variable when you can invoke the oom killer from
> page_alloc _after_ checking the watermarks again with lowmem_reserve
> into the equation (something that sure kswapd can't do, unless you pass
> more bits of info than just a 0 or 1 via task_looping_oom).
True, more bits of info passed to kswapd are needed for perfect functionality
(ie more information about the failing tasks).
> > (should be pages_min + higher protection).
>
> higher protection requires you to define the classzone where the task is
> allocating from, only the task context knows it.
>
> > > Plus you're checking for all zones, but kswapd cannot know that it
> > > doesn't matter if the zone dma is under pages_min, as far as there's no
> > > GFP_DMA.
> >
> > You mean boxes with no DMA zone?
>
> I mean GFP_DMA as an allocation from the DMA classzone. If nobody
> allocates ram passing GFP_DMA to alloc pages, nobody should worry or
> notice if the GFP_DMA is under pages_min, because no allocation risk to
> fail.
Sure.
> The oom killing must be strictly connected with a classzone allocation
> failing (the single zone doesn't matter) and if nobody uses GFP_DMA it
> doesn't matter if the dma zone is under pages_min.
Sure.
> 2.4 gets all of this perfectly right and for sure it's not even remotely
> attempting at killing anything from kswapd (and in fact there's not a
> single bugreport outstanding). All that can happen in 2.4 is some wasted
> cpu while kswapd tries to do the paging, but exactly because kswapd is
> purely a helper (like 2.6 right now too and I don't want to change
> this bit, since kswapd in 2.6 looks sane, much saner than the task
> context itself which is the real problematic part that needs fixing),
> nothing risks going wrong (you can only argue about performance issues
> when the dma zone gets pinned and under watermark[].min ;).
>
> > If the task chooses to return -ENOMEM it wont set "task_looping_oom"
> > flag.
> >
> > OK - you are right to say that "only the context of the task can choose
> > to return -ENOMEM or invoke the oom killer".
> >
> > This allocator-context-only information is passed to kswapd via
> > "task_looping_oom".
>
> what's the point of passing info to kswapd? why don't you schedule a
> callback instead, why don't you use keventd instead of kswapd?
Because keventd or whatever userspace daemon are not responsible for
freeing pages.
> I just
> don't get what benefit you get from that except for complexity and
> overhead. And to do it right like this you'd need to pass more
> than 1 bit of info.
My thinking is this, please argue back if it doesn't make any sense to you:
The tasks on reclaiming procedure do not know about kswapd's
reclaiming efforts (again, I'm thinking of kswapd as the main entity
freeing pages, not purely a helper), and also do not know about other
tasks reclaiming progress.
Its impossible for a task to know if kswapd or any other reclaiming
tasks have been able to free pages (progress freeing pages means
the OOM killer should NOT be triggered).
So, to illustrate, if you have 100 tasks inside reclaim, and 99 of them
successfully free pages, but one of them doesn't (mainly due to the competition),
it will wrongly trigger the OOM killer.
So my thinking is "we need to centralize OOM detection for it to be reliable".
And to detect OOM in a certain zone, you need to fully scan it. You don't
want to be scanning the full ZONE_NORMAL zone multiplied by number of
tasks inside reclaim, do you?
These are the benefits of, and the need for, moving the OOM killer inside one
single kernel daemon who is responsible for keeping the free page
reserves full.
That's it.
> I mean, one could even change the code to send the whole task
> information to an userspace daemon, that will open a device driver and
> issue an ioctl that eventually calls the oom killer on a certain pid, in
> order to invoke the oom killer. I just don't see why to waste any effort
> with non trivial message passing when the oom killer itself can be
> invoked by page_alloc.c where all the watermark checks are already
> implemented, without passing down information to some random kernel
> daemon.
Its not really "random kernel daemon" - its the daemon running the
page reclaiming code.
> > Good luck!
>
> thanks! ;)
Hello everyone,
On Thu, Nov 11, 2004 at 05:50:51PM +0100, Andrea Arcangeli wrote:
> [..] I've a bad kernel
> crash to debug with random mem corruption [..]
With the inline symlink shmem_inode_info structure is overwritten with
data until vfs_inode, and that caused the ->policy to be a corrupted
pointer during unlink. It wasn't immediately easy to see what was going
on due to the random mm corruption that generated a weird oops, it looked
more like a race condition on freed memory at first.
There's simply no need to set a policy for inodes, since the idx is
always zero. All we have to do is to initialize the data structure (the
semaphore may need to run during the page allocation for the non-inline
symlink) but we don't need to allocate the rb nodes. This way we don't
need to call mpol_free during the destroy_inode (not doable at all if
the policy rbtree is corrupt by the inline symlink ;).
An equivalent version of this patch based on a 2.6.5 tree with
additional numa features on top of this (i.e. interleaved by default,
and that's prompted me to add a comment in the LNK init path), works
fine in a numa simulation on my laptop (untested on the bare hardware).
The patch includes another unrelated bugfix I did while checking
mempolicy.c code that would return the wrong policy in some case and
some unrelated optimizations again in mempolicy.c (like to avoid
rebalancing the tree while destroying it and by breaking loops early and
not checking for invariant conditions in the replace operation). You
want to review the rebalance optimization I did in
shared_policy_replace, that's tricky code.
Signed-off-by: Andrea Arcangeli <[email protected]>
Index: mm/mempolicy.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/mempolicy.c,v
retrieving revision 1.19
diff -u -p -r1.19 mempolicy.c
--- mm/mempolicy.c 28 Oct 2004 15:16:58 -0000 1.19
+++ mm/mempolicy.c 12 Nov 2004 11:04:11 -0000
@@ -902,7 +902,7 @@ sp_lookup(struct shared_policy *sp, unsi
struct sp_node *p = rb_entry(n, struct sp_node, nd);
if (start >= p->end) {
n = n->rb_right;
- } else if (end < p->start) {
+ } else if (end <= p->start) {
n = n->rb_left;
} else {
break;
@@ -1015,12 +1015,10 @@ restart:
return -ENOMEM;
goto restart;
}
- n->end = end;
+ n->end = start;
sp_insert(sp, new2);
- new2 = NULL;
- }
- /* Old crossing beginning, but not end (easy) */
- if (n->start < start && n->end > start)
+ break;
+ } else
n->end = start;
}
if (!next)
@@ -1073,11 +1071,11 @@ void mpol_free_shared_policy(struct shar
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
- rb_erase(&n->nd, &p->root);
mpol_free(n->policy);
kmem_cache_free(sn_cache, n);
}
spin_unlock(&p->lock);
+ p->root = RB_ROOT;
}
/* assumes fs == KERNEL_DS */
Index: mm/shmem.c
===================================================================
RCS file: /home/andrea/crypto/cvs/linux-2.5/mm/shmem.c,v
retrieving revision 1.160
diff -u -p -r1.160 shmem.c
--- mm/shmem.c 28 Oct 2004 15:18:00 -0000 1.160
+++ mm/shmem.c 12 Nov 2004 11:01:13 -0000
@@ -1292,7 +1292,6 @@ shmem_get_inode(struct super_block *sb,
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
- mpol_shared_policy_init(&info->policy);
INIT_LIST_HEAD(&info->swaplist);
switch (mode & S_IFMT) {
@@ -1303,6 +1302,7 @@ shmem_get_inode(struct super_block *sb,
case S_IFREG:
inode->i_op = &shmem_inode_operations;
inode->i_fop = &shmem_file_operations;
+ mpol_shared_policy_init(&info->policy);
break;
case S_IFDIR:
inode->i_nlink++;
@@ -1312,6 +1312,11 @@ shmem_get_inode(struct super_block *sb,
inode->i_fop = &simple_dir_operations;
break;
case S_IFLNK:
+ /*
+ * Must not load anything in the rbtree,
+ * mpol_free_shared_policy will not be called.
+ */
+ mpol_shared_policy_init(&info->policy);
break;
}
}
@@ -2024,7 +2029,9 @@ static struct inode *shmem_alloc_inode(s
static void shmem_destroy_inode(struct inode *inode)
{
- mpol_free_shared_policy(&SHMEM_I(inode)->policy);
+ if ((inode->i_mode & S_IFMT) == S_IFREG) {
+ /* only struct inode is valid if it's an inline symlink */
+ mpol_free_shared_policy(&SHMEM_I(inode)->policy);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}
Chris Ross escreveu:
> It seems good.
Sorry Marcelo, I spoke too soon. The oom killer still goes haywire even
with your new patch. I even got this one whilst the machine was booting!
Ignore the big numbers, they are cured by Kame's patch. I haven't
applied that to this kernel. This tree is pure 2.6.10-rc1-mm2 with only
your recent oom patch applied.
Regards,
Chris R.
Nov 12 17:32:21 sleepy Free pages: 268kB (0kB HighMem)
Nov 12 17:32:21 sleepy Active:7853 inactive:4921 dirty:0 writeback:0
unstable:0
free:67 slab:1243 mapped:5773 pagetables:103
Nov 12 17:32:21 sleepy DMA free:60kB min:60kB low:120kB high:180kB
active:6436kB
inactive:5624kB present:16384kB pages_scanned:7108 all_unreclaimable? no
Nov 12 17:32:21 sleepy protections[]: 0 0 0
Nov 12 17:32:21 sleepy Normal free:208kB min:188kB low:376kB high:564kB
active:2
4976kB inactive:14060kB present:49144kB pages_scanned:19668
all_unreclaimable? n
o
Nov 12 17:32:21 sleepy protections[]: 0 0 0
Nov 12 17:32:21 sleepy HighMem free:0kB min:128kB low:256kB high:384kB
active:0k
B inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 12 17:32:21 sleepy protections[]: 0 0 0
Nov 12 17:32:21 sleepy DMA: 4294944789*4kB 4294964727*8kB
4294966668*16kB 429496
7076*32kB 4294967238*64kB 4294967233*128kB 4294967253*256kB
4294967284*512kB 429
4967290*1024kB 4294967293*2048kB 4294967294*4096kB = 4294790220kB
Nov 12 17:32:21 sleepy Normal: 4294803738*4kB 4294928464*8kB
4294957447*16kB 429
4964898*32kB 4294966867*64kB 4294967050*128kB 4294967186*256kB
4294967246*512kB
4294967268*1024kB 4294967283*2048kB 4294967286*4096kB = 4293559128kB
Nov 12 17:32:21 sleepy HighMem: empty
Nov 12 17:32:21 sleepy Swap cache: add 13796, delete 10099, find
2839/3488, race
0+0
Nov 12 17:32:21 sleepy Out of Memory: Killed process 6806 (qmgr).
Chris Ross wrote:
>
> Chris Ross escreveu:
>
>> It seems good.
>
>
> Sorry Marcelo, I spoke to soon. The oom killer still goes haywire even
> with your new patch. I even got this one whilst the machine was booting!
>
> Ignore the big numbers, they are cured by Kame's patch. I haven't
> applied that to this kernel. This tree is pure 2.6.10-rc1-mm2 with only
> your recent oom patch applied.
>
But those big numbers are going to cause things to stop working properly.
You'd be best off to upgrade to the latest -mm kernel.
Thanks,
Nick
On Fri, Nov 12, 2004 at 05:52:21PM +0100, Chris Ross wrote:
>
>
> Chris Ross escreveu:
> >It seems good.
>
> Sorry Marcelo, I spoke to soon. The oom killer still goes haywire even
> with your new patch. I even got this one whilst the machine was booting!
On monday I'll make a patch to place the oom killer at the right place.
Marcelo's argument that kswapd is a localized place isn't sound to me,
kswapd is still racing against all other task contexts, so if the task
context isn't reliable, there's no reason why kswapd should be more
reliable than the task context. the trick is to check the _right_
watermarks before invoking the oom killer, it's not about racing against
each other, 2.6 is buggy in not checking the watermarks. Moving the oom
killer in kswapd can only make things worse, fix is simple, and it's the
opposite thing: move the oom killer up the stack outside vmscan.c.
On Sun, Nov 14, 2004 at 12:37:40AM +0100, Andrea Arcangeli wrote:
> On Fri, Nov 12, 2004 at 05:52:21PM +0100, Chris Ross wrote:
> >
> >
> > Chris Ross escreveu:
> > >It seems good.
> >
> > Sorry Marcelo, I spoke to soon. The oom killer still goes haywire even
> > with your new patch. I even got this one whilst the machine was booting!
>
> On monday I'll make a patch to place the oom killer at the right place.
>
> Marcelo's argument that kswapd is a localized place isn't sound to me,
> kswapd is still racing against all other task contexts, so if the task
> context isn't reliable, there's no reason why kswapd should be more
> reliable than the task context. the trick is to check the _right_
> watermarks before invoking the oom killer, it's not about racing against
> each other, 2.6 is buggy in not checking the watermarks. Moving the oom
> killer in kswapd can only make thing worse, fix is simple, and it's the
> opposite thing: move the oom killer up the stack outside vmscan.c.
Its hard to detect OOM situation with zone->all_unreclaimable logic.
Well, I'll wait for your correct and definitive approach.
On Sun, Nov 14, 2004 at 07:44:17AM -0200, Marcelo Tosatti wrote:
> On Sun, Nov 14, 2004 at 12:37:40AM +0100, Andrea Arcangeli wrote:
> > On Fri, Nov 12, 2004 at 05:52:21PM +0100, Chris Ross wrote:
> > >
> > >
> > > Chris Ross escreveu:
> > > >It seems good.
> > >
> > > Sorry Marcelo, I spoke to soon. The oom killer still goes haywire even
> > > with your new patch. I even got this one whilst the machine was booting!
> >
> > On monday I'll make a patch to place the oom killer at the right place.
> >
> > Marcelo's argument that kswapd is a localized place isn't sound to me,
> > kswapd is still racing against all other task contexts, so if the task
> > context isn't reliable, there's no reason why kswapd should be more
> > reliable than the task context. the trick is to check the _right_
> > watermarks before invoking the oom killer, it's not about racing against
> > each other, 2.6 is buggy in not checking the watermarks. Moving the oom
> > killer in kswapd can only make thing worse, fix is simple, and it's the
> > opposite thing: move the oom killer up the stack outside vmscan.c.
>
> Its hard to detect OOM situation with zone->all_unreclaimable logic.
>
> Well, I'll wait for your correct and definitive approach.
Take zone->all_unreclaimable into account when you move oom_kill in page_alloc.c,
which I now think might be the simpler fix.
shrink_caches() will fail early due to all_unreclaimable() logic (it wont
scan/writeout at lower priorities):
if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
Disabling all_unreclaimable after 5 seconds allowed kswapd to scan
the full zone and reliably detect OOM in my kill-from-kswapd patch -
you might want something similar.
That seems to be one of the main reasons for the spurious OOM kills.
Anxious to see your patch!
On Sun, Nov 14, 2004 at 07:44:17AM -0200, Marcelo Tosatti wrote:
> Well, I'll wait for your correct and definitive approach.
I doubt my patch will be definitive and you're welcome to keep hacking
without waiting ;). There are various problems, one issue is the
try_to_free_pages side, the other severe and obvious bug is the
invocation of the oom killer in vmscan.c that cannot know if enough
memory is already free via racing tasks (like other context and like
kswapd).
I'm looking at the latter first since that is easy to fix, but as you
said the zone->all_unreclaimable is hard and it may be buggy too, so I
doubt my fix will be definitive ;) but it'll worth a try. Feel free to
work on the zone->all_unreclaimable for example.
Especially the fact that your patch didn't help makes me think my first patch
also won't fix it and we'll have to look into the try_to_free_pages
internals to fix it completely.
My patch compared to yours will only save .text/.data/.bss bloat (i.e.
the opposite of what Martin was worried about) to avoid message passing
via global variable w/o locks from task context to kswapd.
Chris, since you can reproduce so easily, could you try a `vmstat 1`
while the oom killing happens, and can you post it?
We've also to differentiate between true early-oom kills, and genuine
oom-kills. The oom killing triggering is not always by mistake ;). Chris
if you could post the vmstat 1 that would help to be sure it's really a
kernel bug (if you already posted it in another thread just let me know
and I'll search for such an email ;). Thanks!
On Sun, Nov 14, 2004 at 08:02:27AM -0200, Marcelo Tosatti wrote:
> Take zone->all_unreclaimable into account when you move oom_kill in page_alloc.c,
> which I now think might be the simpler fix.
>
> shrink_caches() will fail early due to all_unreclaimable() logic (it wont
> scan/writeout at lower priorities):
>
> if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
> continue; /* Let kswapd poll it */
>
> I disabled all_unreclaimable after 5 seconds allowed kswapd to scan
> the full zone and reliably detect OOM in my kill-from-kswapd patch -
> you might want something similar.
>
> That seems one the main reasons for the spurious OOM kills.
>
> Anxious to see your patch!
could you make a patch for the above part? I agree likely we've to work
on the shrink_cache stuff to fix it.
Your patch does many things, some of which I believe are wrong, but you
can split it, and take care of the above part so Chris can test it in
the meantime. In the spare time of the weekend I'm trying not to do
linux coding since I've to code for a private project that has to keep
going, so I'll move the oom killing in page_alloc.c only tomorrow (it
won't take very long to do, as it will at least fix one strict bug
which your patch didn't fix completely, and it'll avoid the complexity of
message passing to do it).
But today you can already post your change for the above sc->priority !=
DEF_PRIORITY if you think it's correct, I don't see why you need to wait
for my patch before posting it (they're pretty orthogonal, I didn't mean
to change shrink_cache at all but just to fixup and serialize the oom
invocation by moving it in page_alloc.c).
> My patch compared to yours will only save .text/.data/.bss bloat (i.e.
> the opposite of what Martin was worried about) to avoid message passing
> via global variable w/o locks from task context to kswapd.
Heh, I wasn't really worried about the code size at all ... I was just
pointing out that 1 page was a trivial amount to be worried about, in
terms of when we start reclaim.
M.
On Sun, Nov 14, 2004 at 10:16:32AM -0800, Martin J. Bligh wrote:
> Heh, I wasn't really worried about the code size at all ... I was just
> pointing out that 1 page was a trivial amount to be worried about, in
> terms of when we start reclaim.
Ok, my point is that the code size will be smaller and simpler without
message passing, the locking will be a lot simpler since there will be
no locking at all (all info is in the local stack, no need to send local
info to a global kswapd). Plus kswapd when fails the paging is no
different from task context failing the paging, since kswapd will be
racing against all task context, like all task context races against
each other and kswapd too.
About the 1 page trivial amount, I missed/forgot where this 1 page
trivial amount comes from. There's not at all any 1 page trivial amount
of difference between doing oom kill in kswapd based on information
passed from the page_alloc.c task-context, or doing it in the
page_alloc.c task-context directly. Obviously if it was only 1
off-by-one issue it couldn't make any difference, the watermarks are big
enough not having to care about 1 page more or less. Perhaps I wasn't
very clear in explaining why it's better to do the oom killing in
page_alloc.c (where we've the task local information to know if the
allocation is just going to fail) instead of vmscan.c.
Hi,
On Sun, Nov 14, 2004 at 06:03:39PM +0100, Andrea Arcangeli wrote:
> On Sun, Nov 14, 2004 at 07:44:17AM -0200, Marcelo Tosatti wrote:
> > Well, I'll wait for your correct and definitive approach.
>
> I doubt my patch will be definitive and you're welcome to keep hacking
> without waiting ;). There are various problems, one issue is the
> try_to_free_pages side, the other severe and obvious bug is the
> invocation of the oom killer in vmscan.c that cannot know if enough
> memory is already free via racing tasks (like other context and like
> kswapd).
>
> I'm looking at the latter first since that is easy to fix, but as you
> said the zone->all_unreclaimable is hard and it may be buggy too, so I
> doubt my fix will be definitive ;) but it'll worth a try. Feel free to
> work on the zone->all_unreclaimable for example.
>
> Especially the fact your patch didn't help make me think my firts patch
> also won't fix it and we'll have to look into the try_to_free_pages
> internals to fix it completely.
I'm not sure about Chris's case - Nick mentions that the corrupt perzone
accounting freepage bug might be interfering with the system.
If its not the case, increasing the all_unreclaimable "timer" to a higher value
than 5 seconds will certainly delay the OOM killer such to a point where
its not triggered until the VM reclaiming efforts make progress.
> My patch compared to yours will only save .text/.data/.bss bloat (i.e.
> the opposite of what Martin was worried about) to avoid message passing
> via global variable w/o locks from task context to kswapd.
>
> Chris, since you can reproduce so easily, could you try a `vmstat 1`
> while the oom killing happens, and can you post it?
Chris, can you change the "500*HZ" in mm/vmscan.c balance_pgdat() function
to "1000*HZ" and see what you get, please?
> We've also to differentiate between true early-oom kills, and genuine
> oom-kills. The oom killing triggering is not always by mistake ;). Chris
> if you could post the vmstat 1 that would help to be sure it's really a
> kernel bug (if you already posted it in another thread just let me know
> and I'll search for such an email ;). Thanks!
This uncompiled/untested patch moves OOM killer call to page_alloc.c as you
strongly advocates.
It changes shrink_caches() to return 1 if it has scanned all zones which are
part of the allocation zonelist. That is, it will only return 1 if all eligible
zones do _NOT_ have zone->all_unreclaimable set.
try_to_free_pages() then uses the return value from shrink_caches() and
gfp_mask's GFP_FS capability (meaning the task can writeout data) to
decide if it should return "-2" (just a magic value meaning
"we have scanned all eligible zones down to priority zero without
success").
This magic return value from "try_to_free_pages()" is the indicator
that the allocator can trigger the OOM killer because it tried "hard
enough" to free pages.
This has to work in combination with the all_unreclaimable timer
(which turns off zone->all_unreclaimable after a certain period its in place).
The problem is that kswapd can turn all_unreclaimable ON again
in the meantime on any of the zonelist's zone's
if (zone->pages_scanned >= (zone->nr_active + zone->nr_inactive) * 4)
zone->all_unreclaimable = 1;
which in turn can make shrink_caches() not return 1, meaning OOM killer
will fail to be triggered.
See the problem ?
We need some way to account for per-zone page reclaiming efforts globally and
not thread localized, so to reliably detect per-zone OOM.
Such accounting would make reliable OOM decision possible with
both multiple reclaiming threads _and_ perzone all_unreclaimable logic.
The easy alternative is to assume kswapd will detect OOM reliably (which
is what I've tried). But it can not always be true, as you have noted.
Its not trivial but we will get it right.
Are you guys following me, or am I flying high alone here?
--- vmscan.c.ori 2004-11-14 19:51:30.572496864 -0200
+++ vmscan.c 2004-11-14 20:04:38.022786224 -0200
@@ -854,10 +854,11 @@
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static void
+static int
shrink_caches(struct zone **zones, struct scan_control *sc)
{
int i;
+ int scanned_all = 1;
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
@@ -872,11 +873,15 @@
if (zone->prev_priority > sc->priority)
zone->prev_priority = sc->priority;
- if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) {
+ scanned_all = 0;
continue; /* Let kswapd poll it */
+ }
shrink_zone(zone, sc);
}
+
+ return scanned_all;
}
/*
@@ -901,7 +906,7 @@
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc;
unsigned long lru_pages = 0;
- int i;
+ int i, scanned_all;
sc.gfp_mask = gfp_mask;
sc.may_writepage = 0;
@@ -923,7 +928,7 @@
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
sc.priority = priority;
- shrink_caches(zones, &sc);
+ scanned_all = shrink_caches(zones, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
if (reclaim_state) {
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -952,8 +957,8 @@
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
- if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
- out_of_memory(gfp_mask);
+ if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY) && scanned_all)
+ ret = -2;
out:
for (i = 0; zones[i] != 0; i++) {
struct zone *zone = zones[i];
--- page_alloc.c.orig 2004-11-14 19:50:54.536975096 -0200
+++ page_alloc.c 2004-11-14 20:11:37.254053384 -0200
@@ -773,6 +773,7 @@
int alloc_type;
int do_retry;
int can_try_harder;
+ int reclaim_failure;
might_sleep_if(wait);
@@ -857,7 +858,8 @@
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
- try_to_free_pages(zones, gfp_mask, order);
+ if (try_to_free_pages(zones, gfp_mask, order) == -2)
+ reclaim_failure = 1;
p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -892,6 +894,8 @@
do_retry = 1;
}
if (do_retry) {
+ if (reclaim_failure)
+ out_of_memory();
blk_congestion_wait(WRITE, HZ/50);
goto rebalance;
}
Andrea Arcangeli escreveu:
> Chris, since you can reproduce so easily, could you try a `vmstat 1`
> while the oom killing happens, and can you post it?
This is with linux-2.6.10-rc2 (and no patches).
procs -----------memory---------- ---swap-- -----io---- --system--
----cpu----
r b swpd free buff cache si so bi bo in cs us
sy id wa
2 0 5044 2888 5128 12696 320 1892 4532 1892 1157 295 20
20 0 60
0 1 6860 756 3076 15368 148 1920 4428 1920 1175 261 13
14 0 73
0 1 9460 712 2492 12404 28 2736 4896 2736 1142 247 15
19 0 66
0 2 22228 3176 2460 4144 240 13316 2556 13356 1146 132 14
30 0 56
0 1 21612 960 2556 7932 300 0 4936 172 1145 195 16
12 0 72
0 2 22964 552 2440 4088 992 1720 6760 1720 1316 407 11
27 0 62
0 3 23492 712 2456 3520 1476 876 3968 876 1487 376 2
18 0 81
2 2 24904 580 2484 3784 1156 1664 2592 1664 1432 319 1
13 0 86
0 3 35012 704 2616 5604 864 10116 2872 10184 1139 207 4
15 0 81
2 2 34756 880 2848 6444 232 0 1072 340 1248 171 5
12 0 83
0 2 34740 752 2992 7220 1448 0 2308 0 1111 267 23
13 0 65
0 2 34740 736 3200 7308 1288 0 1544 0 1109 318 46
18 0 35
2 0 34740 548 3364 7436 924 0 1892 0 1116 320 55
22 0 23
1 1 34740 932 3216 6592 1196 0 1676 0 1104 183 51
19 0 30
0 2 34732 652 3440 5572 1752 12 1752 420 1085 162 74
4 0 22
0 1 34736 788 3088 4300 2656 8 2656 384 1235 245 46
7 0 48
1 2 35076 700 2740 5068 1940 440 3512 440 1290 303 11
10 0 80
0 1 35208 676 2756 5020 1096 192 4480 192 1275 309 1
11 0 88
0 1 35432 672 2760 3700 280 408 3884 1076 1579 346 10
23 0 67
0 6 35432 396 2760 3300 2512 272 3936 568 4782 2045 0
25 0 74
1 1 6260 41364 2800 4480 2220 0 3812 0 1180 359 1
8 0 91
1 0 6260 41200 2908 4488 64 0 84 168 1040 52 0
2 92 6
0 0 6260 41200 2908 4488 4 0 4 0 1005 8 0
0 99 1
Nov 16 09:25:20 sleepy oom-killer: gfp_mask=0xd2
Nov 16 09:25:20 sleepy DMA per-cpu:
Nov 16 09:25:20 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 16 09:25:20 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 16 09:25:20 sleepy Normal per-cpu:
Nov 16 09:25:20 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 16 09:25:20 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 16 09:25:20 sleepy HighMem per-cpu: empty
Nov 16 09:25:20 sleepy
Nov 16 09:25:20 sleepy Free pages: 244kB (0kB HighMem)
Nov 16 09:25:20 sleepy Active:12246 inactive:392 dirty:0 writeback:0
unstable:0 free:61 slab:1294 mapped:11933 pagetables:172
Nov 16 09:25:20 sleepy DMA free:60kB min:60kB low:120kB high:180kB
active:11876kB inactive:0kB present:16384kB pages_scanned:12100
all_unreclaimable? yes
Nov 16 09:25:20 sleepy protections[]: 0 0 0
Nov 16 09:25:20 sleepy Normal free:184kB min:188kB low:376kB high:564kB
active:37108kB inactive:1568kB present:49144kB pages_scanned:40510
all_unreclaimable
? yes
Nov 16 09:25:20 sleepy protections[]: 0 0 0
Nov 16 09:25:20 sleepy HighMem free:0kB min:128kB low:256kB high:384kB
active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 16 09:25:20 sleepy protections[]: 0 0 0
Nov 16 09:25:20 sleepy DMA: 1*4kB 1*8kB 1*16kB 1*32kB 0*64kB 0*128kB
0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 60kB
Nov 16 09:25:20 sleepy Normal: 0*4kB 1*8kB 1*16kB 1*32kB 0*64kB 1*128kB
0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 184kB
Nov 16 09:25:20 sleepy HighMem: empty
Nov 16 09:25:20 sleepy Swap cache: add 8591929, delete 8588661, find
723569/2416017, race 3+15
Nov 16 09:25:20 sleepy Out of Memory: Killed process 22511 (ld).
Marcelo Tosatti escreveu:
> If its not the case, increasing the all_unreclaimable "timer" to a higher value
> than 5 seconds will certainly delay the OOM killer such to a point where
> its not triggered until the VM reclaiming efforts make progress.
[...]
>
> Chris, can you change the "500*HZ" in mm/vmscan.c balance_pgdat() function
> to "1000*HZ" and see what you get, please?
Changed. FWIW it's been running happily for hours without a single oom,
including the normally guaranteed build UML test. I'll leave it running
and see how it goes. The daily cron run is usually a popular time for
killing off a few essential daemons (ntpd, sshd &c), in fact I think the
OOM Killer actually looks forward to it :)
Regards,
Chris R.
Chris Ross <[email protected]> wrote:
>
> the oom killer strikes at the linking stage.
Can you beat on this patch a bit?
--- 25/mm/vmscan.c~a 2004-11-16 19:25:55.360041112 -0800
+++ 25-akpm/mm/vmscan.c 2004-11-16 19:26:45.791374384 -0800
@@ -918,11 +918,11 @@ int try_to_free_pages(struct zone **zone
lru_pages += zone->nr_active + zone->nr_inactive;
}
- for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+ for (priority = DEF_PRIORITY; priority >= -1; priority--) {
sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
- sc.priority = priority;
+ sc.priority = (priority < 0) ? 0 : priority;
shrink_caches(zones, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
if (reclaim_state) {
_
It just adds another priority-0 scanning pass before declaring oom. It
works for me.
See, when redoing the 2.5 scanning code a couple of years ago I reduced the
amount of scanning which we do before declaring oom (compared with 2.4) by
quite a lot. It was basically a "lets try this and see who complains"
exercise.
And since that time, the way in which `priority' is interpreted has
changed, which may have worsened things.
Presently we're scanning the entire active list twice and the entire
inactive list twice. I suspect that if the inactive list is full of
referenced pages, that just isn't enough. However it's hard to work out
what _is_ enough. Still, it doesn't hurt to do a bit more scanning before
going off killing things, so the above seems a safe approach.
If the above still doesn't work, try replacing -1 with -2, etc.
Nov 17 06:19:30 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:30 sleepy DMA per-cpu:
Nov 17 06:19:30 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:30 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:30 sleepy Normal per-cpu:
Nov 17 06:19:30 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:30 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:30 sleepy HighMem per-cpu: empty
Nov 17 06:19:30 sleepy
Nov 17 06:19:30 sleepy Free pages: 236kB (0kB HighMem)
Nov 17 06:19:30 sleepy Active:6386 inactive:6367 dirty:0 writeback:0 unstable:0 free:59 slab:1280 mapped:962 pagetables:96
Nov 17 06:19:30 sleepy DMA free:60kB min:60kB low:120kB high:180kB active:5912kB inactive:5788kB present:16384kB pages_scanned:3131 all_unreclaimable? no
Nov 17 06:19:30 sleepy protections[]: 0 0 0
Nov 17 06:19:30 sleepy Normal free:176kB min:188kB low:376kB high:564kB active:19632kB inactive:19680kB present:49144kB pages_scanned:10289 all_unreclaimable? no
Nov 17 06:19:30 sleepy protections[]: 0 0 0
Nov 17 06:19:30 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:30 sleepy protections[]: 0 0 0
Nov 17 06:19:30 sleepy DMA: 4293918323*4kB 4294857582*8kB 4294953437*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547244kB
Nov 17 06:19:30 sleepy Normal: 4291283363*4kB 4294690597*8kB 4294945040*16kB 4294963914*32kB 4294966633*64kB 4294966927*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268916kB
Nov 17 06:19:30 sleepy HighMem: empty
Nov 17 06:19:30 sleepy Swap cache: add 189724, delete 189463, find 44119/68973, race 0+0
Nov 17 06:19:30 sleepy Out of Memory: Killed process 21982 (pickup).
Nov 17 06:19:30 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:30 sleepy DMA per-cpu:
Nov 17 06:19:30 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:30 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:30 sleepy Normal per-cpu:
Nov 17 06:19:30 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:30 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:30 sleepy HighMem per-cpu: empty
Nov 17 06:19:30 sleepy
Nov 17 06:19:30 sleepy Free pages: 244kB (0kB HighMem)
Nov 17 06:19:30 sleepy Active:6366 inactive:6386 dirty:0 writeback:0 unstable:0 free:61 slab:1278 mapped:951 pagetables:92
Nov 17 06:19:30 sleepy DMA free:60kB min:60kB low:120kB high:180kB active:5884kB inactive:5828kB present:16384kB pages_scanned:3237 all_unreclaimable? no
Nov 17 06:19:30 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy Normal free:184kB min:188kB low:376kB high:564kB active:19580kB inactive:19716kB present:49144kB pages_scanned:10810 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy DMA: 4293918298*4kB 4294857581*8kB 4294953437*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547136kB
Nov 17 06:19:31 sleepy Normal: 4291283355*4kB 4294690595*8kB 4294945039*16kB 4294963913*32kB 4294966633*64kB 4294966927*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268820kB
Nov 17 06:19:31 sleepy HighMem: empty
Nov 17 06:19:31 sleepy Swap cache: add 189751, delete 189495, find 44119/68977, race 0+0
Nov 17 06:19:31 sleepy Out of Memory: Killed process 6813 (qmgr).
Nov 17 06:19:31 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:31 sleepy DMA per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:31 sleepy Normal per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:31 sleepy HighMem per-cpu: empty
Nov 17 06:19:31 sleepy
Nov 17 06:19:31 sleepy Free pages: 244kB (0kB HighMem)
Nov 17 06:19:31 sleepy Active:6495 inactive:6257 dirty:0 writeback:0 unstable:0 free:61 slab:1279 mapped:951 pagetables:92
Nov 17 06:19:31 sleepy DMA free:60kB min:60kB low:120kB high:180kB active:6272kB inactive:5440kB present:16384kB pages_scanned:3922 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy Normal free:184kB min:188kB low:376kB high:564kB active:19708kB inactive:19588kB present:49144kB pages_scanned:12326 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy DMA: 4293918298*4kB 4294857581*8kB 4294953437*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547136kB
Nov 17 06:19:31 sleepy Normal: 4291283355*4kB 4294690595*8kB 4294945039*16kB 4294963913*32kB 4294966633*64kB 4294966927*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268820kB
Nov 17 06:19:31 sleepy HighMem: empty
Nov 17 06:19:31 sleepy Swap cache: add 189751, delete 189495, find 44119/68977, race 0+0
Nov 17 06:19:31 sleepy Out of Memory: Killed process 6473 (distccd).
Nov 17 06:19:31 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:31 sleepy DMA per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:31 sleepy Normal per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:31 sleepy HighMem per-cpu: empty
Nov 17 06:19:31 sleepy
Nov 17 06:19:31 sleepy Free pages: 244kB (0kB HighMem)
Nov 17 06:19:31 sleepy Active:6423 inactive:6337 dirty:0 writeback:0 unstable:0 free:61 slab:1277 mapped:951 pagetables:88
Nov 17 06:19:31 sleepy DMA free:60kB min:60kB low:120kB high:180kB active:5884kB inactive:5828kB present:16384kB pages_scanned:3191 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy Normal free:184kB min:188kB low:376kB high:564kB active:19808kB inactive:19520kB present:49144kB pages_scanned:10581 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy DMA: 4293918298*4kB 4294857581*8kB 4294953437*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547136kB
Nov 17 06:19:31 sleepy Normal: 4291283353*4kB 4294690595*8kB 4294945039*16kB 4294963913*32kB 4294966633*64kB 4294966927*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268812kB
Nov 17 06:19:31 sleepy HighMem: empty
Nov 17 06:19:31 sleepy Swap cache: add 189751, delete 189495, find 44119/68977, race 0+0
Nov 17 06:19:31 sleepy Out of Memory: Killed process 6474 (distccd).
Nov 17 06:19:31 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:31 sleepy DMA per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:31 sleepy Normal per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:31 sleepy HighMem per-cpu: empty
Nov 17 06:19:31 sleepy
Nov 17 06:19:31 sleepy Free pages: 244kB (0kB HighMem)
Nov 17 06:19:31 sleepy Active:6488 inactive:6273 dirty:0 writeback:0 unstable:0 free:61 slab:1279 mapped:951 pagetables:84
Nov 17 06:19:31 sleepy DMA free:60kB min:60kB low:120kB high:180kB active:6076kB inactive:5636kB present:16384kB pages_scanned:3727 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy Normal free:184kB min:188kB low:376kB high:564kB active:19876kB inactive:19456kB present:49144kB pages_scanned:12483 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy DMA: 4293918298*4kB 4294857581*8kB 4294953437*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547136kB
Nov 17 06:19:31 sleepy Normal: 4291283351*4kB 4294690595*8kB 4294945039*16kB 4294963913*32kB 4294966633*64kB 4294966927*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268804kB
Nov 17 06:19:31 sleepy HighMem: empty
Nov 17 06:19:31 sleepy Swap cache: add 189751, delete 189495, find 44119/68977, race 0+0
Nov 17 06:19:31 sleepy Out of Memory: Killed process 6598 (distccd).
Nov 17 06:19:31 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:31 sleepy DMA per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:31 sleepy Normal per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:31 sleepy HighMem per-cpu: empty
Nov 17 06:19:31 sleepy
Nov 17 06:19:31 sleepy Free pages: 244kB (0kB HighMem)
Nov 17 06:19:31 sleepy Active:6334 inactive:6427 dirty:0 writeback:0 unstable:0 free:61 slab:1278 mapped:951 pagetables:80
Nov 17 06:19:31 sleepy DMA free:60kB min:60kB low:120kB high:180kB active:5756kB inactive:5956kB present:16384kB pages_scanned:9099 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy Normal free:184kB min:188kB low:376kB high:564kB active:19580kB inactive:19752kB present:49144kB pages_scanned:30953 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy DMA: 4293918298*4kB 4294857581*8kB 4294953437*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547136kB
Nov 17 06:19:31 sleepy Normal: 4291283351*4kB 4294690595*8kB 4294945039*16kB 4294963913*32kB 4294966633*64kB 4294966927*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268804kB
Nov 17 06:19:31 sleepy HighMem: empty
Nov 17 06:19:31 sleepy Swap cache: add 189751, delete 189495, find 44119/68977, race 0+0
Nov 17 06:19:31 sleepy Out of Memory: Killed process 6703 (distccd).
Nov 17 06:19:31 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:31 sleepy DMA per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:31 sleepy Normal per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:31 sleepy HighMem per-cpu: empty
Nov 17 06:19:31 sleepy
Nov 17 06:19:31 sleepy Free pages: 228kB (0kB HighMem)
Nov 17 06:19:31 sleepy Active:6506 inactive:6263 dirty:0 writeback:0 unstable:0 free:57 slab:1272 mapped:951 pagetables:76
Nov 17 06:19:31 sleepy DMA free:60kB min:60kB low:120kB high:180kB active:6236kB inactive:5476kB present:16384kB pages_scanned:3925 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy Normal free:168kB min:188kB low:376kB high:564kB active:19788kB inactive:19576kB present:49144kB pages_scanned:13009 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy DMA: 4293918298*4kB 4294857581*8kB 4294953437*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547136kB
Nov 17 06:19:31 sleepy Normal: 4291283343*4kB 4294690596*8kB 4294945038*16kB 4294963913*32kB 4294966633*64kB 4294966927*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268764kB
Nov 17 06:19:31 sleepy HighMem: empty
Nov 17 06:19:31 sleepy Swap cache: add 189759, delete 189503, find 44120/68980, race 0+0
Nov 17 06:19:31 sleepy Out of Memory: Killed process 6688 (ntpd).
Nov 17 06:19:31 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:31 sleepy DMA per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:31 sleepy Normal per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:31 sleepy HighMem per-cpu: empty
Nov 17 06:19:31 sleepy
Nov 17 06:19:31 sleepy Free pages: 228kB (0kB HighMem)
Nov 17 06:19:31 sleepy Active:6448 inactive:6321 dirty:0 writeback:0 unstable:0 free:57 slab:1274 mapped:951 pagetables:76
Nov 17 06:19:31 sleepy DMA free:60kB min:60kB low:120kB high:180kB active:5884kB inactive:5828kB present:16384kB pages_scanned:5865 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy Normal free:168kB min:188kB low:376kB high:564kB active:19908kB inactive:19456kB present:49144kB pages_scanned:20131 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy DMA: 4293918298*4kB 4294857581*8kB 4294953437*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547136kB
Nov 17 06:19:31 sleepy Normal: 4291283343*4kB 4294690596*8kB 4294945038*16kB 4294963913*32kB 4294966633*64kB 4294966927*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268764kB
Nov 17 06:19:31 sleepy HighMem: empty
Nov 17 06:19:31 sleepy Swap cache: add 189759, delete 189503, find 44120/68980, race 0+0
Nov 17 06:19:31 sleepy Out of Memory: Killed process 22018 (sh).
Nov 17 06:19:31 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:31 sleepy DMA per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:31 sleepy Normal per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:31 sleepy HighMem per-cpu: empty
Nov 17 06:19:31 sleepy
Nov 17 06:19:31 sleepy Free pages: 244kB (0kB HighMem)
Nov 17 06:19:31 sleepy Active:6516 inactive:6253 dirty:0 writeback:0 unstable:0 free:61 slab:1272 mapped:951 pagetables:72
Nov 17 06:19:31 sleepy DMA free:60kB min:60kB low:120kB high:180kB active:6036kB inactive:5676kB present:16384kB pages_scanned:3228 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy Normal free:184kB min:188kB low:376kB high:564kB active:20028kB inactive:19336kB present:49144kB pages_scanned:11010 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy DMA: 4293918298*4kB 4294857581*8kB 4294953437*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547136kB
Nov 17 06:19:31 sleepy Normal: 4291283342*4kB 4294690595*8kB 4294945038*16kB 4294963913*32kB 4294966633*64kB 4294966927*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268752kB
Nov 17 06:19:31 sleepy HighMem: empty
Nov 17 06:19:31 sleepy Swap cache: add 189759, delete 189503, find 44120/68980, race 0+0
Nov 17 06:19:31 sleepy Out of Memory: Killed process 22019 (run-crons).
Nov 17 06:19:31 sleepy oom-killer: gfp_mask=0xd0
Nov 17 06:19:31 sleepy DMA per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 2, high 6, batch 1
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 2, batch 1
Nov 17 06:19:31 sleepy Normal per-cpu:
Nov 17 06:19:31 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 17 06:19:31 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 17 06:19:31 sleepy HighMem per-cpu: empty
Nov 17 06:19:31 sleepy
Nov 17 06:19:31 sleepy Free pages: 172kB (0kB HighMem)
Nov 17 06:19:31 sleepy Active:6515 inactive:6271 dirty:0 writeback:0 unstable:0 free:43 slab:1273 mapped:966 pagetables:72
Nov 17 06:19:31 sleepy DMA free:44kB min:60kB low:120kB high:180kB active:6168kB inactive:5552kB present:16384kB pages_scanned:14058 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy Normal free:128kB min:188kB low:376kB high:564kB active:19892kB inactive:19532kB present:49144kB pages_scanned:48284 all_unreclaimable? yes
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy HighMem free:0kB min:128kB low:256kB high:384kB active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 17 06:19:31 sleepy protections[]: 0 0 0
Nov 17 06:19:31 sleepy DMA: 4293918298*4kB 4294857580*8kB 4294953436*16kB 4294965767*32kB 4294966994*64kB 4294967193*128kB 4294967265*256kB 4294967275*512kB 4294967290*1024kB 4294967293*2048kB 4294967293*4096kB = 4289547112kB
Nov 17 06:19:31 sleepy Normal: 4291283261*4kB 4294690580*8kB 4294945031*16kB 4294963911*32kB 4294966633*64kB 4294966926*128kB 4294967154*256kB 4294967204*512kB 4294967257*1024kB 4294967281*2048kB 4294967286*4096kB = 4277268004kB
Nov 17 06:19:31 sleepy HighMem: empty
Nov 17 06:19:31 sleepy Swap cache: add 189810, delete 189544, find 44123/68990, race 0+0
Nov 17 06:19:31 sleepy Out of Memory: Killed process 29354 (slocate).
Nov 17 06:19:31 sleepy run-crons: page allocation failure. order:0, mode:0x1d2
Nov 17 06:19:31 sleepy [<c0105247>] dump_stack+0x16/0x18
Nov 17 06:19:31 sleepy [<c0145c75>] __alloc_pages+0x2fe/0x31a
Nov 17 06:19:31 sleepy [<c0148b53>] do_page_cache_readahead+0x101/0x18d
Nov 17 06:19:31 sleepy [<c0141d10>] filemap_nopage+0x163/0x337
Nov 17 06:19:31 sleepy [<c0153f36>] do_no_page+0x10f/0x52a
Nov 17 06:19:31 sleepy [<c0154568>] handle_mm_fault+0xe2/0x261
Nov 17 06:19:31 sleepy [<c01159b0>] do_page_fault+0x1ef/0x578
Nov 17 06:19:31 sleepy [<c0104dc9>] error_code+0x2d/0x38
Nov 17 06:19:31 sleepy run-crons: page allocation failure. order:0, mode:0x1d2
Nov 17 06:19:31 sleepy [<c0105247>] dump_stack+0x16/0x18
Nov 17 06:19:31 sleepy [<c0145c75>] __alloc_pages+0x2fe/0x31a
Nov 17 06:19:31 sleepy [<c0141b1c>] page_cache_read+0x2e/0xbf
Nov 17 06:19:31 sleepy [<c0141d7a>] filemap_nopage+0x1cd/0x337
Nov 17 06:19:31 sleepy [<c0153f36>] do_no_page+0x10f/0x52a
Nov 17 06:19:31 sleepy [<c0154568>] handle_mm_fault+0xe2/0x261
Nov 17 06:19:31 sleepy [<c01159b0>] do_page_fault+0x1ef/0x578
Nov 17 06:19:31 sleepy [<c0104dc9>] error_code+0x2d/0x38
Nov 17 06:19:31 sleepy VM: killing process run-crons
Nov 17 06:19:31 sleepy postfix/master[6783]: warning: process /usr/lib/postfix/qmgr pid 6813 killed by signal 9
Nov 17 06:19:31 sleepy postfix/master[6783]: warning: process /usr/lib/postfix/pickup pid 21982 killed by signal 9
Chris Ross <[email protected]> wrote:
>
> As I suspected, like a recalcitrant teenager it was sneakily waiting
> until everyone was out then it threw a wild party with several ooms and
> an oops. See below...
That's not an oops - it's just a stack trace.
> This, obviously is still without Kame's patch, just the same tree as
> before with the one change you asked for.
Please ignore the previous patch and try the below. It looks like Rik's
analysis is correct: when the caller doesn't have the swap token it just
cannot reclaim referenced pages and scans its way into an oom. Defeating
that logic when we've hit the highest scanning priority does seem to fix
the problem and those nice qsbench numbers which the thrashing control gave
us appear to be unaffected.
diff -puN mm/vmscan.c~vmscan-ignore-swap-token-when-in-trouble mm/vmscan.c
--- 25/mm/vmscan.c~vmscan-ignore-swap-token-when-in-trouble 2004-11-16 20:30:00.000000000 -0800
+++ 25-akpm/mm/vmscan.c 2004-11-16 20:30:00.000000000 -0800
@@ -380,7 +380,7 @@ static int shrink_list(struct list_head
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
- referenced = page_referenced(page, 1);
+ referenced = page_referenced(page, 1, sc->priority <= 0);
/* In active use or really unfreeable? Activate it. */
if (referenced && page_mapping_inuse(page))
goto activate_locked;
@@ -724,7 +724,7 @@ refill_inactive_zone(struct zone *zone,
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
- page_referenced(page, 0)) {
+ page_referenced(page, 0, sc->priority <= 0)) {
list_add(&page->lru, &l_active);
continue;
}
diff -puN mm/rmap.c~vmscan-ignore-swap-token-when-in-trouble mm/rmap.c
--- 25/mm/rmap.c~vmscan-ignore-swap-token-when-in-trouble 2004-11-16 20:30:00.000000000 -0800
+++ 25-akpm/mm/rmap.c 2004-11-16 20:30:00.000000000 -0800
@@ -248,7 +248,7 @@ unsigned long page_address_in_vma(struct
* repeatedly from either page_referenced_anon or page_referenced_file.
*/
static int page_referenced_one(struct page *page,
- struct vm_area_struct *vma, unsigned int *mapcount)
+ struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -288,7 +288,7 @@ static int page_referenced_one(struct pa
if (ptep_clear_flush_young(vma, address, pte))
referenced++;
- if (mm != current->mm && has_swap_token(mm))
+ if (mm != current->mm && !ignore_token && has_swap_token(mm))
referenced++;
(*mapcount)--;
@@ -301,7 +301,7 @@ out:
return referenced;
}
-static int page_referenced_anon(struct page *page)
+static int page_referenced_anon(struct page *page, int ignore_token)
{
unsigned int mapcount;
struct anon_vma *anon_vma;
@@ -314,7 +314,8 @@ static int page_referenced_anon(struct p
mapcount = page_mapcount(page);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- referenced += page_referenced_one(page, vma, &mapcount);
+ referenced += page_referenced_one(page, vma, &mapcount,
+ ignore_token);
if (!mapcount)
break;
}
@@ -333,7 +334,7 @@ static int page_referenced_anon(struct p
*
* This function is only called from page_referenced for object-based pages.
*/
-static int page_referenced_file(struct page *page)
+static int page_referenced_file(struct page *page, int ignore_token)
{
unsigned int mapcount;
struct address_space *mapping = page->mapping;
@@ -371,7 +372,8 @@ static int page_referenced_file(struct p
referenced++;
break;
}
- referenced += page_referenced_one(page, vma, &mapcount);
+ referenced += page_referenced_one(page, vma, &mapcount,
+ ignore_token);
if (!mapcount)
break;
}
@@ -388,7 +390,7 @@ static int page_referenced_file(struct p
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
*/
-int page_referenced(struct page *page, int is_locked)
+int page_referenced(struct page *page, int is_locked, int ignore_token)
{
int referenced = 0;
@@ -400,14 +402,15 @@ int page_referenced(struct page *page, i
if (page_mapped(page) && page->mapping) {
if (PageAnon(page))
- referenced += page_referenced_anon(page);
+ referenced += page_referenced_anon(page, ignore_token);
else if (is_locked)
- referenced += page_referenced_file(page);
+ referenced += page_referenced_file(page, ignore_token);
else if (TestSetPageLocked(page))
referenced++;
else {
if (page->mapping)
- referenced += page_referenced_file(page);
+ referenced += page_referenced_file(page,
+ ignore_token);
unlock_page(page);
}
}
diff -puN include/linux/rmap.h~vmscan-ignore-swap-token-when-in-trouble include/linux/rmap.h
--- 25/include/linux/rmap.h~vmscan-ignore-swap-token-when-in-trouble 2004-11-16 20:30:00.000000000 -0800
+++ 25-akpm/include/linux/rmap.h 2004-11-16 20:30:00.000000000 -0800
@@ -89,7 +89,7 @@ static inline void page_dup_rmap(struct
/*
* Called from mm/vmscan.c to handle paging out
*/
-int page_referenced(struct page *, int is_locked);
+int page_referenced(struct page *, int is_locked, int ignore_token);
int try_to_unmap(struct page *);
/*
@@ -103,7 +103,7 @@ unsigned long page_address_in_vma(struct
#define anon_vma_prepare(vma) (0)
#define anon_vma_link(vma) do {} while (0)
-#define page_referenced(page,l) TestClearPageReferenced(page)
+#define page_referenced(page,l,i) TestClearPageReferenced(page)
#define try_to_unmap(page) SWAP_FAIL
#endif /* CONFIG_MMU */
_
On Wed, Nov 17, 2004 at 01:23:46AM -0800, Andrew Morton wrote:
> Chris Ross <[email protected]> wrote:
> >
> > As I suspected, like a recalcitrant teenager it was sneakily waiting
> > until everyone was out then it threw a wild party with several ooms and
> > an oops. See below...
>
> That's not an oops - it's just a stack trace.
>
> > This, obviously is still without Kame's patch, just the same tree as
> > before with the one change you asked for.
>
> Please ignore the previous patch and try the below. It looks like Rik's
> analysis is correct: when the caller doesn't have the swap token it just
> cannot reclaim referenced pages and scans its way into an oom. Defeating
> that logic when we've hit the highest scanning priority does seem to fix
> the problem and those nice qsbench numbers which the thrashing control gave
> us appear to be unaffected.
Oh, this fixes my testcase, and was the reason for the hog's slow speed.
Excellent, wasted several days in vain. :(
> diff -puN mm/vmscan.c~vmscan-ignore-swap-token-when-in-trouble mm/vmscan.c
> --- 25/mm/vmscan.c~vmscan-ignore-swap-token-when-in-trouble 2004-11-16 20:30:00.000000000 -0800
> +++ 25-akpm/mm/vmscan.c 2004-11-16 20:30:00.000000000 -0800
> @@ -380,7 +380,7 @@ static int shrink_list(struct list_head
> if (page_mapped(page) || PageSwapCache(page))
> sc->nr_scanned++;
>
> - referenced = page_referenced(page, 1);
> + referenced = page_referenced(page, 1, sc->priority <= 0);
> /* In active use or really unfreeable? Activate it. */
> if (referenced && page_mapping_inuse(page))
> goto activate_locked;
> @@ -724,7 +724,7 @@ refill_inactive_zone(struct zone *zone,
> if (page_mapped(page)) {
> if (!reclaim_mapped ||
> (total_swap_pages == 0 && PageAnon(page)) ||
> - page_referenced(page, 0)) {
> + page_referenced(page, 0, sc->priority <= 0)) {
> list_add(&page->lru, &l_active);
> continue;
> }
> diff -puN mm/rmap.c~vmscan-ignore-swap-token-when-in-trouble mm/rmap.c
> --- 25/mm/rmap.c~vmscan-ignore-swap-token-when-in-trouble 2004-11-16 20:30:00.000000000 -0800
> +++ 25-akpm/mm/rmap.c 2004-11-16 20:30:00.000000000 -0800
> @@ -248,7 +248,7 @@ unsigned long page_address_in_vma(struct
> * repeatedly from either page_referenced_anon or page_referenced_file.
> */
> static int page_referenced_one(struct page *page,
> - struct vm_area_struct *vma, unsigned int *mapcount)
> + struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token)
> {
> struct mm_struct *mm = vma->vm_mm;
> unsigned long address;
> @@ -288,7 +288,7 @@ static int page_referenced_one(struct pa
> if (ptep_clear_flush_young(vma, address, pte))
> referenced++;
>
> - if (mm != current->mm && has_swap_token(mm))
> + if (mm != current->mm && !ignore_token && has_swap_token(mm))
> referenced++;
>
> (*mapcount)--;
> @@ -301,7 +301,7 @@ out:
> return referenced;
> }
>
> -static int page_referenced_anon(struct page *page)
> +static int page_referenced_anon(struct page *page, int ignore_token)
> {
> unsigned int mapcount;
> struct anon_vma *anon_vma;
> @@ -314,7 +314,8 @@ static int page_referenced_anon(struct p
>
> mapcount = page_mapcount(page);
> list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
> - referenced += page_referenced_one(page, vma, &mapcount);
> + referenced += page_referenced_one(page, vma, &mapcount,
> + ignore_token);
> if (!mapcount)
> break;
> }
> @@ -333,7 +334,7 @@ static int page_referenced_anon(struct p
> *
> * This function is only called from page_referenced for object-based pages.
> */
> -static int page_referenced_file(struct page *page)
> +static int page_referenced_file(struct page *page, int ignore_token)
> {
> unsigned int mapcount;
> struct address_space *mapping = page->mapping;
> @@ -371,7 +372,8 @@ static int page_referenced_file(struct p
> referenced++;
> break;
> }
> - referenced += page_referenced_one(page, vma, &mapcount);
> + referenced += page_referenced_one(page, vma, &mapcount,
> + ignore_token);
> if (!mapcount)
> break;
> }
> @@ -388,7 +390,7 @@ static int page_referenced_file(struct p
> * Quick test_and_clear_referenced for all mappings to a page,
> * returns the number of ptes which referenced the page.
> */
> -int page_referenced(struct page *page, int is_locked)
> +int page_referenced(struct page *page, int is_locked, int ignore_token)
> {
> int referenced = 0;
>
> @@ -400,14 +402,15 @@ int page_referenced(struct page *page, i
>
> if (page_mapped(page) && page->mapping) {
> if (PageAnon(page))
> - referenced += page_referenced_anon(page);
> + referenced += page_referenced_anon(page, ignore_token);
> else if (is_locked)
> - referenced += page_referenced_file(page);
> + referenced += page_referenced_file(page, ignore_token);
> else if (TestSetPageLocked(page))
> referenced++;
> else {
> if (page->mapping)
> - referenced += page_referenced_file(page);
> + referenced += page_referenced_file(page,
> + ignore_token);
> unlock_page(page);
> }
> }
> diff -puN include/linux/rmap.h~vmscan-ignore-swap-token-when-in-trouble include/linux/rmap.h
> --- 25/include/linux/rmap.h~vmscan-ignore-swap-token-when-in-trouble 2004-11-16 20:30:00.000000000 -0800
> +++ 25-akpm/include/linux/rmap.h 2004-11-16 20:30:00.000000000 -0800
> @@ -89,7 +89,7 @@ static inline void page_dup_rmap(struct
> /*
> * Called from mm/vmscan.c to handle paging out
> */
> -int page_referenced(struct page *, int is_locked);
> +int page_referenced(struct page *, int is_locked, int ignore_token);
> int try_to_unmap(struct page *);
>
> /*
> @@ -103,7 +103,7 @@ unsigned long page_address_in_vma(struct
> #define anon_vma_prepare(vma) (0)
> #define anon_vma_link(vma) do {} while (0)
>
> -#define page_referenced(page,l) TestClearPageReferenced(page)
> +#define page_referenced(page,l,i) TestClearPageReferenced(page)
> #define try_to_unmap(page) SWAP_FAIL
>
> #endif /* CONFIG_MMU */
> _
On Wed, Nov 17, 2004 at 04:06:48AM -0200, Marcelo Tosatti wrote:
> On Wed, Nov 17, 2004 at 01:23:46AM -0800, Andrew Morton wrote:
> > Chris Ross <[email protected]> wrote:
> > >
> > > As I suspected, like a recalcitrant teenager it was sneakily waiting
> > > until everyone was out then it threw a wild party with several ooms and
> > > an oops. See below...
> >
> > That's not an oops - it's just a stack trace.
> >
> > > This, obviously is still without Kame's patch, just the same tree as
> > > before with the one change you asked for.
> >
> > Please ignore the previous patch and try the below. It looks like Rik's
> > analysis is correct: when the caller doesn't have the swap token it just
> > cannot reclaim referenced pages and scans its way into an oom. Defeating
> > that logic when we've hit the highest scanning priority does seem to fix
> > the problem and those nice qsbench numbers which the thrashing control gave
> > us appear to be unaffected.
>
> Oh, this fixes my testcase, and was the reason for the hog slow speed.
>
> Excellent, wasted several days in vain. :(
Before the swap token patches went in, do you remember spurious OOM reports,
or were things working fine then?
Marcelo Tosatti <[email protected]> wrote:
>
> Before the swap token patches went in you remember spurious OOM reports
> or things were working fine then?
Hard to say, really. Yes, I think the frequency of reports has increased a
bit in the last month or two. But it's always been a really small number
of people so that may not be statistically significant.
umm,
there was one report in January (seems to be a kernel memory leak)
One in February (ditto)
One in June
Two in July (one was with no swap, one with laptop_mode)
A few in August, but mainly due to the CDROM memory leak.
On August 23 the thrashing control was added.
After that, two or three people have been reporting it.
So yes, we do seem to have gone from basically zero reports up to a trickle.
On Wed, Nov 17, 2004 at 04:08:52AM -0200, Marcelo Tosatti wrote:
> On Wed, Nov 17, 2004 at 04:06:48AM -0200, Marcelo Tosatti wrote:
> > On Wed, Nov 17, 2004 at 01:23:46AM -0800, Andrew Morton wrote:
> > > Chris Ross <[email protected]> wrote:
> > > >
> > > > As I suspected, like a recalcitrant teenager it was sneakily waiting
> > > > until everyone was out then it threw a wild party with several ooms and
> > > > an oops. See below...
> > >
> > > That's not an oops - it's just a stack trace.
> > >
> > > > This, obviously is still without Kame's patch, just the same tree as
> > > > before with the one change you asked for.
> > >
> > > Please ignore the previous patch and try the below. It looks like Rik's
> > > analysis is correct: when the caller doesn't have the swap token it just
> > > cannot reclaim referenced pages and scans its way into an oom. Defeating
> > > that logic when we've hit the highest scanning priority does seem to fix
> > > the problem and those nice qsbench numbers which the thrashing control gave
> > > us appear to be unaffected.
> >
> > Oh, this fixes my testcase, and was the reason for the hog slow speed.
> >
> > Excellent, wasted several days in vain. :(
>
> Before the swap token patches went in you remember spurious OOM reports
> or things were working fine then?
Just went on through the archives and indeed the spurious OOM kills started
happening when the swap token code was added to the tree.
Next time I should be looking into the easy stuff before trying miraculous
solutions. :(
Marcelo Tosatti escreveu:
> On Wed, Nov 17, 2004 at 04:06:48AM -0200, Marcelo Tosatti wrote:
> Before the swap token patches went in you remember spurious OOM reports
> or things were working fine then?
The oom killer problems arose before and independently of the
token-based-thrashing patches. I know this because I took a special
interest in the tbtc patches too (which is why my test machine came to
have 64MB RAM but 1GB swap).
Regards,
Chris R.
Marcelo Tosatti escreveu:
> On Wed, Nov 17, 2004 at 04:08:52AM -0200, Marcelo Tosatti wrote:
> Just went on through the archives and indeed the spurious OOM kills started
> happening when the swap token code was added to the tree.
The LKML archives? We had been discussing this on Con Kolivas's list
previously where we determined it was a problem in mainline so on Con's
suggestion I signed up to LKML and discussed it here.
However, looking back through my mailbox the first report I made was
2.6.8.1-ck3 *with the tbtc patches added*
I'm away to try 2.6.8.1 without....
Later,
Chris
On Wed, Nov 17, 2004 at 11:50:36AM +0100, Chris Ross wrote:
>
> Marcelo Tosatti escreveu:
> >On Wed, Nov 17, 2004 at 04:06:48AM -0200, Marcelo Tosatti wrote:
> >Before the swap token patches went in you remember spurious OOM reports
> >or things were working fine then?
>
> The oom killer problems arose before and independently of the
> token-based-thrashing patches. I know this because I took a special
> interest in the tbtc patches too (which is why my test machine came to
> have 64MB RAM but 1GB swap).
So even when reaping referenced pages on zero priority scanning
the OOM killer might be triggered in extreme cases. And as the
number of tasks increases the chances things go wrong increase.
Please test Andrew's patch; it's hopefully good enough for most
scenarios. Extreme cases will probably still be problematic.
What are the "tbtc" patches ?
Your testing is of huge value Chris. Thanks.
Marcelo Tosatti escreveu:
> Please test Andrew's patch, its hopefully good enough for most
> scenarios. Extreme cases are probably still be problematic.
Will do, though currently testing 2.6.8.1 and it goes without saying
this is a slow machine.
> What are the "tbtc" patches ?
Token based thrashing control from Rik van Riel
http://marc.theaimsgroup.com/?l=linux-kernel&m=109122597407401&w=2
Regards,
Chris R.
On Wed, 17 Nov 2004, Marcelo Tosatti wrote:
> So even when reaping referenced pages on zero priority scanning
> the OOM killer might be triggered in extreme cases. And as the
> number of tasks increases the chances things go wrong increase.
Ignoring the referenced bit should also make the system
"immune" from the "fake" references that the swap token
holding task will show.
However, since it also ignores real referenced bits, it
might mess up VM balancing a bit, which is what Andrew
was worried about.
> Please test Andrew's patch, its hopefully good enough for most
> scenarios. Extreme cases are probably still be problematic.
Actually, because Andrew's patch makes the real referenced
bit stand, but ignores the "fake" reference that the swap
token gives, I suspect it'll make this swap token corner
case a lot "softer" than my patch.
--
"Debugging is twice as hard as writing the code in the first place.
Therefore, if you write the code as cleverly as possible, you are,
by definition, not smart enough to debug it." - Brian W. Kernighan
Andrew Morton escreveu:
> Please ignore the previous patch and try the below.
Running 2.6.10-rc2-mm1 with just your new patch. It's got through the
first tests, building umlsim whilst simultaneously doing an 'emerge
sync' (this is a Gentoo box). I'll now try harder to break it.
> It looks like Rik's analysis is correct: when the caller doesn't have
> the swap token it just cannot reclaim referenced pages and scans its
> way into an oom. Defeating that logic when we've hit the highest
> scanning priority does seem to fix the problem and those nice qsbench
> numbers which the thrashing control gave us appear to be unaffected.
I assume Rik's analysis was not copied to the list? If it was I missed
it. Is your summary fairly complete?
Regards,
Chris R.
Martin MOKREJ__ <[email protected]> wrote:
>
> I'm sorry for the delay. I had to re-invent my old memory tests.
> I have just compared 2.4.26-gentoo-r9 kernel, plain 2.4.28 and
> plain 2.6.10-rc2. The last has OOM problems, as it kills unnecessarilly
> 2 xterms "in addition" to the application which caused memory outsourcing.
>
> I'm not sure which patches sent to the list last days still make
> sense to be tested.
Please test
ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.10-rc2/2.6.10-rc2-mm2/broken-out/vmscan-ignore-swap-token-when-in-trouble.patch
against 2.6.10-rc2, or latest -linus.
Hi,
I just tested 2.6.7 and it works comparably to 2.4.28.
swpd value reported by vmstat is actually 1172736,
actually inspecting the previous results from 2.4.28
show also this number. But "swapon -s" reports 1172732.
I'm puzzled. vmstat comes from gentoo procps-3.2.3-r1.
Anyway, plain 2.6.7 kills only the application asking for
so much memory and logs via syslog:
Out of Memory: Killed process 58888 (RNAsubopt)
It's a lot better compared to what we have in 2.6.10-rc2,
from my user's view.
I cannot easily reverse the tbtc patch from the 2.6.10-rc2 tree,
but applying the tbtc patch over 2.6.7 gives me the same behaviour
as on plain 2.6.7 - so only the RNAsubopt application gets
killed.
The problem must have been introduced between 2.6.7
and 2.6.10-rc2 but is not directly related to the tbtc patch in its
original form:
http://marc.theaimsgroup.com/?l=linux-kernel&m=109122597407401&w=2
Hope this helps.
Martin
Martin MOKREJ__ <[email protected]> wrote:
>
> Anyway, plain 2.6.7 kills only the application asking for
> so much memory and logs via syslog:
> Out of Memory: Killed process 58888 (RNAsubopt)
>
> It's a lot better compared to what we have in 2.6.10-rc2,
> from my user's view.
We haven't made any changes to the oom-killer algorithm since July 2003.
Weird.
On Thu, Nov 18, 2004 at 04:08:24PM -0800, Andrew Morton wrote:
> Martin MOKREJ__ <[email protected]> wrote:
> >
> > Anyway, plain 2.6.7 kills only the application asking for
> > so much memory and logs via syslog:
> > Out of Memory: Killed process 58888 (RNAsubopt)
> >
> > It's a lot better compared to what we have in 2.6.10-rc2,
> > from my user's view.
>
> We haven't made any changes to the oom-killer algorithm since July 2003.
> Weird.
As Thomas Gleixner has investigated, the OOM killer selection is problematic.
When testing your ignore-page-referenced patch it first killed the memory hog
then shortly afterwards the shell I was running it on.
You've seen Thomas emails, he has nice description there.
On Fri, 2004-11-19 at 06:09 -0200, Marcelo Tosatti wrote:
> As Thomas Gleixner has investigated, the OOM killer selection is problematic.
>
> When testing your ignore-page-referenced patch it first killed the memory hog
> then shortly afterwards the shell I was running it on.
>
> You've seen Thomas emails, he has nice description there.
I had another go on 2.6.10-rc2-mm2.
The reentrancy blocking and the additional test of freepages in
out_of_memory() make all the ugly time and counter checks superfluous.
I think they were necessary to make the spurious kill triggering less
obvious. :)
Can somebody else check with his test cases, if the behaviour is
correct ?
tglx
On Sat, 2004-11-20 at 00:30 +0100, Martin MOKREJ? wrote:
> OK, I can say kernel 2.6.7, 2.6.8.1, 2.6.9-rc1 kill just the RNAsubopt
> application in my test.
>
> 2.6.9-rc2 kills RNAsubopt and also two xterms, one running vmstat,
> the other is parent of RNAsubopt program I used to eat memory with.
>
> 2.6.9 has killed just those 2 xterms, as a sideeffect the RNAsubopt
> got killed as parent shell got killed.
>
> 2.6.10-rc1 killed all three.
>
> I conclude the major problem got introduced between
> 2.6.9-rc1 and 2.6.9-rc2.
Can you please try 2.6.10-rc2-mm2 + the patch I posted last night?
It will still kill RNAsubopt, but it should no longer touch the xterm
which runs vmstat.
> Second problem with 2.6 tree is that I think application should receive
> some errocode when asking for more memory, so it can exit itself.
> This used to work well under 2.4 tree and was demostrated in my
> previous reports where you see application exist with "not enough memory"
> rather than with "Killed". ;-)
One good reason might be that in the out-of-memory situation the system
has no idea whether the requester will gracefully shut down when
receiving ENOMEM or keep trying to get some more memory.
The decision to return ENOMEM or finally calling the oom-killer depends
on the flags for this allocation request. The criteria are __GFP_FS set
and not __GFP_NORETRY set.
So all allocations GFP_KERNEL, GFP_USER and GFP_HIGHUSER are candidates
to end up in the oom_killer. The only caller which ever sets the
__GFP_NORETRY flag is fs/xfs.
tglx
Thomas Gleixner wrote:
> On Sat, 2004-11-20 at 00:30 +0100, Martin MOKREJ? wrote:
>
>>OK, I can say kernel 2.6.7, 2.6.8.1, 2.6.9-rc1 kill just the RNAsubopt
>>application in my test.
>>
>>2.6.9-rc2 kills RNAsubopt and also two xterms, one running vmstat,
>>the other is parent of RNAsubopt program I used to eat memory with.
>>
>>2.6.9 has killed just those 2 xterms, as a sideeffect the RNAsubopt
>>got killed as parent shell got killed.
>>
>>2.6.10-rc1 killed all three.
>>
>>I conclude the major problem got introduced between
>>2.6.9-rc1 and 2.6.9-rc2.
>
>
> Can you please try 2.6.10-rc2-mm2 + the patch I posted yesterday night ?
> It will still kill RNAsubopt, but it should not longer touch the xterm,
> which runs vmstat.
Will do in a minute.
>
>
>>Second problem with 2.6 tree is that I think application should receive
>>some errocode when asking for more memory, so it can exit itself.
>>This used to work well under 2.4 tree and was demostrated in my
>>previous reports where you see application exist with "not enough memory"
>>rather than with "Killed". ;-)
>
>
> One good reason might be that in the out of memory situation the system
> has no idea, whether the requester will gracefully shutdown when
> recieving ENOMEM or keep trying to get some more memory.
>
> The decision to return ENOMEM or finally calling the oom-killer depends
> on the flags for this allocation request. The criteria are __GFP_FS set
> and not __GFP_NORETRY set.
>
> So all allocations GFP_KERNEL, GFP_USER and GFP_HIGHUSER are candidates
> to end up in the oom_killer. The only caller which ever sets the
> __GFP_NORETRY flag is fs/xfs.
I don't understand ANSI C ;) ... so just tell me why the kernel doesn't
try the "old" way as on the 2.4 kernel; if the application wouldn't get killed,
I don't care that I have to wait another 10 seconds to get it killed.
The application will close its files, remove tempfiles etc.
The sources of the program are available. Can you have a brief look into
the code and tell me whether this application does or does not treat those
memory allocations in whatever that *GFP* thing is? ;-) In other words,
is this application written correctly or not? Does it have to be killed,
or should it also exit well on the 2.6 kernel (note, it does exit on 2.4 ...)?
Thanks for the help!
Martin
BTW: my last 2 relies got lost on the way through lkml. :((
My SMTPserver said:
@40000000419f13a321bbf6a4 status: local 0/10 remote 9/20
@40000000419f13a41cea7d1c delivery 9: success: ([email protected])_213.239.205.147_accepted_message./Remote_host_said:_250_Ok
:_queued_as_1D47365C044/
@40000000419f13a41cebadcc status: local 0/10 remote 8/20
@40000000419f13a504926324 delivery 3: success: ([email protected])_82.161.9.107_accepted_message./Remote_host_said:_250_Ok:_q
ueued_as_A38AD684/
@40000000419f13a504937c64 status: local 0/10 remote 7/20
@40000000419f13a508f3214c delivery 5: failure: 130.57.1.11_does_not_like_recipient./Remote_host_said:_550_5.1.1_<andrea@novell
.com>_is_not_a_valid_mailbox._550_No_such_recipient/Giving_up_on_130.57.1.11./
@40000000419f13a508f8bae4 status: local 0/10 remote 6/20
@40000000419f13a602653a54 delivery 8: success: ([email protected])_66.187.233.32_accepted_message./Remote_host_said:_250_2.0.0_i
AK9pMqT000648_Message_accepted_for_delivery/
@40000000419f13a602666b04 status: local 0/10 remote 5/20
@40000000419f13a63041b8bc delivery 6: success: ([email protected])_12.107.209.244_accepted_message./Remote_host_sai
d:_250_2.7.0_nothing_apparently_wrong_in_the_message.;_S261490AbUKTJvY/
@40000000419f13a63042ddb4 status: local 0/10 remote 4/20
@40000000419f13a8042341ec delivery 7: success: ([email protected])_66.96.29.28_accepted_message./Remote_host_said:_250_2.6.0_
S26517AbUKTJvQ_message_accepted/
@40000000419f13a804248a0c status: local 0/10 remote 3/20
@40000000419f13a8181adc3c delivery 1: success: ([email protected])_65.172.181.4_accepted_message./Remote_host_said:_250_2.0.0_iAK9
pMPE008452_Message_accepted_for_delivery/
@40000000419f13a8181af794 status: local 0/10 remote 2/20
@40000000419f13aa22abd8cc delivery 2: success: ([email protected])_203.29.91.92_accepted_message./Remote_host_said:_250_O
K_id=1CVRtv-0003cD-LZ/
@40000000419f13aa22ad191c status: local 0/10 remote 1/20
@40000000419f13ac38f1240c delivery 4: success: ([email protected])_64.186.161.6_accepted_message./Remote_host_said:
_250_Ok:_queued_as_041F080041B/
oom-killer: gfp_mask=0xd2
DMA per-cpu:
cpu 0 hot: low 2, high 6, batch 1
cpu 0 cold: low 0, high 2, batch 1
Normal per-cpu:
cpu 0 hot: low 32, high 96, batch 16
cpu 0 cold: low 0, high 32, batch 16
HighMem per-cpu:
cpu 0 hot: low 14, high 42, batch 7
cpu 0 cold: low 0, high 14, batch 7
Free pages: 3916kB (112kB HighMem)
Active:131900 inactive:121851 dirty:0 writeback:0 unstable:0 free:979 slab:1918 mapped:253638 pagetables:794
DMA free:68kB min:68kB low:84kB high:100kB active:5388kB inactive:5432kB present:16384kB pages_scanned:12292 all_unreclaimable? yes
protections[]: 0 0 0
Normal free:3736kB min:3756kB low:4692kB high:5632kB active:457320kB inactive:416944kB present:901120kB pages_scanned:906311 all_unreclaimable? yes
protections[]: 0 0 0
HighMem free:112kB min:128kB low:160kB high:192kB active:64892kB inactive:65028kB present:131044kB pages_scanned:134185 all_unreclaimable? yes
protections[]: 0 0 0
DMA: 1*4kB 0*8kB 0*16kB 0*32kB 1*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 68kB
Normal: 0*4kB 1*8kB 1*16kB 0*32kB 0*64kB 1*128kB 0*256kB 1*512kB 1*1024kB 1*2048kB 0*4096kB = 3736kB
HighMem: 0*4kB 0*8kB 1*16kB 1*32kB 1*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 112kB
Swap cache: add 293354, delete 293354, find 65/84, race 0+0
Out of Memory: Killed process 6612 (RNAsubopt).
Out of Memory: Killed process 6603 (bash).
oom-killer: gfp_mask=0x1d2
DMA per-cpu:
cpu 0 hot: low 2, high 6, batch 1
cpu 0 cold: low 0, high 2, batch 1
Normal per-cpu:
cpu 0 hot: low 32, high 96, batch 16
cpu 0 cold: low 0, high 32, batch 16
HighMem per-cpu:
cpu 0 hot: low 14, high 42, batch 7
cpu 0 cold: low 0, high 14, batch 7
Free pages: 3916kB (112kB HighMem)
Active:131832 inactive:121892 dirty:0 writeback:38 unstable:0 free:979 slab:1918 mapped:253573 pagetables:790
DMA free:68kB min:68kB low:84kB high:100kB active:5388kB inactive:5432kB present:16384kB pages_scanned:12292 all_unreclaimable? yes
protections[]: 0 0 0
Normal free:3736kB min:3756kB low:4692kB high:5632kB active:457080kB inactive:417076kB present:901120kB pages_scanned:906479 all_unreclaimable? yes
protections[]: 0 0 0
HighMem free:112kB min:128kB low:160kB high:192kB active:64860kB inactive:65060kB present:131044kB pages_scanned:134217 all_unreclaimable? yes
protections[]: 0 0 0
DMA: 1*4kB 0*8kB 0*16kB 0*32kB 1*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 68kB
Normal: 0*4kB 1*8kB 1*16kB 0*32kB 0*64kB 1*128kB 0*256kB 1*512kB 1*1024kB 1*2048kB 0*4096kB = 3736kB
HighMem: 0*4kB 0*8kB 1*16kB 1*32kB 1*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 112kB
Swap cache: add 293419, delete 293381, find 65/84, race 0+0
Out of Memory: Killed process 6598 (FvwmPager).
Out of Memory: Killed process 6599 (xterm).
Out of Memory: Killed process 6606 (xterm).
Out of Memory: Killed process 6564 (fvwm2).
mtrr: no MTRR for d8000000,2000000 found
[drm:radeon_cp_init] *ERROR* radeon_cp_init called without lock held
[drm:drm_unlock] *ERROR* Process 6536 using kernel context 0
On Sat, 2004-11-20 at 12:29 +0100, Martin MOKREJ? wrote:
> > Can you please try 2.6.10-rc2-mm2 + the patch I posted yesterday night ?
> > It will still kill RNAsubopt, but it should not longer touch the xterm,
> > which runs vmstat.
>
> No, it doesn't help at all! See attached file.
Strange. I have no idea what's going on there.
> oom-killer: gfp_mask=0xd2
> Free pages: 3916kB (112kB HighMem)
> Out of Memory: Killed process 6612 (RNAsubopt).
> Out of Memory: Killed process 6603 (bash).
This one is expected, but the next one is complete nonsense. After
killing RNAsubopt and bash the number of free pages must be greater than
before.
> oom-killer: gfp_mask=0x1d2
> Free pages: 3916kB (112kB HighMem)
> Out of Memory: Killed process 6598 (FvwmPager).
> Out of Memory: Killed process 6599 (xterm).
> Out of Memory: Killed process 6606 (xterm).
> Out of Memory: Killed process 6564 (fvwm2).
Damn, removing the timer/counter stuff in there was not the brightest
idea. The process needs some time to be recycled.
I moved the ugly timer counter check back in. Could you please try
again ?
It should only kill RNAsubopt and bash and touch nothing else.
tglx
diff -urN 2.6.10-rc2-mm2.orig/mm/oom_kill.c 2.6.10-rc2-mm2/mm/oom_kill.c
--- 2.6.10-rc2-mm2.orig/mm/oom_kill.c 2004-11-19 14:52:16.000000000 +0100
+++ 2.6.10-rc2-mm2/mm/oom_kill.c 2004-11-20 14:21:49.000000000 +0100
@@ -45,8 +45,10 @@
static unsigned long badness(struct task_struct *p, unsigned long uptime)
{
unsigned long points, cpu_time, run_time, s;
+ struct list_head *tsk;
- if (!p->mm)
+ /* Ignore mm-less tasks and init */
+ if (!p->mm || p->pid == 1)
return 0;
if (p->flags & PF_MEMDIE)
@@ -57,6 +59,19 @@
points = p->mm->total_vm;
/*
+ * Processes which fork a lot of child processes are likely
+ * a good choice. We add the vmsize of the childs if they
+ * have an own mm. This prevents forking servers to flood the
+ * machine with an endless amount of childs
+ */
+ list_for_each(tsk, &p->children) {
+ struct task_struct *chld;
+ chld = list_entry(tsk, struct task_struct, sibling);
+ if (chld->mm != p->mm && chld->mm)
+ points += chld->mm->total_vm;
+ }
+
+ /*
* CPU time is in tens of seconds and run time is in thousands
* of seconds. There is no particular reason for this other than
* that it turned out to work very well in practice.
@@ -176,6 +191,27 @@
return mm;
}
+static struct mm_struct *oom_kill_process(task_t *p)
+{
+ struct mm_struct *mm;
+ struct task_struct *g, *q;
+
+ mm = oom_kill_task(p);
+ if (!mm)
+ return NULL;
+ /*
+ * kill all processes that share the ->mm (i.e. all threads),
+ * but are in a different thread group
+ */
+ do_each_thread(g, q)
+ if (q->mm == mm && q->tgid != p->tgid)
+ __oom_kill_task(q);
+
+ while_each_thread(g, q);
+ if (!p->mm)
+ printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
+ return mm;
+}
/**
* oom_kill - kill the "best" process when we run out of memory
@@ -188,7 +224,9 @@
void oom_kill(void)
{
struct mm_struct *mm;
- struct task_struct *g, *p, *q;
+ struct task_struct *c, *p;
+ struct list_head *tsk;
+ int mmcnt = 0;
read_lock(&tasklist_lock);
retry:
@@ -200,21 +238,25 @@
panic("Out of memory and no killable processes...\n");
}
- mm = oom_kill_task(p);
- if (!mm)
- goto retry;
/*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * Kill the child processes first
*/
- do_each_thread(g, q)
- if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q);
- while_each_thread(g, q);
- if (!p->mm)
- printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
+ list_for_each(tsk, &p->children) {
+ c = list_entry(tsk, struct task_struct, sibling);
+ if (c->mm == p->mm)
+ continue;
+ mm = oom_kill_process(c);
+ if (mm) {
+ mmcnt ++;
+ mmput(mm);
+ }
+ }
+ mm = oom_kill_process(p);
+ if (!mmcnt && !mm)
+ goto retry;
+ if (mm)
+ mmput(mm);
read_unlock(&tasklist_lock);
- mmput(mm);
return;
}
@@ -224,14 +266,22 @@
void out_of_memory(int gfp_mask)
{
/*
- * oom_lock protects out_of_memory()'s static variables.
- * It's a global lock; this is not performance-critical.
- */
- static DEFINE_SPINLOCK(oom_lock);
+ * inprogress protects out_of_memory()'s static variables.
+ * We don't use a spin_lock here, as spinlocks are
+ * nops on UP systems.
+ */
+ static unsigned long inprogress;
+ static unsigned int freepages = 1000000;
static unsigned long first, last, count, lastkill;
unsigned long now, since;
- spin_lock(&oom_lock);
+ if (test_and_set_bit(0, &inprogress))
+ return;
+
+ /* Check, if memory was freed since the last oom kill */
+ if (freepages < nr_free_pages())
+ goto out_unlock;
+
now = jiffies;
since = now - last;
last = now;
@@ -271,12 +321,10 @@
* Ok, really out of memory. Kill something.
*/
lastkill = now;
-
printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
show_free_areas();
-
- /* oom_kill() sleeps */
- spin_unlock(&oom_lock);
+ /* Store free pages * 2 for the check above */
+ freepages = (nr_free_pages() << 1);
oom_kill();
/*
* Make kswapd go out of the way, so "p" has a good chance of
@@ -284,7 +332,6 @@
* for more memory.
*/
yield();
- spin_lock(&oom_lock);
reset:
/*
@@ -294,7 +341,7 @@
if (time_after(now, first))
first = now;
count = 0;
-
+
out_unlock:
- spin_unlock(&oom_lock);
+ clear_bit(0, &inprogress);
}
On Sat, 2004-11-20 at 22:19 +0100, Martin MOKREJ? wrote:
> > It should only kill RNAsubopt and bash and touch nothing else.
>
> Yes, that's true, this patch has helped. Actually the other xterm got
> closed, but that's because bash is the controlling application of it.
> I believe that's expected.
>
> I'd prefer to get only RNAsubopt killed. ;)
Ok. To kill only RNAsubopt it might be necessary to refine the criteria
in the whom-to-kill selection.
> And still, there weren't
> that many changes to memory management between 2.6.9-rc1 and 2.6.9-rc2. ;)
> I can test those VM changes separately, if someone would provide me with
> those changes split into 2 or 3 patchsets.
The oom trouble was definitely not invented there. The change between
2.6.9-rc1 and rc2 just triggers your special testcase.
Other testcases show the problems with earlier 2.6 kernels too.
tglx
Thomas Gleixner wrote:
> On Sat, 2004-11-20 at 22:19 +0100, Martin MOKREJ? wrote:
>
>>>It should only kill RNAsubopt and bash and touch nothing else.
>>
>>Yes, that's true, this patch has helped. Actually the other xterm got
>>closed, but that's because bash is the controlling application of it.
>>I believe that's expected.
>>
>>I'd prefer to get only RNAsubopt killed. ;)
>
>
> Ok. To kill only RNAsubopt it might be neccecary to refine the criteria
> in the whom to kill selection.
Why can't the algorithm first find the process asking for memory now?
When found, the kernel should first kill its children, wait some time,
then kill this process if it still exists (it might exit itself when the
children get closed).
You have said it's safer to kill than to send ENOMEM as happens
in 2.4, but I still don't understand why the kernel doesn't first send
ENOMEM, and only if that doesn't help start the OOM killer after those
5 seconds, trying to kill the very same application.
I don't get the idea why to kill immediately.
As has happened to me in the past, the random OOM selection has killed
sshd or init; I believe the algorithm should be improved not to try
to kill these. First of all, sshd is well tested, so it will never
be a source of memleaks. Second, if the algorithm would really insist on
killing either of these, I would personally prefer it do a clean reboot
rather than leave a system in a state without sshd. I have to get to the
console. Actually, it's several kilometers away for me. :(
>
>
>>And still, there weren't
>>that many changes to memory management between 2.6.9-rc1 and 2.6.9-rc2. ;)
>>I can test those VM changes separately, if someone would provide me with
>>those changes split into 2 or 3 patchsets.
>
>
> The oom trouble was definitly not invented there. The change between
> 2.6.9-rc1 and rc2 is justs triggering your special testcase.
>
> Other testcases show the problems with earlier 2.6 kernels too.
It's a pity no-one has time to at least figure out why those changes have
exposed this stupid random part of the algorithm. Before 2.6.9-rc2 the
OOM killer was also started in my tests, but it worked deterministically.
I wouldn't prefer an extra algorithm to check what we kill now; I'd rather look
at why we kill randomly since -rc2.
Of course your patch is still helpful, sure.
Martin
On Sun, 2004-11-21 at 13:17 +0100, Martin MOKREJ? wrote:
> Why can't the algorithm first find the asking for memory now.
> When found, kernel should kill first it's children, wait some time,
> then kill this process if still exists (it might exit itself when children
> get closed).
> You have said it's safer to kill that to send ENOMEM as happens
> in 2.4, but I still don't undertand why kernel first doesn't send
> ENOMEM, and only if that doesn't help it can start after those 5 seconds
> OOM killer, and try to kill the very same application.
> I don't get the idea why to kill immediately.
I see your concern. There are some more changes necessary to make this
work reliably. I'm not sure if it can be done without really big
changes. I will look a bit deeper into this.
> As it has happened to me in the past, that random OOM selection has killed
> sshd or init, I believe the algorithm should be improved to not to try
> to kill these. First of all, sshd is well tested, so it will never
> be source of memleaks. Second, if the algorithm would really insist on
> killing either of these, I personally prefer it rather do clean reboot
> than a system in a state without sshd. I have to get to the console.
> Actually, it's several kilometers for me. :(
Yeah, I observed this too and therefore came up with the whom-to-kill and
reentrancy patch.
> It's a pitty no-one has time to at least figure out why those changes have
> exposed this stupid random part of the algorithm. Before 2.6.9-rc2
> OOM killer was also started in my tests, but it worked deterministically.
> I wouldn't prefer extra algorithm to check what we kill now, I'd rather look
> why we kill randomly since -rc2.
As I said before, the random behaviour was _not_ introduced in -rc2. It
might have changed in -rc2. The random kill with overkill can also be
triggered in 2.6.7 and 2.6.8. I have not tried older versions though.
tglx
Andrew Morton escreveu:
> Please ignore the previous patch and try the below.
I still get OOM kills with this (well one, anyway). It does seem harder
to trigger though.
Regards,
Chris R.
Nov 20 03:24:46 sleepy cpu 0 hot: low 4, high 12, batch 2
Nov 20 03:24:46 sleepy cpu 0 cold: low 0, high 4, batch 2
Nov 20 03:24:46 sleepy HighMem per-cpu: empty
Nov 20 03:24:46 sleepy
Nov 20 03:24:46 sleepy Free pages: 1012kB (0kB HighMem)
Nov 20 03:24:46 sleepy Active:6290 inactive:6215 dirty:0 writeback:0
unstable:0 free:253 slab:1324 mapped:1 pagetables:56
Nov 20 03:24:46 sleepy DMA free:252kB min:252kB low:312kB high:376kB
active:5700
kB inactive:5540kB present:16384kB pages_scanned:15950
all_unreclaimable? yes
Nov 20 03:24:46 sleepy protections[]: 0 0 0
Nov 20 03:24:46 sleepy Normal free:760kB min:764kB low:952kB high:1144kB
active:19460kB inactive:19320kB present:49144kB pages_scanned:40497
all_unreclaimabl
e? yes
Nov 20 03:24:46 sleepy protections[]: 0 0 0
Nov 20 03:24:46 sleepy HighMem free:0kB min:128kB low:160kB high:192kB
active:0kB inactive:0kB present:0kB pages_scanned:0 all_unreclaimable? no
Nov 20 03:24:46 sleepy protections[]: 0 0 0
Nov 20 03:24:46 sleepy DMA: 3*4kB 4*8kB 3*16kB 1*32kB 2*64kB 0*128kB
0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 252kB
Nov 20 03:24:46 sleepy Normal: 10*4kB 2*8kB 4*16kB 4*32kB 2*64kB 3*128kB
0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 760kB
Nov 20 03:24:46 sleepy HighMem: empty
Nov 20 03:24:46 sleepy Swap cache: add 4076265, delete 4076262, find
482017/1245636, race 1+9
Nov 20 03:24:46 sleepy Out of Memory: Killed process 6841 (sshd).
Nov 20 03:24:46 sleepy sshd: page allocation failure. order:0, mode:0x1d2
Nov 20 03:24:46 sleepy [<c010395b>] dump_stack+0x16/0x18
Nov 20 03:24:46 sleepy [<c0144859>] __alloc_pages+0x2fe/0x31a
Nov 20 03:24:46 sleepy [<c0146f2f>] do_page_cache_readahead+0x101/0x18d
Nov 20 03:24:46 sleepy [<c01407c4>] filemap_nopage+0x163/0x337
Nov 20 03:24:46 sleepy [<c015215d>] do_no_page+0x10f/0x534
Nov 20 03:24:46 sleepy [<c0152799>] handle_mm_fault+0xe2/0x261
Nov 20 03:24:46 sleepy [<c0114030>] do_page_fault+0x216/0x5d2
Nov 20 03:24:46 sleepy [<c01034ef>] error_code+0x2b/0x30
Nov 20 03:24:46 sleepy sshd: page allocation failure. order:0, mode:0xd2
Nov 20 03:24:46 sleepy [<c010395b>] dump_stack+0x16/0x18
Nov 20 03:24:46 sleepy [<c0144859>] __alloc_pages+0x2fe/0x31a
Nov 20 03:24:46 sleepy [<c015a3d6>] read_swap_cache_async+0x33/0xa5
Nov 20 03:24:46 sleepy [<c0151813>] swapin_readahead+0x3c/0x81
Nov 20 03:24:46 sleepy [<c0151920>] do_swap_page+0xc8/0x4c6
Nov 20 03:24:46 sleepy [<c01527c1>] handle_mm_fault+0x10a/0x261
Nov 20 03:24:46 sleepy [<c0114030>] do_page_fault+0x216/0x5d2
Nov 20 03:24:46 sleepy [<c01034ef>] error_code+0x2b/0x30
Nov 20 03:24:46 sleepy sshd: page allocation failure. order:0, mode:0xd2
Nov 20 03:24:46 sleepy [<c010395b>] dump_stack+0x16/0x18
Nov 20 03:24:46 sleepy [<c0144859>] __alloc_pages+0x2fe/0x31a
On Sun, 2004-11-21 at 14:57 +0100, Thomas Gleixner wrote:
> On Sun, 2004-11-21 at 13:17 +0100, Martin MOKREJŠ wrote:
> > Why can't the algorithm first find the process asking for memory now.
> > When found, the kernel should first kill its children, wait some time,
> > then kill this process if it still exists (it might exit itself when its
> > children get closed).
> > You have said it's safer to kill than to send ENOMEM as happens
> > in 2.4, but I still don't understand why the kernel doesn't first send
> > ENOMEM, and only if that doesn't help start the OOM killer after those
> > 5 seconds, and try to kill the very same application.
> > I don't get the idea why to kill immediately.
>
> I see your concern. There are some more changes necessary to make this
> work reliably. I'm not sure if it can be done without really big
> changes. I will look a bit deeper into this.
One big problem when killing the requesting process or just sending
ENOMEM to the requesting process is, that exactly this process might be
a ssh login, when you try to log into to machine after some application
went crazy and ate up most of the memory. The result is that you
_cannot_ log into the machine, because the login is either killed or
cannot start because it receives ENOMEM.
Putting hard coded decisions like "prefer sshd, xyz,...", "don't kill
a, b, c" into the kernel is out of the question.
The ideas which were proposed to have a possibility to set a "don't kill
me" or "yes, I'm a candidate" flag are likely to be a future way to go.
But at the moment we have no way to make this work in current userlands.
I refined the decision, so it no longer kills the parent if there
were forked child processes available to kill. So it should now keep
your bash alive.
tglx
Hi Andrew,
Chris Ross escreveu:
> Andrew Morton escreveu:
>> Please ignore the previous patch and try the below.
>
> I still get OOM kills with this (well one, anyway). It does seem harder
> to trigger though.
Turns out it's not that hard. Sorry for the slight delay, I've been away
a few days.
root@sleepy chris # grep Killed /var/log/messages
Nov 21 22:24:22 sleepy Out of Memory: Killed process 6800 (qmgr).
Nov 21 22:24:32 sleepy Out of Memory: Killed process 6799 (pickup).
Nov 21 22:24:57 sleepy Out of Memory: Killed process 6472 (distccd).
Nov 21 22:25:00 sleepy Out of Memory: Killed process 6473 (distccd).
Nov 21 22:25:00 sleepy Out of Memory: Killed process 6582 (distccd).
Nov 21 22:25:00 sleepy Out of Memory: Killed process 6686 (distccd).
Nov 21 22:25:00 sleepy Out of Memory: Killed process 6687 (ntpd).
If you want to see the actual oom messages just ask.
This is with 2.6.10-rc2-mm1 + your patch whilst doing an "emerge sync"
which isn't ridiculously memory hungry and shouldn't result in oom kills.
Informally I felt I had better results from Marcelo's patch, though I
should test both under the same conditions before I say that...
Regards,
Chris R.
On Mon, Nov 22, 2004 at 01:15:12PM +0100, Chris Ross wrote:
> Hi Andrew,
>
> Chris Ross escreveu:
> > Andrew Morton escreveu:
> >> Please ignore the previous patch and try the below.
> >
> > I still get OOM kills with this (well one, anyway). It does seem harder
> > to trigger though.
>
> Turns out it's not that hard. Sorry for the slight delay, I've been away
> a few days.
>
> root@sleepy chris # grep Killed /var/log/messages
> Nov 21 22:24:22 sleepy Out of Memory: Killed process 6800 (qmgr).
> Nov 21 22:24:32 sleepy Out of Memory: Killed process 6799 (pickup).
> Nov 21 22:24:57 sleepy Out of Memory: Killed process 6472 (distccd).
> Nov 21 22:25:00 sleepy Out of Memory: Killed process 6473 (distccd).
> Nov 21 22:25:00 sleepy Out of Memory: Killed process 6582 (distccd).
> Nov 21 22:25:00 sleepy Out of Memory: Killed process 6686 (distccd).
> Nov 21 22:25:00 sleepy Out of Memory: Killed process 6687 (ntpd).
>
> If you want to seem the actual oom messages just ask.
>
> This is with 2.6.10-rc2-mm1 + your patch whilst doing an "emerge sync"
> which isn't ridiculously memory hungry and shouldn't result in oom kills.
>
> Informally I felt I had better results from Marcelo's patch, though I
> should test both under the same conditions before I say that...
Chris,
The kill-from-kswapd approach on top of recent -mm which includes
"ignore referenced information on zero priority" should be quite
reliable. Would be nice if you could try that.
The current scheme is broken, yes — the main problem being the all_unreclaimable
logic which conflicts with OOM detection. I (we) were hoping
"ignore-referenced-information-on-zero-priority" would be enough for 99% of
cases, but it doesn't seem so.
Either way — killing from kswapd or from task context — the all_unreclaimable
logic conflicts with OOM detection.
But we are going the right way :)
diff -urN --exclude='*~' --exclude='.#*' 2.6.10-rc2-mm2.orig/mm/oom_kill.c 2.6.10-rc2-mm/mm/oom_kill.c
--- 2.6.10-rc2-mm2.orig/mm/oom_kill.c 2004-11-19 14:52:16.000000000 +0100
+++ 2.6.10-rc2-mm/mm/oom_kill.c 2004-11-22 11:53:16.000000000 +0100
@@ -45,8 +45,10 @@
static unsigned long badness(struct task_struct *p, unsigned long uptime)
{
unsigned long points, cpu_time, run_time, s;
+ struct list_head *tsk;
- if (!p->mm)
+ /* Ignore mm-less tasks and init */
+ if (!p->mm || p->pid == 1)
return 0;
if (p->flags & PF_MEMDIE)
@@ -57,6 +59,19 @@
points = p->mm->total_vm;
/*
+ * Processes which fork a lot of child processes are likely
+ * a good choice. We add the vmsize of the childs if they
+ * have an own mm. This prevents forking servers to flood the
+ * machine with an endless amount of childs
+ */
+ list_for_each(tsk, &p->children) {
+ struct task_struct *chld;
+ chld = list_entry(tsk, struct task_struct, sibling);
+ if (chld->mm != p->mm && chld->mm)
+ points += chld->mm->total_vm;
+ }
+
+ /*
* CPU time is in tens of seconds and run time is in thousands
* of seconds. There is no particular reason for this other than
* that it turned out to work very well in practice.
@@ -176,6 +191,27 @@
return mm;
}
+static struct mm_struct *oom_kill_process(task_t *p)
+{
+ struct mm_struct *mm;
+ struct task_struct *g, *q;
+
+ mm = oom_kill_task(p);
+ if (!mm)
+ return NULL;
+ /*
+ * kill all processes that share the ->mm (i.e. all threads),
+ * but are in a different thread group
+ */
+ do_each_thread(g, q)
+ if (q->mm == mm && q->tgid != p->tgid)
+ __oom_kill_task(q);
+
+ while_each_thread(g, q);
+ if (!p->mm)
+ printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
+ return mm;
+}
/**
* oom_kill - kill the "best" process when we run out of memory
@@ -188,7 +224,9 @@
void oom_kill(void)
{
struct mm_struct *mm;
- struct task_struct *g, *p, *q;
+ struct task_struct *c, *p;
+ struct list_head *tsk;
+ int mmcnt = 0;
read_lock(&tasklist_lock);
retry:
@@ -200,21 +238,32 @@
panic("Out of memory and no killable processes...\n");
}
- mm = oom_kill_task(p);
- if (!mm)
- goto retry;
/*
- * kill all processes that share the ->mm (i.e. all threads),
- * but are in a different thread group
+ * Kill the forked child processes first
*/
- do_each_thread(g, q)
- if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q);
- while_each_thread(g, q);
- if (!p->mm)
- printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
+ list_for_each(tsk, &p->children) {
+ c = list_entry(tsk, struct task_struct, sibling);
+ /* Do not touch threads, as they get killed later */
+ if (c->mm == p->mm)
+ continue;
+ mm = oom_kill_process(c);
+ if (mm) {
+ mmcnt ++;
+ mmput(mm);
+ }
+ }
+
+ /*
+ * If we managed to kill one or more child processes
+ * then let the parent live for now
+ */
+ if (!mmcnt) {
+ mm = oom_kill_process(p);
+ if (!mm)
+ goto retry;
+ mmput(mm);
+ }
read_unlock(&tasklist_lock);
- mmput(mm);
return;
}
@@ -224,14 +273,22 @@
void out_of_memory(int gfp_mask)
{
/*
- * oom_lock protects out_of_memory()'s static variables.
- * It's a global lock; this is not performance-critical.
- */
- static DEFINE_SPINLOCK(oom_lock);
+ * inprogress protects out_of_memory()'s static variables.
+ * We don't use a spin_lock here, as spinlocks are
+ * nops on UP systems.
+ */
+ static unsigned long inprogress;
+ static unsigned int freepages = 1000000;
static unsigned long first, last, count, lastkill;
unsigned long now, since;
- spin_lock(&oom_lock);
+ if (test_and_set_bit(0, &inprogress))
+ return;
+
+ /* Check, if memory was freed since the last oom kill */
+ if (freepages < nr_free_pages())
+ goto out_unlock;
+
now = jiffies;
since = now - last;
last = now;
@@ -271,12 +328,10 @@
* Ok, really out of memory. Kill something.
*/
lastkill = now;
-
printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
show_free_areas();
-
- /* oom_kill() sleeps */
- spin_unlock(&oom_lock);
+ /* Store free pages * 2 for the check above */
+ freepages = (nr_free_pages() << 1);
oom_kill();
/*
* Make kswapd go out of the way, so "p" has a good chance of
@@ -284,17 +339,11 @@
* for more memory.
*/
yield();
- spin_lock(&oom_lock);
reset:
- /*
- * We dropped the lock above, so check to be sure the variable
- * first only ever increases to prevent false OOM's.
- */
- if (time_after(now, first))
- first = now;
+ first = jiffies;
count = 0;
out_unlock:
- spin_unlock(&oom_lock);
+ clear_bit(0, &inprogress);
}
On Tue, 2004-11-23 at 08:41 +0100, Martin MOKREJŠ wrote:
> > One big problem when killing the requesting process or just sending
> > ENOMEM to the requesting process is, that exactly this process might be
> > a ssh login, when you try to log into to machine after some application
> > went crazy and ate up most of the memory. The result is that you
> > _cannot_ log into the machine, because the login is either killed or
> > cannot start because it receives ENOMEM.
>
> I believe the application is _first_ who will get ENOMEM. It must be
> terrible luck that it would ask exactly for the size of remaining free
> memory. Most probably, it will ask for less or more. "Less" in not
> a problem in this case, so consider it asks for more. Then, OOM killer
> might well expect the application asking for memory is most probably
> exactly the application which caused the trouble.
For one application which eats up all memory, the 2.4 ENOMEM behaviour
works.
The scenario which made one of my boxes unusable under 2.4 is a forking
server, which gets out of control. The last fork gets ENOMEM and does
not succeed, but the other forked processes are still there and consuming
memory. The server application does the correct thing. It receives
ENOMEM on fork() and cancels the connection request. On the next request
the game starts again. Somebody notices that the box is not responding
anymore and tries to login via ssh. Guess what happens. ssh login cannot
fork due to ENOMEM. The same will happen on 2.6 if we make it behave
like 2.4.
We have TWO problems in oom handling:
1. When do we trigger the out of memory killer
As far as my test cases go, 2.6.10-rc2-mm3 no longer triggers the
oom without reason.
2. Which process do we select to kill
The decision is screwed since the oom killer was introduced. Also the
reentrancy problem and some of the mechanisms in the out_of_memory
function have to be modified to make it work.
That's what my patch is addressing.
> >
> > Putting hard coded decisions like "prefer sshd, xyz,...", " don't kill
> > a, b, c" are out of discussion.
>
> I'd go for it at least nowadays.
Sure, you can do so on your box, but can you accept that we _CANNOT_
hard code a list of do-not-kill apps, except init, into the kernel? I
don't want to see the mail thread on LKML where the list of precious
applications is discussed.
> >
> > The ideas which were proposed to have a possibility to set a "don't kill
> > me" or "yes, I'm a candidate" flag are likely to be a future way to go.
> > But at the moment we have no way to make this work in current userlands.
>
> Do you think login or sshd will ever use flag "yes, I'm a candidate"?
> I think we get exactly the same behaviour right now with those hard coded decisions
> you mention above. Otherwise the hard coded decision is programmed into
> every sshd, init instance anyway. I think it's not necessary to put
> login and shells on this ban list, the user will re-login again. ;)
Having a generic interface to make this configurable is the only way to
go. So users can decide what is important in their environment. There is
more than a desktop PC environment and a lot of embedded boxes need to
protect special applications.
> >
> > I refined the decision, so it no longer kills the parent if there
> > were forked child processes available to kill. So it should now keep
> > your bash alive.
>
> Yes, it doesn't kill parent bash. I don't understand the _doubled_ output
> in syslog, but maybe you do. Is that related to hyperthreading? ;)
> Tested on 2.6.10-rc2-mm2.
> oom-killer: gfp_mask=0xd2
> Free pages: 3924kB (112kB HighMem)
> oom-killer: gfp_mask=0x1d2
> Free pages: 3924kB (112kB HighMem)
No, it's not related to hyperthreading. It's on the way out.
I put an additional check into the page allocator. Does this help ?
tglx
oom-killer: gfp_mask=0x1d2
DMA per-cpu:
cpu 0 hot: low 2, high 6, batch 1
cpu 0 cold: low 0, high 2, batch 1
Normal per-cpu:
cpu 0 hot: low 32, high 96, batch 16
cpu 0 cold: low 0, high 32, batch 16
HighMem per-cpu:
cpu 0 hot: low 14, high 42, batch 7
cpu 0 cold: low 0, high 14, batch 7
Free pages: 3932kB (112kB HighMem)
Active:128331 inactive:125322 dirty:0 writeback:0 unstable:0 free:983 slab:1941 mapped:253443 pagetables:794
DMA free:68kB min:68kB low:84kB high:100kB active:5444kB inactive:5304kB present:16384kB pages_scanned:11374 all_unreclaimable? yes
protections[]: 0 0 0
Normal free:3752kB min:3756kB low:4692kB high:5632kB active:442984kB inactive:431092kB present:901120kB pages_scanned:936308 all_unreclaimable? yes
protections[]: 0 0 0
HighMem free:112kB min:128kB low:160kB high:192kB active:64896kB inactive:64892kB present:131044kB pages_scanned:134868 all_unreclaimable? yes
protections[]: 0 0 0
DMA: 1*4kB 0*8kB 0*16kB 0*32kB 1*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 68kB
Normal: 0*4kB 1*8kB 0*16kB 1*32kB 0*64kB 1*128kB 0*256kB 1*512kB 1*1024kB 1*2048kB 0*4096kB = 3752kB
HighMem: 0*4kB 0*8kB 1*16kB 1*32kB 1*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 112kB
Swap cache: add 293335, delete 293335, find 44/60, race 0+0
[<c0103dfd>] dump_stack+0x1e/0x22
[<c013ec1c>] out_of_memory+0x97/0xcf
[<c0146dc8>] try_to_free_pages+0x163/0x184
[<c013fde2>] __alloc_pages+0x27e/0x400
[<c0142368>] do_page_cache_readahead+0x15b/0x1b9
[<c013c5aa>] filemap_nopage+0x2d4/0x375
[<c014aa82>] do_no_page+0xc4/0x38c
[<c014af30>] handle_mm_fault+0xde/0x189
[<c01166d5>] do_page_fault+0x456/0x6ad
[<c0103a43>] error_code+0x2b/0x30
Out of Memory: Killed process 6672 (RNAsubopt).
RNAsubopt: page allocation failure. order:0, mode:0x1d2
[<c0103dfd>] dump_stack+0x1e/0x22
[<c013fd90>] __alloc_pages+0x22c/0x400
[<c0142368>] do_page_cache_readahead+0x15b/0x1b9
[<c013c5aa>] filemap_nopage+0x2d4/0x375
[<c014aa82>] do_no_page+0xc4/0x38c
[<c014af30>] handle_mm_fault+0xde/0x189
[<c01166d5>] do_page_fault+0x456/0x6ad
[<c0103a43>] error_code+0x2b/0x30
On Wed, 2004-11-24 at 16:52 +0100, Martin MOKREJŠ wrote:
> > No, it's not related to hyperthreading. It's on the way out.
> >
> > I put an additional check into the page allocator. Does this help ?
>
> The application got killed. But, consider yourself the stacktrace ... ;)
> oom-killer: gfp_mask=0x1d2
> Free pages: 3932kB (112kB HighMem)
> [<c0103dfd>] dump_stack+0x1e/0x22
> [<c013ec1c>] out_of_memory+0x97/0xcf
> [<c0146dc8>] try_to_free_pages+0x163/0x184
> [<c013fde2>] __alloc_pages+0x27e/0x400
> [<c0142368>] do_page_cache_readahead+0x15b/0x1b9
> [<c013c5aa>] filemap_nopage+0x2d4/0x375
> [<c014aa82>] do_no_page+0xc4/0x38c
> [<c014af30>] handle_mm_fault+0xde/0x189
> [<c01166d5>] do_page_fault+0x456/0x6ad
> [<c0103a43>] error_code+0x2b/0x30
> Out of Memory: Killed process 6672 (RNAsubopt).
> RNAsubopt: page allocation failure. order:0, mode:0x1d2
> [<c0103dfd>] dump_stack+0x1e/0x22
> [<c013fd90>] __alloc_pages+0x22c/0x400
> [<c0142368>] do_page_cache_readahead+0x15b/0x1b9
> [<c013c5aa>] filemap_nopage+0x2d4/0x375
> [<c014aa82>] do_no_page+0xc4/0x38c
> [<c014af30>] handle_mm_fault+0xde/0x189
> [<c01166d5>] do_page_fault+0x456/0x6ad
> [<c0103a43>] error_code+0x2b/0x30
Yep, that's expected. The first stacktrace is from my patch. I added
this to see from where the allocation is called. The second trace is
from the page fault handler, as the allocation request now returns
failed to prevent the second call into oom.
tglx