diff -aurpN -X /home/mbligh/.diff.exclude linux-2.6.18/mm/vmscan.c 2.6.18-prev_reset/mm/vmscan.c
--- linux-2.6.18/mm/vmscan.c 2006-09-20 12:24:42.000000000 -0700
+++ 2.6.18-prev_reset/mm/vmscan.c 2006-10-16 17:23:48.000000000 -0700
@@ -962,7 +962,6 @@ static unsigned long shrink_zones(int pr
unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
int priority;
- int ret = 0;
unsigned long total_scanned = 0;
unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -1000,8 +999,15 @@ unsigned long try_to_free_pages(struct z
}
total_scanned += sc.nr_scanned;
if (nr_reclaimed >= sc.swap_cluster_max) {
- ret = 1;
- goto out;
+ for (i = 0; zones[i] != 0; i++) {
+ struct zone *zone = zones[i];
+
+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ continue;
+
+ zone->prev_priority = zone->temp_priority;
+ }
+ return 1;
}
/*
@@ -1021,16 +1027,15 @@ unsigned long try_to_free_pages(struct z
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
blk_congestion_wait(WRITE, HZ/10);
}
-out:
for (i = 0; zones[i] != 0; i++) {
struct zone *zone = zones[i];
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;
- zone->prev_priority = zone->temp_priority;
+ zone->prev_priority = 0;
}
- return ret;
+ return 0;
}
/*
@@ -1186,7 +1191,10 @@ out:
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
- zone->prev_priority = zone->temp_priority;
+ if (priority < 0) /* we failed to reclaim */
+ zone->prev_priority = 0;
+ else
+ zone->prev_priority = zone->temp_priority;
}
if (!all_zones_ok) {
cond_resched();
Martin Bligh wrote:
> The same bug is contained in both try_to_free_pages and balance_pgdat.
> On reclaiming the requisite number of pages we correctly set
> prev_priority back to DEF_PRIORITY.
AFAIKS, we set prev_priority to the priority at which the zone was
deemed to require no more reclaiming, not DEF_PRIORITY.
> However, we ALSO do this even
> if we loop over all priorities and fail to reclaim.
If that happens, shouldn't prev_priority be set to 0?
I don't agree the patch is correct.
>
> Setting prev_priority artificially high causes reclaimers to set
> distress artificially low, and fail to reclaim mapped pages, when
> they are, in fact, under severe memory pressure (their priority
> may be as low as 0). This causes the OOM killer to fire incorrectly.
>
> This patch changes that to set prev_priority to 0 instead, if we
> fail to reclaim.
We saw problems with this before releasing SLES10 too. See
zone_is_near_oom and other changesets from around that era. I would
like to know what workload was prevented from going OOM with these
changes, but zone_is_near_oom didn't help -- it must have been very
marginal (or there may indeed be a bug somewhere).
Nick
--
>
> Signed-off-by: Martin J. Bligh <[email protected]>
>
>
>------------------------------------------------------------------------
>
>diff -aurpN -X /home/mbligh/.diff.exclude linux-2.6.18/mm/vmscan.c 2.6.18-prev_reset/mm/vmscan.c
>--- linux-2.6.18/mm/vmscan.c 2006-09-20 12:24:42.000000000 -0700
>+++ 2.6.18-prev_reset/mm/vmscan.c 2006-10-16 17:23:48.000000000 -0700
>@@ -962,7 +962,6 @@ static unsigned long shrink_zones(int pr
> unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
> {
> int priority;
>- int ret = 0;
> unsigned long total_scanned = 0;
> unsigned long nr_reclaimed = 0;
> struct reclaim_state *reclaim_state = current->reclaim_state;
>@@ -1000,8 +999,15 @@ unsigned long try_to_free_pages(struct z
> }
> total_scanned += sc.nr_scanned;
> if (nr_reclaimed >= sc.swap_cluster_max) {
>- ret = 1;
>- goto out;
>+ for (i = 0; zones[i] != 0; i++) {
>+ struct zone *zone = zones[i];
>+
>+ if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
>+ continue;
>+
>+ zone->prev_priority = zone->temp_priority;
>+ }
>+ return 1;
> }
>
> /*
>@@ -1021,16 +1027,15 @@ unsigned long try_to_free_pages(struct z
> if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
> blk_congestion_wait(WRITE, HZ/10);
> }
>-out:
> for (i = 0; zones[i] != 0; i++) {
> struct zone *zone = zones[i];
>
> if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
> continue;
>
>- zone->prev_priority = zone->temp_priority;
>+ zone->prev_priority = 0;
> }
>- return ret;
>+ return 0;
> }
>
> /*
>@@ -1186,7 +1191,10 @@ out:
> for (i = 0; i < pgdat->nr_zones; i++) {
> struct zone *zone = pgdat->node_zones + i;
>
>- zone->prev_priority = zone->temp_priority;
>+ if (priority < 0) /* we failed to reclaim */
>+ zone->prev_priority = 0;
>+ else
>+ zone->prev_priority = zone->temp_priority;
> }
> if (!all_zones_ok) {
> cond_resched();
>
Send instant messages to your online friends http://au.messenger.yahoo.com
Nick Piggin wrote:
> Martin Bligh wrote:
>
>> The same bug is contained in both try_to_free_pages and balance_pgdat.
>> On reclaiming the requisite number of pages we correctly set
>> prev_priority back to DEF_PRIORITY.
>
>
> AFAIKS, we set prev_priority to the priority at which the zone was
> deemed to require no more reclaiming, not DEF_PRIORITY.
Well, it's zone->temp_priority, which was set to DEF_PRIORITY at the
top of the function, though I suppose something else might have
changed it since.
>> However, we ALSO do this even
>> if we loop over all priorities and fail to reclaim.
>
>
> If that happens, shouldn't prev_priority be set to 0?
Yes, but it's not. We fall off the bottom of the loop, and set it
back to temp_priority. At best, the code is unclear.
I suppose shrink_zones() might in theory knock temp_priority down
as it goes, so it might come out right. But given that it's a global
(per zone), not per-reclaimer, I fail to see how that's really safe.
Supposing someone else has just started reclaim, and is still at
prio 12?
Moreover, whilst try_to_free_pages calls shrink_zones, balance_pgdat
does not. Nothing else I can see sets temp_priority.
> I don't agree the patch is correct.
You think it's doing something wrong? Or just unnecessary?
I'm inclined to think the whole concept of temp_priority and
prev_priority are pretty broken. This may not fix the whole thing,
but it seems to me to make it better than it was before.
> We saw problems with this before releasing SLES10 too. See
> zone_is_near_oom and other changesets from around that era. I would
> like to know what workload was prevented from going OOM with these
> changes, but zone_is_near_oom didn't help -- it must have been very
> marginal (or there may indeed be a bug somewhere).
Google production workload. Multiple reclaimers operating - one is
down to priority 0 on the reclaim, but distress is still set to 0,
thanks to prev_priority being borked. Hence we don't reclaim mapped
pages, the reclaim fails, OOM killer kicks in.
Forward ported from an earlier version of 2.6 ... but I don't see
why we need extra heuristics here, it seems like a clear and fairly
simple bug. We're in deep crap with reclaim, and we go set the
global indicator back to "oh no, everything's fine". Not a good plan.
M.
Martin J. Bligh wrote:
> Nick Piggin wrote:
>
>> Martin Bligh wrote:
>>
>>> The same bug is contained in both try_to_free_pages and balance_pgdat.
>>> On reclaiming the requisite number of pages we correctly set
>>> prev_priority back to DEF_PRIORITY.
>>
>>
>>
>> AFAIKS, we set prev_priority to the priority at which the zone was
>> deemed to require no more reclaiming, not DEF_PRIORITY.
>
>
> Well, it's zone->temp_priority, which was set to DEF_PRIORITY at the
> top of the function, though I suppose something else might have
> changed it since.
Yes.
>
>>> However, we ALSO do this even
>>> if we loop over all priorities and fail to reclaim.
>>
>>
>>
>> If that happens, shouldn't prev_priority be set to 0?
>
>
> Yes, but it's not. We fall off the bottom of the loop, and set it
> back to temp_priority. At best, the code is unclear.
But temp_priority should be set to 0 at that point.
> I suppose shrink_zones() might in theory knock temp_priority down
> as it goes, so it might come out right. But given that it's a global
> (per zone), not per-reclaimer, I fail to see how that's really safe.
> Supposing someone else has just started reclaim, and is still at
> prio 12?
But your loops are not exactly per reclaimer either. Granted there
is a large race window in the current code, but this patch isn't the
way to fix that particular problem.
> Moreover, whilst try_to_free_pages calls shrink_zones, balance_pgdat
> does not. Nothing else I can see sets temp_priority.
balance_pgdat.
>
> > I don't agree the patch is correct.
>
> You think it's doing something wrong? Or just unnecessary?
Unnecessary, and it indicates something else is broken if you are seeing
problems here.
> I'm inclined to think the whole concept of temp_priority and
> prev_priority are pretty broken. This may not fix the whole thing,
> but it seems to me to make it better than it was before.
I think it is broken too. I liked my split active lists, but at that point
vmscan.c was in don't-touch mode.
>> We saw problems with this before releasing SLES10 too. See
>> zone_is_near_oom and other changesets from around that era. I would
>> like to know what workload was prevented from going OOM with these
>> changes, but zone_is_near_oom didn't help -- it must have been very
>> marginal (or there may indeed be a bug somewhere).
>
>
> Google production workload. Multiple reclaimers operating - one is
> down to priority 0 on the reclaim, but distress is still set to 0,
> thanks to prev_priority being borked. Hence we don't reclaim mapped
> pages, the reclaim fails, OOM killer kicks in.
OK, so it sounds like temp_priority is being overwritten by the
race. I'd consider throwing out temp_priority completely, and just
going with adjusting prev_priority as we go.
> Forward ported from an earlier version of 2.6 ... but I don't see
> why we need extra heuristics here, it seems like a clear and fairly
> simple bug. We're in deep crap with reclaim, and we go set the
> global indicator back to "oh no, everything's fine". Not a good plan.
All that reclaim_mapped code is pretty arbitrary anyway. What is needed
is the zone_is_near_oom so we can decouple all those heuristics from
the OOM decision. So do you still see the problem on upstream kernel
without your patches applied?
--
Send instant messages to your online friends http://au.messenger.yahoo.com
>> Well, it's zone->temp_priority, which was set to DEF_PRIORITY at the
>> top of the function, though I suppose something else might have
>> changed it since.
>
> Yes.
...
>>> If that happens, shouldn't prev_priority be set to 0?
>>
>> Yes, but it's not. We fall off the bottom of the loop, and set it
>> back to temp_priority. At best, the code is unclear.
>
> But temp_priority should be set to 0 at that point.
If that were true, it'd be great. But how?
This is everything that touches it:
0 mmzone.h <global> 208 int temp_priority;
1 page_alloc.c free_area_init_core 2019 zone->temp_priority =
zone->prev_priority = DEF_PRIORITY;
2 vmscan.c shrink_zones 937 zone->temp_priority = priority;
3 vmscan.c try_to_free_pages 987 zone->temp_priority = DEF_PRIORITY;
4 vmscan.c try_to_free_pages 1031 zone->prev_priority =
zone->temp_priority;
5 vmscan.c balance_pgdat 1081 zone->temp_priority = DEF_PRIORITY;
6 vmscan.c balance_pgdat 1143 zone->temp_priority = priority;
7 vmscan.c balance_pgdat 1189 zone->prev_priority =
zone->temp_priority;
8 vmstat.c zoneinfo_show 593 zone->temp_priority,
Only thing that looks interesting here is shrink_zones.
>> I suppose shrink_zones() might in theory knock temp_priority down
>> as it goes, so it might come out right. But given that it's a global
>> (per zone), not per-reclaimer, I fail to see how that's really safe.
>> Supposing someone else has just started reclaim, and is still at
>> prio 12?
>
> But your loops are not exactly per reclaimer either. Granted there
> is a large race window in the current code, but this patch isn't the
> way to fix that particular problem.
Why not? Perhaps it's not a panacea, but it's a definite improvement.
>> Moreover, whilst try_to_free_pages calls shrink_zones, balance_pgdat
>> does not. Nothing else I can see sets temp_priority.
>
> balance_pgdat.
That's only called from kswapd. If we're in balance_pgdat, we ARE
kswapd. We can't fix ourself. So effectively we're doing:
while (priority--) {
if (we reclaimed OK)
goto out;
}
out:
prev_priority = DEF_PRIORITY;
We've just walked the whole bloody list with priority set to 0.
We failed to reclaim a few pages.
We know the world is in deep pain.
Why the hell would we elevate prev_priority?
> Unnecessary, and it indicates something else is broken if you are seeing
> problems here.
You think we should set prev_priority up, when we've just walked the
whole list at prio 0 and can't reclaim anything? Unless so, I fail
to see how the patch is unnecessary.
And yes, I'm sure other things are broken, but again, this fixes a
clear bug.
>> I'm inclined to think the whole concept of temp_priority and
>> prev_priority are pretty broken. This may not fix the whole thing,
>> but it seems to me to make it better than it was before.
>
> I think it is broken too. I liked my split active lists, but at that point
> vmscan.c was in don't-touch mode.
I'm glad we agree it's broken. Whilst we await the 100th rewrite of the
VM, perhaps we can apply this simple fix?
> OK, so it sounds like temp_priority is being overwritten by the
> race. I'd consider throwing out temp_priority completely, and just
> going with adjusting prev_priority as we go.
I'm fine with that. Whole thing is racy as hell and pretty pointless
anyway. I'll make another patch up today.
>> Forward ported from an earlier version of 2.6 ... but I don't see
>> why we need extra heuristics here, it seems like a clear and fairly
>> simple bug. We're in deep crap with reclaim, and we go set the
>> global indicator back to "oh no, everything's fine". Not a good plan.
>
> All that reclaim_mapped code is pretty arbitrary anyway. What is needed
> is the zone_is_near_oom so we can decouple all those heuristics from
> the OOM decision.
It seems what is needed is that we start to actually reclaim pages when
priority gets low. This is a very simple way of improving that.
> So do you still see the problem on upstream kernel
> without your patches applied?
I can't slap an upstream bleeding edge kernel across a few thousand
production machines, and wait to see if the world blows up, sorry.
If I can make a reproduce test case, I'll send it out, but thus far
we've been unsuccessful.
But I can see it happening in earlier versions, and I can read the
code in 2.6.18, and see obvious bugs.
M.
Martin J. Bligh wrote:
>> But temp_priority should be set to 0 at that point.
>
>
> If that were true, it'd be great. But how?
> This is everything that touches it:
>
> 0 mmzone.h <global> 208 int temp_priority;
> 1 page_alloc.c free_area_init_core 2019 zone->temp_priority =
> zone->prev_priority = DEF_PRIORITY;
> 2 vmscan.c shrink_zones 937 zone->temp_priority = priority;
> 3 vmscan.c try_to_free_pages 987 zone->temp_priority = DEF_PRIORITY;
> 4 vmscan.c try_to_free_pages 1031 zone->prev_priority =
> zone->temp_priority;
> 5 vmscan.c balance_pgdat 1081 zone->temp_priority = DEF_PRIORITY;
> 6 vmscan.c balance_pgdat 1143 zone->temp_priority = priority;
> 7 vmscan.c balance_pgdat 1189 zone->prev_priority =
> zone->temp_priority;
> 8 vmstat.c zoneinfo_show 593 zone->temp_priority,
>
> Only thing that looks interesting here is shrink_zones.
For try_to_free_pages, shrink_zones will continue to be called until
priority reaches 0. So temp_priority and prev_priority are now 0. When
it breaks out of the loop, prev_priority gets assigned temp_priority.
Both of which are zero *unless you've hit the temp_priority race*. As
I said, getting rid of temp_priority and somehow tracking it locally
will close this race. I agree this race is a bug and would be happy to
see it fixed. This might be what your patch inadvertently fixes.
>> But your loops are not exactly per reclaimer either. Granted there
>> is a large race window in the current code, but this patch isn't the
>> way to fix that particular problem.
>
>
> Why not? Perhaps it's not a panacea, but it's a definite improvement.
OK it is an improvement for the cases when we hit priority = 0. It would
be nice to fix the race for medium priorities as well though. Hmm, OK,
if we can't do that easily then I would be OK with this approach for the
time being.
Please don't duplicate that whole loop again in try_to_free_pages, though.
>
>>> Moreover, whilst try_to_free_pages calls shrink_zones, balance_pgdat
>>> does not. Nothing else I can see sets temp_priority.
>>
>>
>> balance_pgdat.
>
>
> That's only called from kswapd. If we're in balance_pgdat, we ARE
> kswapd. We can't fix ourself. So effectively we're doing:
>
> while (priority--) {
> if (we reclaimed OK)
> goto out;
> }
> out:
> prev_priority = DEF_PRIORITY;
>
> We've just walked the whole bloody list with priority set to 0.
>
> We failed to reclaim a few pages.
>
> We know the world is in deep pain.
>
> Why the hell would we elevate prev_priority?
No. If we've walked the whole bloody list and failed to reclaim any
pages, we do not set prev_priority to DEF_PRIORITY. Read the code, it
does the same thing with the priorities as shrink_zones.
>> Unnecessary, and it indicates something else is broken if you are seeing
>> problems here.
>
>
> You think we should set prev_priority up, when we've just walked the
> whole list at prio 0 and can't reclaim anything? Unless so, I fail
> to see how the patch is unnecessary.
>
> And yes, I'm sure other things are broken, but again, this fixes a
> clear bug.
AFAIKS there is no bug that you have identified here or in your changelog.
There is a race, and there are many tolerable races in reclaim. I can
accept that this race is intolerable for you, so I am OK with fixing it.
> > So do you still see the problem on upstream kernel
>
>> without your patches applied?
>
>
> I can't slap an upstream bleeding edge kernel across a few thousand
> production machines, and wait to see if the world blows up, sorry.
> If I can make a reproduce test case, I'll send it out, but thus far
> we've been unsuccessful.
No problem, I didn't ask you to do that. But if you want this patch
in the upstream kernel, then I will keep asking whether it fixes a
problem in the upstream kernel.
>
> But I can see it happening in earlier versions, and I can read the
> code in 2.6.18, and see obvious bugs.
I can't see any besides the temp_priority race.
--
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com