2008-11-13 22:14:01

by Rik van Riel

Subject: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

Sometimes the VM spends the first few priority rounds rotating back
referenced pages and submitting IO. Once we get to a lower priority,
sometimes the VM ends up freeing way too many pages.

The fix is relatively simple: in shrink_zone() we can check how many
pages we have already freed and break out of the loop.

However, in order to do this we do need to know how many pages we already
freed, so move nr_reclaimed into scan_control.


Signed-off-by: Rik van Riel <[email protected]>

---
mm/vmscan.c | 60 ++++++++++++++++++++++++++++++------------------------------
1 file changed, 30 insertions(+), 30 deletions(-)

Index: linux-2.6.28-rc2-mm1/mm/vmscan.c
===================================================================
--- linux-2.6.28-rc2-mm1.orig/mm/vmscan.c 2008-10-30 15:20:06.000000000 -0400
+++ linux-2.6.28-rc2-mm1/mm/vmscan.c 2008-11-13 17:08:35.000000000 -0500
@@ -53,6 +53,9 @@ struct scan_control {
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;

+ /* Number of pages that were freed */
+ unsigned long nr_reclaimed;
+
/* This context's GFP mask */
gfp_t gfp_mask;

@@ -1408,16 +1411,14 @@ static void get_scan_ratio(struct zone *
percent[1] = 100 - percent[0];
}

-
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static unsigned long shrink_zone(int priority, struct zone *zone,
+static void shrink_zone(int priority, struct zone *zone,
struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
- unsigned long nr_reclaimed = 0;
unsigned long percent[2]; /* anon @ 0; file @ 1 */
enum lru_list l;

@@ -1458,10 +1459,18 @@ static unsigned long shrink_zone(int pri
(unsigned long)sc->swap_cluster_max);
nr[l] -= nr_to_scan;

- nr_reclaimed += shrink_list(l, nr_to_scan,
+ sc->nr_reclaimed += shrink_list(l, nr_to_scan,
zone, sc, priority);
}
}
+ /*
+ * On large memory systems, scan >> priority can become
+ * really large. This is OK if we need to scan through
+ * that many pages (referenced, dirty, etc), but make
+ * sure to stop if we already freed enough.
+ */
+ if (sc->nr_reclaimed > sc->swap_cluster_max)
+ break;
}

/*
@@ -1474,7 +1483,6 @@ static unsigned long shrink_zone(int pri
shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);

throttle_vm_writeout(sc->gfp_mask);
- return nr_reclaimed;
}

/*
@@ -1488,16 +1496,13 @@ static unsigned long shrink_zone(int pri
* b) The zones may be over pages_high but they must go *over* pages_high to
* satisfy the `incremental min' zone defense algorithm.
*
- * Returns the number of reclaimed pages.
- *
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
- unsigned long nr_reclaimed = 0;
struct zoneref *z;
struct zone *zone;

@@ -1528,10 +1533,8 @@ static unsigned long shrink_zones(int pr
priority);
}

- nr_reclaimed += shrink_zone(priority, zone, sc);
+ shrink_zone(priority, zone, sc);
}
-
- return nr_reclaimed;
}

/*
@@ -1556,7 +1559,6 @@ static unsigned long do_try_to_free_page
int priority;
unsigned long ret = 0;
unsigned long total_scanned = 0;
- unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long lru_pages = 0;
struct zoneref *z;
@@ -1584,7 +1586,7 @@ static unsigned long do_try_to_free_page
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -1592,13 +1594,13 @@ static unsigned long do_try_to_free_page
if (scan_global_lru(sc)) {
shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
if (reclaim_state) {
- nr_reclaimed += reclaim_state->reclaimed_slab;
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
}
total_scanned += sc->nr_scanned;
- if (nr_reclaimed >= sc->swap_cluster_max) {
- ret = nr_reclaimed;
+ if (sc->nr_reclaimed >= sc->swap_cluster_max) {
+ ret = sc->nr_reclaimed;
goto out;
}

@@ -1621,7 +1623,7 @@ static unsigned long do_try_to_free_page
}
/* top priority shrink_zones still had more to do? don't OOM, then */
if (!sc->all_unreclaimable && scan_global_lru(sc))
- ret = nr_reclaimed;
+ ret = sc->nr_reclaimed;
out:
/*
* Now that we've scanned all the zones at this priority level, note
@@ -1716,7 +1718,6 @@ static unsigned long balance_pgdat(pg_da
int priority;
int i;
unsigned long total_scanned;
- unsigned long nr_reclaimed;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -1735,7 +1736,7 @@ static unsigned long balance_pgdat(pg_da

loop_again:
total_scanned = 0;
- nr_reclaimed = 0;
+ sc.nr_reclaimed = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);

@@ -1821,11 +1822,11 @@ loop_again:
*/
if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
end_zone, 0))
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
- nr_reclaimed += reclaim_state->reclaimed_slab;
+ sc.nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone_is_all_unreclaimable(zone))
continue;
@@ -1839,7 +1840,7 @@ loop_again:
* even in laptop mode
*/
if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > nr_reclaimed + nr_reclaimed / 2)
+ total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
sc.may_writepage = 1;
}
if (all_zones_ok)
@@ -1857,7 +1858,7 @@ loop_again:
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
- if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+ if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
}
out:
@@ -1879,7 +1880,7 @@ out:
goto loop_again;
}

- return nr_reclaimed;
+ return sc.nr_reclaimed;
}

/*
@@ -2231,7 +2232,6 @@ static int __zone_reclaim(struct zone *z
struct task_struct *p = current;
struct reclaim_state reclaim_state;
int priority;
- unsigned long nr_reclaimed = 0;
struct scan_control sc = {
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2264,9 +2264,9 @@ static int __zone_reclaim(struct zone *z
priority = ZONE_RECLAIM_PRIORITY;
do {
note_zone_scanning_priority(zone, priority);
- nr_reclaimed += shrink_zone(priority, zone, &sc);
+ shrink_zone(priority, zone, &sc);
priority--;
- } while (priority >= 0 && nr_reclaimed < nr_pages);
+ } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
}

slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2290,13 +2290,13 @@ static int __zone_reclaim(struct zone *z
* Update nr_reclaimed by the number of slab pages we
* reclaimed from this zone.
*/
- nr_reclaimed += slab_reclaimable -
+ sc.nr_reclaimed += slab_reclaimable -
zone_page_state(zone, NR_SLAB_RECLAIMABLE);
}

p->reclaim_state = NULL;
current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
- return nr_reclaimed >= nr_pages;
+ return sc.nr_reclaimed >= nr_pages;
}

int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)


2008-11-14 00:51:21

by KOSAKI Motohiro

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

Hi

> Sometimes the VM spends the first few priority rounds rotating back
> referenced pages and submitting IO. Once we get to a lower priority,
> sometimes the VM ends up freeing way too many pages.
>
> The fix is relatively simple: in shrink_zone() we can check how many
> pages we have already freed and break out of the loop.
>
> However, in order to do this we do need to know how many pages we already
> freed, so move nr_reclaimed into scan_control.
>
>
> Signed-off-by: Rik van Riel <[email protected]>

Wow!
Honestly, I prepared a similar patch recently.




> ---
> mm/vmscan.c | 60 ++++++++++++++++++++++++++++++------------------------------
> 1 file changed, 30 insertions(+), 30 deletions(-)
>
> Index: linux-2.6.28-rc2-mm1/mm/vmscan.c
> ===================================================================
> --- linux-2.6.28-rc2-mm1.orig/mm/vmscan.c 2008-10-30 15:20:06.000000000 -0400
> +++ linux-2.6.28-rc2-mm1/mm/vmscan.c 2008-11-13 17:08:35.000000000 -0500
> @@ -53,6 +53,9 @@ struct scan_control {
> /* Incremented by the number of inactive pages that were scanned */
> unsigned long nr_scanned;
>
> + /* Number of pages that were freed */
> + unsigned long nr_reclaimed;
> +
> /* This context's GFP mask */
> gfp_t gfp_mask;
>
> @@ -1408,16 +1411,14 @@ static void get_scan_ratio(struct zone *
> percent[1] = 100 - percent[0];
> }
>
> -
> /*
> * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
> */
> -static unsigned long shrink_zone(int priority, struct zone *zone,
> +static void shrink_zone(int priority, struct zone *zone,
> struct scan_control *sc)
> {
> unsigned long nr[NR_LRU_LISTS];
> unsigned long nr_to_scan;
> - unsigned long nr_reclaimed = 0;
> unsigned long percent[2]; /* anon @ 0; file @ 1 */
> enum lru_list l;
>
> @@ -1458,10 +1459,18 @@ static unsigned long shrink_zone(int pri
> (unsigned long)sc->swap_cluster_max);
> nr[l] -= nr_to_scan;
>
> - nr_reclaimed += shrink_list(l, nr_to_scan,
> + sc->nr_reclaimed += shrink_list(l, nr_to_scan,
> zone, sc, priority);
> }
> }
> + /*
> + * On large memory systems, scan >> priority can become
> + * really large. This is OK if we need to scan through
> + * that many pages (referenced, dirty, etc), but make
> + * sure to stop if we already freed enough.
> + */
> + if (sc->nr_reclaimed > sc->swap_cluster_max)
> + break;
> }

There is one risk.
__alloc_pages_internal() has the following code:

	pages_reclaimed += did_some_progress;
	do_retry = 0;
	if (!(gfp_mask & __GFP_NORETRY)) {
		if (order <= PAGE_ALLOC_COSTLY_ORDER) {
			do_retry = 1;
		} else {
			if (gfp_mask & __GFP_REPEAT &&
					pages_reclaimed < (1 << order))
				do_retry = 1;
		}
		if (gfp_mask & __GFP_NOFAIL)
			do_retry = 1;
	}


So, bailing out of reclaim early can increase the possibility of the page
allocator retrying endlessly on high-order allocations.

Yes, it is a theoretical issue, but we should test it to avoid a regression.
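
For a rough sense of scale (assuming the usual swap_cluster_max of 32): an
order-9 __GFP_REPEAT allocation keeps retrying until pages_reclaimed reaches
1 << 9 = 512, so with the bail-out that can take around sixteen direct
reclaim passes instead of possibly one or two.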


Other than that, looks good to me.

Reviewed-by: KOSAKI Motohiro <[email protected]>


2008-11-14 03:28:27

by Andrew Morton

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

On Thu, 13 Nov 2008 17:12:08 -0500 Rik van Riel <[email protected]> wrote:

> Sometimes the VM spends the first few priority rounds rotating back
> referenced pages and submitting IO. Once we get to a lower priority,
> sometimes the VM ends up freeing way too many pages.
>
> The fix is relatively simple: in shrink_zone() we can check how many
> pages we have already freed and break out of the loop.
>
> However, in order to do this we do need to know how many pages we already
> freed, so move nr_reclaimed into scan_control.

There was a reason for not doing this, but I forget what it was. It might require
some changelog archeology. iirc it was to do with balancing scanning rates
between the various things which we scan.

2008-11-14 14:37:23

by Rik van Riel

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

Andrew Morton wrote:
> On Thu, 13 Nov 2008 17:12:08 -0500 Rik van Riel <[email protected]> wrote:
>
>> Sometimes the VM spends the first few priority rounds rotating back
>> referenced pages and submitting IO. Once we get to a lower priority,
>> sometimes the VM ends up freeing way too many pages.
>>
>> The fix is relatively simple: in shrink_zone() we can check how many
>> pages we have already freed and break out of the loop.
>>
>> However, in order to do this we do need to know how many pages we already
>> freed, so move nr_reclaimed into scan_control.
>
> There was a reason for not doing this, but I forget what it was. It might require
> some changelog archeology. iirc it was to do with balancing scanning rates
> between the various things which we scan.

I've seen worse symptoms without this code, though. Pretty
much all 2.6 kernels show bad behaviour occasionally.

Sometimes the VM gets in such a state where multiple processes
cannot find anything readily evictable, and they all end up
at a lower priority level.

This can cause them to evict more than half of everything from
memory, before breaking out of the pageout loop and swapping
things back in. On my 2GB desktop, I've seen as much as 1200MB
memory free due to such a swapout storm. It is possible more is
free at the top of the cycle, but X and gnome-terminal and top
and everything else is stuck, so that's not actually visible :)

I am not convinced that a scanning imbalance is more serious.

Of course, one thing we could do is exempt kswapd from this check.
During light reclaim, kswapd does most of the eviction so scanning
should remain balanced. Having one process fall down to a lower
priority level is also not a big problem.

As long as the direct reclaim processes do not also fall into the
same trap, the situation should be manageable.

Does that sound reasonable to you?

--
All rights reversed.

2008-11-14 17:18:55

by Andrew Morton

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

On Fri, 14 Nov 2008 09:36:28 -0500 Rik van Riel <[email protected]> wrote:

> Andrew Morton wrote:
> > On Thu, 13 Nov 2008 17:12:08 -0500 Rik van Riel <[email protected]> wrote:
> >
> >> Sometimes the VM spends the first few priority rounds rotating back
> >> referenced pages and submitting IO. Once we get to a lower priority,
> >> sometimes the VM ends up freeing way too many pages.
> >>
> >> The fix is relatively simple: in shrink_zone() we can check how many
> >> pages we have already freed and break out of the loop.
> >>
> >> However, in order to do this we do need to know how many pages we already
> >> freed, so move nr_reclaimed into scan_control.
> >
> > There was a reason for not doing this, but I forget what it was. It might require
> > some changelog archeology. iirc it was to do with balancing scanning rates
> > between the various things which we scan.
>
> I've seen worse symptoms without this code, though. Pretty
> much all 2.6 kernels show bad behaviour occasionally.
>
> Sometimes the VM gets in such a state where multiple processes
> cannot find anything readily evictable, and they all end up
> at a lower priority level.
>
> This can cause them to evict more than half of everything from
> memory, before breaking out of the pageout loop and swapping
> things back in. On my 2GB desktop, I've seen as much as 1200MB
> memory free due to such a swapout storm. It is possible more is
> free at the top of the cycle, but X and gnome-terminal and top
> and everything else is stuck, so that's not actually visible :)
>
> I am not convinced that a scanning imbalance is more serious.

I'm not as sure as you are that it was done this way to avoid scanning
imbalance. I don't remember the reasons :(

It isn't necessarily true that this change and <whatever that was> are
mutually exclusive things.

> Of course, one thing we could do is exempt kswapd from this check.
> During light reclaim, kswapd does most of the eviction so scanning
> should remain balanced. Having one process fall down to a lower
> priority level is also not a big problem.
>
> As long as the direct reclaim processes do not also fall into the
> same trap, the situation should be manageable.
>
> Does that sound reasonable to you?

I'll need to find some time to go dig through the changelogs.

2008-11-16 07:54:38

by Andrew Morton

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

On Sun, 16 Nov 2008 16:43:10 +0900 (JST) KOSAKI Motohiro <[email protected]> wrote:

> > > Of course, one thing we could do is exempt kswapd from this check.
> > > During light reclaim, kswapd does most of the eviction so scanning
> > > should remain balanced. Having one process fall down to a lower
> > > priority level is also not a big problem.
> > >
> > > As long as the direct reclaim processes do not also fall into the
> > > same trap, the situation should be manageable.
> > >
> > > Does that sound reasonable to you?
> >
> > I'll need to find some time to go dig through the changelogs.
>
> As far as I checked, the git database doesn't have those changelogs.
> FWIW, I guess it is older than that.
>

git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/old-2.6-bkcvs.git
goes back to 2.5.20 (iirc).

2008-11-16 07:56:49

by KOSAKI Motohiro

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

> > > I'll need to find some time to go dig through the changelogs.
> >
> > As far as I checked, the git database doesn't have those changelogs.
> > FWIW, I guess it is older than that.
>
> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/old-2.6-bkcvs.git
> goes back to 2.5.20 (iirc).

Wow!
I'll try it.

Thanks!!

2008-11-16 08:03:19

by Andrew Morton

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

On Sun, 16 Nov 2008 16:56:15 +0900 (JST) KOSAKI Motohiro <[email protected]> wrote:

> > > > I'll need to find some time to go dig through the changelogs.
> > >
> > > As far as I checked, the git database doesn't have those changelogs.
> > > FWIW, I guess it is older than that.
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/old-2.6-bkcvs.git
> > goes back to 2.5.20 (iirc).

err, make that 2.5.0.

> Wow!
> I'll try it.
>
> Thanks!!

2008-11-16 08:11:59

by KOSAKI Motohiro

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

> > Of course, one thing we could do is exempt kswapd from this check.
> > During light reclaim, kswapd does most of the eviction so scanning
> > should remain balanced. Having one process fall down to a lower
> > priority level is also not a big problem.
> >
> > As long as the direct reclaim processes do not also fall into the
> > same trap, the situation should be manageable.
> >
> > Does that sound reasonable to you?
>
> I'll need to find some time to go dig through the changelogs.

As far as I checked, the git database doesn't have those changelogs.
FWIW, I guess it is older than that.



2008-11-16 08:13:27

by KOSAKI Motohiro

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

One more point.

> Sometimes the VM spends the first few priority rounds rotating back
> referenced pages and submitting IO. Once we get to a lower priority,
> sometimes the VM ends up freeing way too many pages.
>
> The fix is relatively simple: in shrink_zone() we can check how many
> pages we have already freed and break out of the loop.
>
> However, in order to do this we do need to know how many pages we already
> freed, so move nr_reclaimed into scan_control.

IIRC, in the earlier reclaim throttling discussion Balbir-san explained that
the implementation of the memcgroup force cache dropping feature needs
reclaim not to bail out.

I am not sure whether this is still the case (IIRC, the memcgroup
implementation has changed a lot).

Balbir-san, could you comment on this patch?


2008-11-17 00:39:25

by Kamezawa Hiroyuki

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

On Sun, 16 Nov 2008 16:38:56 +0900 (JST)
KOSAKI Motohiro <[email protected]> wrote:

> One more point.
>
> > Sometimes the VM spends the first few priority rounds rotating back
> > referenced pages and submitting IO. Once we get to a lower priority,
> > sometimes the VM ends up freeing way too many pages.
> >
> > The fix is relatively simple: in shrink_zone() we can check how many
> > pages we have already freed and break out of the loop.
> >
> > However, in order to do this we do need to know how many pages we already
> > freed, so move nr_reclaimed into scan_control.
>
> IIRC, in the earlier reclaim throttling discussion Balbir-san explained that
> the implementation of the memcgroup force cache dropping feature needs
> reclaim not to bail out.
>
> I am not sure whether this is still the case (IIRC, the memcgroup
> implementation has changed a lot).
>
> Balbir-san, could you comment on this patch?
>
>
I'm not Balbir-san but there is no "force-cache-dropping" feature now.
(I have no plan to do that.)

But the mem+swap controller will need to modify the reclaim path to do "cache
drop first", because the amount of "mem+swap" will not change when "mem+swap"
hits its limit. It currently sets "sc.may_swap" to 0.

Hmm, I hope memcg becomes a silver bullet for this kind of special(?) workload
in the long term.


Thanks,
-Kame


2008-11-17 03:47:40

by Balbir Singh

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

KAMEZAWA Hiroyuki wrote:
> On Sun, 16 Nov 2008 16:38:56 +0900 (JST)
> KOSAKI Motohiro <[email protected]> wrote:
>
>> One more point.
>>
>>> Sometimes the VM spends the first few priority rounds rotating back
>>> referenced pages and submitting IO. Once we get to a lower priority,
>>> sometimes the VM ends up freeing way too many pages.
>>>
>>> The fix is relatively simple: in shrink_zone() we can check how many
>>> pages we have already freed and break out of the loop.
>>>
>>> However, in order to do this we do need to know how many pages we already
>>> freed, so move nr_reclaimed into scan_control.
>> IIRC, in the earlier reclaim throttling discussion Balbir-san explained that
>> the implementation of the memcgroup force cache dropping feature needs
>> reclaim not to bail out.
>>

Yes, we used that for force_empty() in the past, but see below.

>> I am not sure whether this is still the case (IIRC, the memcgroup
>> implementation has changed a lot).
>>
>> Balbir-san, could you comment on this patch?
>>
>>
> I'm not Balbir-san but there is no "force-cache-dropping" feature now.
> (I have no plan to do that.)
>
> But the mem+swap controller will need to modify the reclaim path to do "cache
> drop first", because the amount of "mem+swap" will not change when "mem+swap"
> hits its limit. It currently sets "sc.may_swap" to 0.
>

Yes, there have been several changes to force_empty() and its meaning, including
movement of accounts. Since you've made most of the recent changes, your
comments are very relevant.

> Hmm, I hope memcg becomes a silver bullet for this kind of special(?) workload
> in the long term.

:-) From my perspective, hierarchy, soft limits (sharing memory when there is no
contention), some form of over commit support and getting swappiness to work
correctly are very important for memcg.

--
Balbir

2008-11-19 16:55:01

by Mel Gorman

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

On Thu, Nov 13, 2008 at 05:12:08PM -0500, Rik van Riel wrote:
> Sometimes the VM spends the first few priority rounds rotating back
> referenced pages and submitting IO. Once we get to a lower priority,
> sometimes the VM ends up freeing way too many pages.
>
> The fix is relatively simple: in shrink_zone() we can check how many
> pages we have already freed and break out of the loop.
>
> However, in order to do this we do need to know how many pages we already
> freed, so move nr_reclaimed into scan_control.
>
>
> Signed-off-by: Rik van Riel <[email protected]>
>
> ---
> mm/vmscan.c | 60 ++++++++++++++++++++++++++++++------------------------------
> 1 file changed, 30 insertions(+), 30 deletions(-)
>
> Index: linux-2.6.28-rc2-mm1/mm/vmscan.c
> ===================================================================
> --- linux-2.6.28-rc2-mm1.orig/mm/vmscan.c 2008-10-30 15:20:06.000000000 -0400
> +++ linux-2.6.28-rc2-mm1/mm/vmscan.c 2008-11-13 17:08:35.000000000 -0500
> @@ -53,6 +53,9 @@ struct scan_control {
> /* Incremented by the number of inactive pages that were scanned */
> unsigned long nr_scanned;
>
> + /* Number of pages that were freed */
> + unsigned long nr_reclaimed;
> +

Is this comment not strictly accurate, since nr_reclaimed is used as a
running count?

/* Number of pages freed so far during a call to shrink_zones() */

> /* This context's GFP mask */
> gfp_t gfp_mask;
>
> @@ -1408,16 +1411,14 @@ static void get_scan_ratio(struct zone *
> percent[1] = 100 - percent[0];
> }
>
> -

nit, spurious whitespace change there.

> /*
> * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
> */
> -static unsigned long shrink_zone(int priority, struct zone *zone,
> +static void shrink_zone(int priority, struct zone *zone,
> struct scan_control *sc)
> {
> unsigned long nr[NR_LRU_LISTS];
> unsigned long nr_to_scan;
> - unsigned long nr_reclaimed = 0;
> unsigned long percent[2]; /* anon @ 0; file @ 1 */
> enum lru_list l;
>
> @@ -1458,10 +1459,18 @@ static unsigned long shrink_zone(int pri
> (unsigned long)sc->swap_cluster_max);
> nr[l] -= nr_to_scan;
>
> - nr_reclaimed += shrink_list(l, nr_to_scan,
> + sc->nr_reclaimed += shrink_list(l, nr_to_scan,
> zone, sc, priority);
> }
> }
> + /*
> + * On large memory systems, scan >> priority can become
> + * really large. This is OK if we need to scan through
> + * that many pages (referenced, dirty, etc), but make
> + * sure to stop if we already freed enough.
> + */
> + if (sc->nr_reclaimed > sc->swap_cluster_max)
> + break;

This triggered alarm bells for me because I thought it would affect lumpy
reclaim. However, lumpy reclaim happens at a lower level and what I'd
expect to happen is that nr_reclaimed would be at least the number of base pages
making up a high-order block, for example. Thinking about it, this should be
safe but I ran it through the old anti-frag tests for hugepage allocations
(basically allocating hugepages under compile-load).

On some machines the situation improved very slightly but on one, the
success rates under load were severely impaired. On all machines at rest,
a one-shot attempt to resize the hugepage pool is resulting in much lower
success figures. However, multiple attempts eventually succeed and aggressive
resizing of the hugepage pool is resulting in higher success rates on all
but one machine.

Bottom line, hugepage pool resizing is taking more attempts but still
succeeding. While I'm not happy about the one-attempt hugepage pool resizing
being worse, I strongly suspect it's due to the current reclaim algorithm
reclaiming aggressively as a percentage of total memory and this behaviour
seems to make more sense overall. I'll re-examine how well dynamic pool
resizing works and look at making it better if this change goes through.

With that out of the way, I also tried thinking about what this change really
means and have a few questions. This is all hand-wavy on my part and possibly
clueless, so take it with a grain of salt. Basically the change comes down to:

o A process doing direct reclaim is not reclaiming a number of pages
based on memory size and reclaim priority any more. Instead, it'll reclaim
a bit and then check to see what the situation is.

Q1. There already is a check higher up to bail out when more than
swap_cluster_max pages are reclaimed. Should that check now be eliminated
as redundant, since it takes place "too late", when a lot of memory may
already have been unnecessarily reclaimed?

Q2. In low-memory situations, it would appear that one process entering
direct reclaim (e.g. a process doing all the dirtying) might also have ended
up doing all of the cleaning. Is there a danger that a heavy-dirtying process
is now going to offload its cleaning work in small bits and pieces to every
other process entering direct reclaim?

Q3. Related to Q2, would it make sense to exclude kswapd from the check? On
the plus side, it may get to be the sucker process that does the scanning
and reclaim. On the downside, it may reclaim way more memory than is needed
to bring the free pages above the high watermark and becomes a variation of
the problem you are trying to solve here.

Q4. Less reclaim also means less scanning which means the aging information
of the pages on the lists is that bit older too. Do we care?

I was going to ask if it was easier to go OOM now, but even under very high
stress, we should be making forward progress. It's just in smaller steps so
I can't see it causing a problem.

> }
>
> /*
> @@ -1474,7 +1483,6 @@ static unsigned long shrink_zone(int pri
> shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
>
> throttle_vm_writeout(sc->gfp_mask);
> - return nr_reclaimed;
> }
>
> /*
> @@ -1488,16 +1496,13 @@ static unsigned long shrink_zone(int pri
> * b) The zones may be over pages_high but they must go *over* pages_high to
> * satisfy the `incremental min' zone defense algorithm.
> *
> - * Returns the number of reclaimed pages.
> - *
> * If a zone is deemed to be full of pinned pages then just give it a light
> * scan then give up on it.
> */
> -static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
> +static void shrink_zones(int priority, struct zonelist *zonelist,
> struct scan_control *sc)
> {
> enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
> - unsigned long nr_reclaimed = 0;
> struct zoneref *z;
> struct zone *zone;
>
> @@ -1528,10 +1533,8 @@ static unsigned long shrink_zones(int pr
> priority);
> }
>
> - nr_reclaimed += shrink_zone(priority, zone, sc);
> + shrink_zone(priority, zone, sc);
> }
> -
> - return nr_reclaimed;
> }
>
> /*
> @@ -1556,7 +1559,6 @@ static unsigned long do_try_to_free_page
> int priority;
> unsigned long ret = 0;
> unsigned long total_scanned = 0;
> - unsigned long nr_reclaimed = 0;
> struct reclaim_state *reclaim_state = current->reclaim_state;
> unsigned long lru_pages = 0;
> struct zoneref *z;
> @@ -1584,7 +1586,7 @@ static unsigned long do_try_to_free_page
> sc->nr_scanned = 0;
> if (!priority)
> disable_swap_token();
> - nr_reclaimed += shrink_zones(priority, zonelist, sc);
> + shrink_zones(priority, zonelist, sc);
> /*
> * Don't shrink slabs when reclaiming memory from
> * over limit cgroups
> @@ -1592,13 +1594,13 @@ static unsigned long do_try_to_free_page
> if (scan_global_lru(sc)) {
> shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
> if (reclaim_state) {
> - nr_reclaimed += reclaim_state->reclaimed_slab;
> + sc->nr_reclaimed += reclaim_state->reclaimed_slab;
> reclaim_state->reclaimed_slab = 0;
> }
> }
> total_scanned += sc->nr_scanned;
> - if (nr_reclaimed >= sc->swap_cluster_max) {
> - ret = nr_reclaimed;
> + if (sc->nr_reclaimed >= sc->swap_cluster_max) {
> + ret = sc->nr_reclaimed;
> goto out;
> }
>
> @@ -1621,7 +1623,7 @@ static unsigned long do_try_to_free_page
> }
> /* top priority shrink_zones still had more to do? don't OOM, then */
> if (!sc->all_unreclaimable && scan_global_lru(sc))
> - ret = nr_reclaimed;
> + ret = sc->nr_reclaimed;
> out:
> /*
> * Now that we've scanned all the zones at this priority level, note
> @@ -1716,7 +1718,6 @@ static unsigned long balance_pgdat(pg_da
> int priority;
> int i;
> unsigned long total_scanned;
> - unsigned long nr_reclaimed;
> struct reclaim_state *reclaim_state = current->reclaim_state;
> struct scan_control sc = {
> .gfp_mask = GFP_KERNEL,
> @@ -1735,7 +1736,7 @@ static unsigned long balance_pgdat(pg_da
>
> loop_again:
> total_scanned = 0;
> - nr_reclaimed = 0;
> + sc.nr_reclaimed = 0;
> sc.may_writepage = !laptop_mode;
> count_vm_event(PAGEOUTRUN);
>
> @@ -1821,11 +1822,11 @@ loop_again:
> */
> if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
> end_zone, 0))
> - nr_reclaimed += shrink_zone(priority, zone, &sc);
> + shrink_zone(priority, zone, &sc);
> reclaim_state->reclaimed_slab = 0;
> nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
> lru_pages);
> - nr_reclaimed += reclaim_state->reclaimed_slab;
> + sc.nr_reclaimed += reclaim_state->reclaimed_slab;
> total_scanned += sc.nr_scanned;
> if (zone_is_all_unreclaimable(zone))
> continue;
> @@ -1839,7 +1840,7 @@ loop_again:
> * even in laptop mode
> */
> if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
> - total_scanned > nr_reclaimed + nr_reclaimed / 2)
> + total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
> sc.may_writepage = 1;
> }
> if (all_zones_ok)
> @@ -1857,7 +1858,7 @@ loop_again:
> * matches the direct reclaim path behaviour in terms of impact
> * on zone->*_priority.
> */
> - if (nr_reclaimed >= SWAP_CLUSTER_MAX)
> + if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
> break;
> }
> out:
> @@ -1879,7 +1880,7 @@ out:
> goto loop_again;
> }
>
> - return nr_reclaimed;
> + return sc.nr_reclaimed;
> }
>
> /*
> @@ -2231,7 +2232,6 @@ static int __zone_reclaim(struct zone *z
> struct task_struct *p = current;
> struct reclaim_state reclaim_state;
> int priority;
> - unsigned long nr_reclaimed = 0;
> struct scan_control sc = {
> .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
> .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
> @@ -2264,9 +2264,9 @@ static int __zone_reclaim(struct zone *z
> priority = ZONE_RECLAIM_PRIORITY;
> do {
> note_zone_scanning_priority(zone, priority);
> - nr_reclaimed += shrink_zone(priority, zone, &sc);
> + shrink_zone(priority, zone, &sc);
> priority--;
> - } while (priority >= 0 && nr_reclaimed < nr_pages);
> + } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
> }
>
> slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
> @@ -2290,13 +2290,13 @@ static int __zone_reclaim(struct zone *z
> * Update nr_reclaimed by the number of slab pages we
> * reclaimed from this zone.
> */
> - nr_reclaimed += slab_reclaimable -
> + sc.nr_reclaimed += slab_reclaimable -
> zone_page_state(zone, NR_SLAB_RECLAIMABLE);
> }
>
> p->reclaim_state = NULL;
> current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
> - return nr_reclaimed >= nr_pages;
> + return sc.nr_reclaimed >= nr_pages;
> }
>
> int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2008-11-21 11:58:41

by Petr Tesařík

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

On Wednesday 19 November 2008 17:54:44 Mel Gorman wrote:
>[...]
> I was going to ask if it was easier to go OOM now, but even under very high
> stress, we should be making forward progress. It's just in smaller steps so
> I can't see it causing a problem.

Actually, I had to apply a very similar patch the other day to reduce the time
the system was unresponsive because of the OOM-killer, so I tested OOM
situations quite a lot, and it did not cause any problem.

Petr Tesarik

2008-11-22 10:22:19

by KOSAKI Motohiro

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

Hi

I dug through a lot of git logs today.


> > > > Of course, one thing we could do is exempt kswapd from this check.
> > > > During light reclaim, kswapd does most of the eviction so scanning
> > > > should remain balanced. Having one process fall down to a lower
> > > > priority level is also not a big problem.
> > > >
> > > > As long as the direct reclaim processes do not also fall into the
> > > > same trap, the situation should be manageable.
> > > >
> > > > Does that sound reasonable to you?
> > >
> > > I'll need to find some time to go dig through the changelogs.
> >
> > As far as I checked, the git database doesn't have those changelogs.
> > FWIW, I guess it is older than that.
>
> git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/old-2.6-bkcvs.git
> goes back to 2.5.20 (iirc).

Sorry, I was wrong.
The patch reversion below happened in 2006.

And thank you, Andrew.
Your comment is very helpful.

So, the desirable behavior is:

direct reclaim:
	should bail out once enough pages have been reclaimed

kswapd:
	should not bail out.


Actually, another bail-out patch I prepared has an sc->may_cut_off member;
shrink_zone() only takes the shortcut exit when sc->may_cut_off == 1.


Rik, sorry, I NAK your current patch,
because it doesn't fix the old issue Andrew raised.

Very sorry.


------------------------------------------------------------------------
From: Andrew Morton <[email protected]>
Date: Fri, 6 Jan 2006 08:11:14 +0000 (-0800)
Subject: [PATCH] vmscan: balancing fix
X-Git-Tag: v2.6.16-rc1~936^2~246


Revert a patch which went into 2.6.8-rc1. The changelog for that patch was:

The shrink_zone() logic can, under some circumstances, cause far too many
pages to be reclaimed. Say, we're scanning at high priority and suddenly
hit a large number of reclaimable pages on the LRU.

Change things so we bale out when SWAP_CLUSTER_MAX pages have been
reclaimed.

Problem is, this change caused significant imbalance in inter-zone scan
balancing by truncating scans of larger zones.

Suppose, for example, ZONE_HIGHMEM is 10x the size of ZONE_NORMAL. The zone
balancing algorithm would require that if we're scanning 100 pages of
ZONE_HIGHMEM, we should scan 10 pages of ZONE_NORMAL. But this logic will
cause the scanning of ZONE_HIGHMEM to bale out after only 32 pages are
reclaimed. Thus effectively causing smaller zones to be scanned relatively
harder than large ones.

Now I need to remember what the workload was which caused me to write this
patch originally, then fix it up in a different way...
----------------------------------------------------------------------------



2008-11-22 16:58:29

by Rik van Riel

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

KOSAKI Motohiro wrote:

> Rik, sorry, I NAK your current patch,
> because it doesn't fix the old issue Andrew raised.

You are right. We do need to keep pressure between zones
equivalent to the size of the zones (or more precisely, to
the number of pages the zones have on their LRU lists).

However, having dozens of direct reclaim tasks all getting
to the lower priority levels can be disastrous, causing
extraordinarily large amounts of memory to be swapped out
and minutes-long stalls to applications.

I think we can come up with a middle ground here:
- always let kswapd continue its rounds
- have direct reclaim tasks continue when priority == DEF_PRIORITY
- break out of the loop for direct reclaim tasks, when
priority < DEF_PRIORITY and enough pages have been freed

Does that sound like it would mostly preserve memory pressure
between zones, while avoiding the worst of the worst when it
comes to excessive page eviction?
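
Something along these lines at the bottom of the shrink_zone() loop, as a
rough sketch (whether the kswapd check ends up being current->flags &
PF_KSWAPD, a scan_control flag, or something else is an implementation
detail to sort out):

	/*
	 * Direct reclaim bails out once it has freed enough pages,
	 * but only below DEF_PRIORITY, and kswapd never bails, so
	 * pressure between zones stays roughly balanced.
	 */
	if (sc->nr_reclaimed > sc->swap_cluster_max &&
			priority < DEF_PRIORITY &&
			!(current->flags & PF_KSWAPD))
		break;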

--
All rights reversed.

2008-11-24 19:12:35

by KOSAKI Motohiro

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

>> Rik, sorry, I NAK your current patch, because it doesn't fix the old issue Andrew raised.
>
> You are right. We do need to keep pressure between zones
> equivalent to the size of the zones (or more precisely, to
> the number of pages the zones have on their LRU lists).

Oh, sorry.
You are right, but I was talking about the reverse problem.

1. shrink_zones() doesn't have any shortcut exit;
   it always calls shrink_zone() for every zone.
2. balance_pgdat() also doesn't have a shortcut.

A simple shrink_zone() shortcut combined with light memory pressure causes
the following bad scenario:

1. reclaim 32 page from ZONE_HIGHMEM
2. reclaim 32 page from ZONE_NORMAL
3. reclaim 32 page from ZONE_DMA
4. exit reclaim
5. another task call page alloc and it cause try_to_free_pages()
6. reclaim 32 page from ZONE_HIGHMEM
7. reclaim 32 page from ZONE_NORMAL
8. reclaim 32 page from ZONE_DMA

Oops, every zone reclaimed the same number of pages even though ZONE_HIGHMEM
has much more memory than ZONE_DMA.
IOW, ZONE_DMA gets scanned much harder relative to its size than ZONE_HIGHMEM.

That isn't intentional.



Actually, try_to_free_pages() doesn't need pressure fairness; that is the
role of balance_pgdat().


> However, having dozens of direct reclaim tasks all getting
> to the lower priority levels can be disastrous, causing
> extraordinarily large amounts of memory to be swapped out
> and minutes-long stalls to applications.

agreed.

>
> I think we can come up with a middle ground here:
> - always let kswapd continue its rounds

agreed.

> - have direct reclaim tasks continue when priority == DEF_PRIORITY

Disagreed.
It causes the bad scenario above, I think.

> - break out of the loop for direct reclaim tasks, when
> priority < DEF_PRIORITY and enough pages have been freed
>
> Does that sound like it would mostly preserve memory pressure
> between zones, while avoiding the worst of the worst when it
> comes to excessive page eviction?

2008-11-24 19:18:48

by Rik van Riel

Subject: Re: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages

KOSAKI Motohiro wrote:

> 1. reclaim 32 page from ZONE_HIGHMEM
> 2. reclaim 32 page from ZONE_NORMAL
> 3. reclaim 32 page from ZONE_DMA
> 4. exit reclaim
> 5. another task call page alloc and it cause try_to_free_pages()
> 6. reclaim 32 page from ZONE_HIGHMEM
> 7. reclaim 32 page from ZONE_NORMAL
> 8. reclaim 32 page from ZONE_DMA

>> - have direct reclaim tasks continue when priority == DEF_PRIORITY
>
> Disagreed.
> It causes the bad scenario above, I think.

I think I did not explain it clearly. Let me illustrate
with a new patch. (one moment :))

--
All rights reversed.