Date: Thu, 13 Nov 2008 17:12:08 -0500
From: Rik van Riel
To: linux-kernel@vger.kernel.org
Cc: linux-mm@kvack.org
Subject: [PATCH -mm] vmscan: bail out of page reclaim after swap_cluster_max pages
Message-ID: <20081113171208.6985638e@bree.surriel.com>
Organization: Red Hat, Inc.

Sometimes the VM spends the first few priority rounds rotating back
referenced pages and submitting IO.  Once we get to a lower priority,
the VM can end up freeing way too many pages.

The fix is relatively simple: in shrink_zone() we can check how many
pages we have already freed and break out of the loop early.

However, in order to do this we need to know how many pages we have
already freed, so move nr_reclaimed into scan_control.

Signed-off-by: Rik van Riel

---
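To make the shape of the change concrete, here is a minimal userspace
sketch of the same pattern (names and numbers are invented for
illustration; this is not the kernel code): the reclaim counter lives
in a context struct that is threaded through the call chain, and the
scan loop bails out as soon as it has freed more than the target,
rather than completing a full pass at the current priority.

#include <stdio.h>

#define NR_LISTS	4

struct scan_ctx {
	unsigned long nr_scanned;	/* pages looked at so far */
	unsigned long nr_reclaimed;	/* pages freed so far */
	unsigned long cluster_max;	/* bail-out target, like swap_cluster_max */
};

/* Stand-in for shrink_list(): pretend one page in eight is freeable. */
static unsigned long shrink_list_sim(unsigned long nr_to_scan,
				     struct scan_ctx *sc)
{
	sc->nr_scanned += nr_to_scan;
	return nr_to_scan / 8;
}

static void shrink_zone_sim(unsigned long nr[NR_LISTS], struct scan_ctx *sc)
{
	int todo;

	do {
		todo = 0;
		for (int l = 0; l < NR_LISTS; l++) {
			unsigned long nr_to_scan;

			if (!nr[l])
				continue;
			nr_to_scan = nr[l] < 32 ? nr[l] : 32;
			nr[l] -= nr_to_scan;
			sc->nr_reclaimed += shrink_list_sim(nr_to_scan, sc);
			if (nr[l])
				todo = 1;
		}
		/*
		 * The point of the patch: stop once enough pages have
		 * been freed, even though the lists are not empty yet.
		 */
		if (sc->nr_reclaimed > sc->cluster_max)
			break;
	} while (todo);
}

int main(void)
{
	unsigned long nr[NR_LISTS] = { 4096, 4096, 4096, 4096 };
	struct scan_ctx sc = { .cluster_max = 32 };

	shrink_zone_sim(nr, &sc);
	printf("scanned %lu, reclaimed %lu\n", sc.nr_scanned, sc.nr_reclaimed);
	return 0;
}

With these made-up numbers the loop stops after three rounds (384 pages
scanned) instead of draining all four lists (16384 pages scanned).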
 mm/vmscan.c |   60 ++++++++++++++++++++++++++++++------------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

Index: linux-2.6.28-rc2-mm1/mm/vmscan.c
===================================================================
--- linux-2.6.28-rc2-mm1.orig/mm/vmscan.c	2008-10-30 15:20:06.000000000 -0400
+++ linux-2.6.28-rc2-mm1/mm/vmscan.c	2008-11-13 17:08:35.000000000 -0500
@@ -53,6 +53,9 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
+	/* Number of pages that were freed */
+	unsigned long nr_reclaimed;
+
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -1408,16 +1411,14 @@ static void get_scan_ratio(struct zone *
 	percent[1] = 100 - percent[0];
 }
 
-
 /*
  * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
  */
-static unsigned long shrink_zone(int priority, struct zone *zone,
+static void shrink_zone(int priority, struct zone *zone,
 				struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
-	unsigned long nr_reclaimed = 0;
 	unsigned long percent[2];	/* anon @ 0; file @ 1 */
 	enum lru_list l;
 
@@ -1458,10 +1459,18 @@ static unsigned long shrink_zone(int pri
 					(unsigned long)sc->swap_cluster_max);
 				nr[l] -= nr_to_scan;
 
-				nr_reclaimed += shrink_list(l, nr_to_scan,
+				sc->nr_reclaimed += shrink_list(l, nr_to_scan,
 							zone, sc, priority);
 			}
 		}
+		/*
+		 * On large memory systems, scan >> priority can become
+		 * really large. This is OK if we need to scan through
+		 * that many pages (referenced, dirty, etc), but make
+		 * sure to stop if we already freed enough.
+		 */
+		if (sc->nr_reclaimed > sc->swap_cluster_max)
+			break;
 	}
 
 	/*
@@ -1474,7 +1483,6 @@ static unsigned long shrink_zone(int pri
 		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
 	throttle_vm_writeout(sc->gfp_mask);
-	return nr_reclaimed;
 }
 
@@ -1488,16 +1496,13 @@ static unsigned long shrink_zone(int pri
  * b) The zones may be over pages_high but they must go *over* pages_high to
  *    satisfy the `incremental min' zone defense algorithm.
  *
- * Returns the number of reclaimed pages.
- *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
-	unsigned long nr_reclaimed = 0;
 	struct zoneref *z;
 	struct zone *zone;
 
@@ -1528,10 +1533,8 @@ static unsigned long shrink_zones(int pr
 							priority);
 		}
 
-		nr_reclaimed += shrink_zone(priority, zone, sc);
+		shrink_zone(priority, zone, sc);
 	}
-
-	return nr_reclaimed;
 }
 
@@ -1556,7 +1559,6 @@ static unsigned long do_try_to_free_page
 	int priority;
 	unsigned long ret = 0;
 	unsigned long total_scanned = 0;
-	unsigned long nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	struct zoneref *z;
 
@@ -1584,7 +1586,7 @@ static unsigned long do_try_to_free_page
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_zones(priority, zonelist, sc);
+		shrink_zones(priority, zonelist, sc);
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
@@ -1592,13 +1594,13 @@ static unsigned long do_try_to_free_page
 		if (scan_global_lru(sc)) {
 			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
 			if (reclaim_state) {
-				nr_reclaimed += reclaim_state->reclaimed_slab;
+				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 				reclaim_state->reclaimed_slab = 0;
 			}
 		}
 		total_scanned += sc->nr_scanned;
-		if (nr_reclaimed >= sc->swap_cluster_max) {
-			ret = nr_reclaimed;
+		if (sc->nr_reclaimed >= sc->swap_cluster_max) {
+			ret = sc->nr_reclaimed;
 			goto out;
 		}
 
@@ -1621,7 +1623,7 @@ static unsigned long do_try_to_free_page
 	}
 	/* top priority shrink_zones still had more to do? don't OOM, then */
 	if (!sc->all_unreclaimable && scan_global_lru(sc))
-		ret = nr_reclaimed;
+		ret = sc->nr_reclaimed;
 out:
 	/*
 	 * Now that we've scanned all the zones at this priority level, note
@@ -1716,7 +1718,6 @@ static unsigned long balance_pgdat(pg_da
 	int priority;
 	int i;
 	unsigned long total_scanned;
-	unsigned long nr_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -1735,7 +1736,7 @@ static unsigned long balance_pgdat(pg_da
 loop_again:
 	total_scanned = 0;
-	nr_reclaimed = 0;
+	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
@@ -1821,11 +1822,11 @@ loop_again:
 		 */
 		if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
 					end_zone, 0))
-			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			shrink_zone(priority, zone, &sc);
 		reclaim_state->reclaimed_slab = 0;
 		nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 					lru_pages);
-		nr_reclaimed += reclaim_state->reclaimed_slab;
+		sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 		total_scanned += sc.nr_scanned;
 		if (zone_is_all_unreclaimable(zone))
 			continue;
@@ -1839,7 +1840,7 @@ loop_again:
 		 * even in laptop mode
 		 */
 		if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-		    total_scanned > nr_reclaimed + nr_reclaimed / 2)
+		    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 			sc.may_writepage = 1;
 	}
 	if (all_zones_ok)
@@ -1857,7 +1858,7 @@ loop_again:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
+		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	}
 out:
@@ -1879,7 +1880,7 @@ out:
 		goto loop_again;
 	}
 
-	return nr_reclaimed;
+	return sc.nr_reclaimed;
 }
 
 /*
@@ -2231,7 +2232,6 @@ static int __zone_reclaim(struct zone *z
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int priority;
-	unsigned long nr_reclaimed = 0;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2264,9 +2264,9 @@ static int __zone_reclaim(struct zone *z
 		priority = ZONE_RECLAIM_PRIORITY;
 		do {
 			note_zone_scanning_priority(zone, priority);
-			nr_reclaimed += shrink_zone(priority, zone, &sc);
+			shrink_zone(priority, zone, &sc);
 			priority--;
-		} while (priority >= 0 && nr_reclaimed < nr_pages);
+		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
 
 	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2290,13 +2290,13 @@ static int __zone_reclaim(struct zone *z
 		 * Update nr_reclaimed by the number of slab pages we
 		 * reclaimed from this zone.
 		 */
-		nr_reclaimed += slab_reclaimable -
+		sc.nr_reclaimed += slab_reclaimable -
 			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
 	}
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-	return nr_reclaimed >= nr_pages;
+	return sc.nr_reclaimed >= nr_pages;
 }
 
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
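P.S. For a sense of scale on the "scan >> priority" comment above: each
pass over a zone scans roughly its LRU size shifted right by the
priority, so the per-pass quota doubles every round and becomes the
whole zone at priority 0. A throwaway sketch, with an assumed zone
size (not kernel code):

#include <stdio.h>

int main(void)
{
	/* Assumed zone: 4M LRU pages, i.e. 16GB of 4kB pages. */
	unsigned long lru_pages = 4UL << 20;

	/* DEF_PRIORITY is 12; swap_cluster_max is typically 32 pages. */
	for (int priority = 12; priority >= 0; priority--)
		printf("priority %2d: scan quota ~%lu pages (goal: 32)\n",
		       priority, lru_pages >> priority);
	return 0;
}

Even the first pass scans far more than the 32-page goal, and the later
passes scan millions of pages, which is why bailing out as soon as
swap_cluster_max pages have been freed matters on large machines.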