From: Mel Gorman <mgorman@suse.de>
To: Andrew Morton
Cc: Linux-MM, Linux-Netdev, LKML, David Miller, Neil Brown, Peter Zijlstra,
	Mel Gorman
Subject: [PATCH 13/14] mm: Throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
Date: Thu, 9 Jun 2011 09:02:52 +0100
Message-Id: <1307606573-24704-14-git-send-email-mgorman@suse.de>
X-Mailer: git-send-email 1.7.3.4
In-Reply-To: <1307606573-24704-1-git-send-email-mgorman@suse.de>
References: <1307606573-24704-1-git-send-email-mgorman@suse.de>

If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all of the
PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune
min_free_kbytes in advance. This patch throttles direct reclaimers once
half of the PF_MEMALLOC reserves are in use, as the system is then at
risk of hanging.

Signed-off-by: Mel Gorman <mgorman@suse.de>
---
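[Note: the sketch below is a back-of-the-envelope illustration of the
throttling threshold, re-implementing the pfmemalloc_watermark_ok()
arithmetic as ordinary userspace C. The zone_sample struct, the
watermark_ok() helper and the zone numbers are invented for the example;
only the "free pages must exceed half the summed min watermarks" test is
taken from the patch itself.]

	#include <stdio.h>
	#include <stdbool.h>

	/* Invented stand-in for the per-zone counters the patch reads */
	struct zone_sample {
		unsigned long min_wmark;	/* min_wmark_pages(zone) */
		unsigned long nr_free;		/* NR_FREE_PAGES */
	};

	static bool watermark_ok(const struct zone_sample *zones, int nr)
	{
		unsigned long reserve = 0, nr_free = 0;
		int i;

		for (i = 0; i < nr; i++) {
			reserve += zones[i].min_wmark;
			nr_free += zones[i].nr_free;
		}

		/* Same test as the patch: over half the reserve must be free */
		return nr_free > reserve / 2;
	}

	int main(void)
	{
		/* e.g. two zones with min watermarks of 128 and 1920 pages */
		struct zone_sample zones[] = { { 128, 50 }, { 1920, 900 } };

		printf("throttle: %s\n", watermark_ok(zones, 2) ? "no" : "yes");
		return 0;
	}

With these invented numbers the reserve totals 2048 pages while only 950
are free, so the check fails and a direct reclaimer would sleep on
pfmemalloc_wait until kswapd brings free pages back above 1024.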
 include/linux/mmzone.h |    1 +
 mm/page_alloc.c        |    1 +
 mm/vmscan.c            |   54 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 0 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c928dac..5b32906 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -633,6 +633,7 @@ typedef struct pglist_data {
 					     range, including holes */
 	int node_id;
 	wait_queue_head_t kswapd_wait;
+	wait_queue_head_t pfmemalloc_wait;
 	struct task_struct *kswapd;
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ac779f5..00eea0f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4288,6 +4288,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat->kswapd_max_order = 0;
 	pgdat_page_cgroup_init(pgdat);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a08..fe95e4f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2154,6 +2154,45 @@ out:
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat, int high_zoneidx)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+
+	for (i = 0; i <= high_zoneidx; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	return free_pages > pfmemalloc_reserve / 2;
+}
+
+/*
+ * Throttle direct reclaimers if swap is backed by network storage and the
+ * PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached.
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	DEFINE_WAIT(wait);
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	if (pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx))
+		return;
+
+	/* Throttle until the pfmemalloc reserves are restored */
+	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(zone->zone_pgdat, high_zoneidx));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
@@ -2173,6 +2212,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if a fatal signal is pending. 1 is returned
+	 * so that the page allocator does not consider triggering OOM.
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
@@ -2541,6 +2589,12 @@ loop_again:
 			}
 		}
 
+		/* Wake throttled direct reclaimers if low watermark is met */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat, MAX_NR_ZONES - 1))
+			wake_up_interruptible(&pgdat->pfmemalloc_wait);
+
 		if (all_zones_ok || (order && pgdat_balanced(pgdat,
 						balanced, *classzone_idx)))
 			break;		/* kswapd: all done */
 		/*
-- 
1.7.3.4