Zone reclaim allows the reclaiming of pages from a zone if the number of free
pages falls below the watermark even if other zones still have enough pages
available. Zone reclaim is of particular importance for NUMA machines. It can
be more beneficial to reclaim a page than to take the performance penalties
that come with allocating a page on a remote zone.
The patch replaces Martin Hicks' zone reclaim function (which was never
working properly).
An arch can control zone_reclaim by setting zone_reclaim_mode during bootup
if it is discovered that the kernel is running on a NUMA configuration.
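The intended allocator flow, pulled out of the diff below for readability (a
simplified sketch of the zonelist loop in get_page_from_freelist(), not the
literal hunk):

	if (!zone_watermark_ok(zone, order, zone->pages_high,
			classzone_idx, alloc_flags)) {
		/*
		 * With zone_reclaim_mode set, try to reclaim from this
		 * zone first and only move on to the next (possibly
		 * remote) zone in the zonelist if that fails to free
		 * enough pages.
		 */
		if (zone_reclaim_mode && !zone_reclaim(zone, gfp_mask, order))
			continue;
	}
	page = buffered_rmqueue(zone, order, gfp_mask);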
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6.15-rc4/mm/page_alloc.c
===================================================================
--- linux-2.6.15-rc4.orig/mm/page_alloc.c 2005-11-30 22:25:15.000000000 -0800
+++ linux-2.6.15-rc4/mm/page_alloc.c 2005-12-06 09:12:01.000000000 -0800
@@ -842,7 +842,9 @@ get_page_from_freelist(gfp_t gfp_mask, u
mark = (*z)->pages_high;
if (!zone_watermark_ok(*z, order, mark,
classzone_idx, alloc_flags))
- continue;
+ if (zone_reclaim_mode &&
+ !zone_reclaim(*z, gfp_mask, order))
+ continue;
}
page = buffered_rmqueue(*z, order, gfp_mask);
Index: linux-2.6.15-rc4/mm/vmscan.c
===================================================================
--- linux-2.6.15-rc4.orig/mm/vmscan.c 2005-11-30 22:25:15.000000000 -0800
+++ linux-2.6.15-rc4/mm/vmscan.c 2005-12-06 09:10:00.000000000 -0800
@@ -1354,6 +1354,13 @@ static int __init kswapd_init(void)
module_init(kswapd_init)
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ */
+int zone_reclaim_mode;
/*
* Try to free up some pages from this zone through reclaim.
@@ -1362,12 +1369,13 @@ int zone_reclaim(struct zone *zone, gfp_
{
struct scan_control sc;
int nr_pages = 1 << order;
- int total_reclaimed = 0;
+ struct task_struct *p = current;
+ struct reclaim_state reclaim_state;
- /* The reclaim may sleep, so don't do it if sleep isn't allowed */
- if (!(gfp_mask & __GFP_WAIT))
- return 0;
- if (zone->all_unreclaimable)
+ if (!(gfp_mask & __GFP_WAIT) ||
+ zone->zone_pgdat->node_id != numa_node_id() ||
+ zone->all_unreclaimable ||
+ atomic_read(&zone->reclaim_in_progress) > 0)
return 0;
sc.gfp_mask = gfp_mask;
@@ -1376,24 +1384,20 @@ int zone_reclaim(struct zone *zone, gfp_
sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
- /* scan at the highest priority */
sc.priority = 0;
disable_swap_token();
- if (nr_pages > SWAP_CLUSTER_MAX)
- sc.swap_cluster_max = nr_pages;
- else
- sc.swap_cluster_max = SWAP_CLUSTER_MAX;
-
- /* Don't reclaim the zone if there are other reclaimers active */
- if (atomic_read(&zone->reclaim_in_progress) > 0)
- goto out;
+ sc.swap_cluster_max = max(nr_pages, SWAP_CLUSTER_MAX);
+ cond_resched();
+ p->flags |= PF_MEMALLOC;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
shrink_zone(zone, &sc);
- total_reclaimed = sc.nr_reclaimed;
-
- out:
- return total_reclaimed;
+ p->reclaim_state = NULL;
+ current->flags &= ~PF_MEMALLOC;
+ cond_resched();
+ return sc.nr_reclaimed > (1 << order);
}
asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
Index: linux-2.6.15-rc4/include/linux/swap.h
===================================================================
--- linux-2.6.15-rc4.orig/include/linux/swap.h 2005-11-30 22:25:15.000000000 -0800
+++ linux-2.6.15-rc4/include/linux/swap.h 2005-12-06 09:10:39.000000000 -0800
@@ -172,6 +172,7 @@ extern void swap_setup(void);
/* linux/mm/vmscan.c */
extern int try_to_free_pages(struct zone **, gfp_t);
+extern int zone_reclaim_mode;
extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
extern int shrink_all_memory(int);
extern int vm_swappiness;
Index: linux-2.6.15-rc4/arch/ia64/mm/discontig.c
===================================================================
--- linux-2.6.15-rc4.orig/arch/ia64/mm/discontig.c 2005-11-30 22:25:15.000000000 -0800
+++ linux-2.6.15-rc4/arch/ia64/mm/discontig.c 2005-12-06 09:17:29.000000000 -0800
@@ -446,6 +446,8 @@ static void __init arch_sparse_init(void
{
efi_memmap_walk(register_sparse_mem, NULL);
sparse_init();
+ /* Switch on zone reclaim */
+ zone_reclaim_mode = 1;
}
#else
#define arch_sparse_init() do {} while (0)
Remove debris of old zone reclaim
Removes the leftovers from prior attempts to implement zone reclaim.
sys_set_zone_reclaim is not reachable in 2.6.14.
The reclaim_pages field in struct zone is only used by sys_set_zone_reclaim.
Signed-off-by: Christoph Lameter <[email protected]>
Index: linux-2.6.15-rc4/include/linux/mmzone.h
===================================================================
--- linux-2.6.15-rc4.orig/include/linux/mmzone.h 2005-11-30 22:25:15.000000000 -0800
+++ linux-2.6.15-rc4/include/linux/mmzone.h 2005-12-06 09:17:58.000000000 -0800
@@ -150,11 +150,6 @@ struct zone {
unsigned long pages_scanned; /* since last reclaim */
int all_unreclaimable; /* All pages pinned */
- /*
- * Does the allocator try to reclaim pages from the zone as soon
- * as it fails a watermark_ok() in __alloc_pages?
- */
- int reclaim_pages;
/* A count of how many reclaimers are scanning this zone */
atomic_t reclaim_in_progress;
Index: linux-2.6.15-rc4/mm/vmscan.c
===================================================================
--- linux-2.6.15-rc4.orig/mm/vmscan.c 2005-12-06 09:10:00.000000000 -0800
+++ linux-2.6.15-rc4/mm/vmscan.c 2005-12-06 09:17:58.000000000 -0800
@@ -1400,33 +1400,3 @@ int zone_reclaim(struct zone *zone, gfp_
return sc.nr_reclaimed > (1 << order);
}
-asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
- unsigned int state)
-{
- struct zone *z;
- int i;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- if (node >= MAX_NUMNODES || !node_online(node))
- return -EINVAL;
-
- /* This will break if we ever add more zones */
- if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
- return -EINVAL;
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (!(zone & 1<<i))
- continue;
-
- z = &NODE_DATA(node)->node_zones[i];
-
- if (state)
- z->reclaim_pages = 1;
- else
- z->reclaim_pages = 0;
- }
-
- return 0;
-}
On Tue, Dec 06, 2005 at 09:24:44AM -0800, Christoph Lameter wrote:
> Zone reclaim allows the reclaiming of pages from a zone if the number of free
> pages falls below the watermark even if other zones still have enough pages
> available. Zone reclaim is of particular importance for NUMA machines. It can
> be more beneficial to reclaim a page than to take the performance penalties
> that come with allocating a page on a remote zone.
>
> The patch replaces Martin Hicks' zone reclaim function (which was never
> working properly).
>
> An arch can control zone_reclaim by setting zone_reclaim_mode during bootup
> if it is discovered that the kernel is running on a NUMA configuration.
Looks much better. Thanks. But how about auto-controlling the variable in generic
code based on node_distance() (at least for the non-node-hotplug case)?
> +/*
> + * Zone reclaim mode
> + *
> + * If non-zero call zone_reclaim when the number of free pages falls below
> + * the watermarks.
> + */
> +int zone_reclaim_mode;
I would mark it __read_mostly to avoid potential false sharing.
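That is, the declaration in vmscan.c would simply become (trivial sketch):

	int zone_reclaim_mode __read_mostly;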
-Andi
On Tue, 6 Dec 2005, Andi Kleen wrote:
> > An arch can control zone_reclaim by setting zone_reclaim_mode during bootup
> > if it is discovered that the kernel is running on a NUMA configuration.
>
> Looks much better. Thanks. But how about auto-controlling the variable in generic
> code based on node_distance() (at least for the non-node-hotplug case)?
Any suggestions on how to determine zone reclaim behavior based on node
distances? AFAIK the main aspects in this are latency and bandwidth of
remote accesses. These vary depending on the distance of the remote node
under consideration.
> > +int zone_reclaim_mode;
>
> I would mark it __read_mostly to avoid potential false sharing.
Ok.
On Tue, Dec 06, 2005 at 10:01:28AM -0800, Christoph Lameter wrote:
> On Tue, 6 Dec 2005, Andi Kleen wrote:
>
> > > An arch can control zone_reclaim by setting zone_reclaim_mode during bootup
> > > if it is discovered that the kernel is running on a NUMA configuration.
> >
> > Looks much better. Thanks. But how about auto-controlling the variable in generic
> > code based on node_distance() (at least for the non-node-hotplug case)?
>
> Any suggestions on how to determine zone reclaim behavior based on node
> distances? AFAIK the main aspects in this are latency and bandwidth of
> remote accesses. These vary depending on the distance of the remote node
> under consideration.
I would enable it if distance for any combination of online (or possible?) nodes is > LOCAL_DISTANCE. I guess hotplug can be ignored for now.
If an architecture really needs something better it can still be refined. But there aren't
that many NUMA architectures anyway, so it shouldn't be a big issue.
It will actually need some tweaking on Opterons because many BIOSes just
report 10 everywhere in the SLIT and it should still be enabled, but that can be done
in the architecture code then.
-Andi
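Read literally, that check could be a boot-time scan over online node pairs,
something like the sketch below (the function name is made up, LOCAL_DISTANCE
is the default that the numa.h hunk further down adds, and the actual patch
that follows folds the test into build_zonelists() instead):

	/*
	 * Illustrative only: switch zone reclaim on if any two online
	 * nodes are further apart than the local distance.
	 */
	static void __init detect_zone_reclaim_mode(void)
	{
		int a, b;

		for_each_online_node(a) {
			for_each_online_node(b)
				if (node_distance(a, b) > LOCAL_DISTANCE)
					zone_reclaim_mode = 1;
		}
	}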
On Tue, 6 Dec 2005, Andi Kleen wrote:
> I would enable it if distance for any combination of online (or possible?) nodes is > LOCAL_DISTANCE. I guess hotplug can be ignored for now.
>
> If an architecture really needs something better it can still be refined. But there aren't
> that many NUMA architectures anyway, so it shouldn't be a big issue.
>
> It will actually need some tweaking on Opterons because many BIOSes just
> report 10 everywhere in the SLIT and it should still be enabled, but that can be done
> in the architecture code then.
Here is a patch that may do what you want. The LOCAL_DISTANCE may have to
be set per arch and we may need a reasonable default:
Index: linux-2.6.15-rc4/mm/page_alloc.c
===================================================================
--- linux-2.6.15-rc4.orig/mm/page_alloc.c 2005-12-06 10:30:35.000000000 -0800
+++ linux-2.6.15-rc4/mm/page_alloc.c 2005-12-06 10:35:03.000000000 -0800
@@ -1561,13 +1561,17 @@ static void __init build_zonelists(pg_da
prev_node = local_node;
nodes_clear(used_mask);
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
+ int distance = node_distance(local_node, node);
/*
* We don't want to pressure a particular node.
* So adding penalty to the first node in same
* distance group to make it round-robin.
*/
- if (node_distance(local_node, node) !=
- node_distance(local_node, prev_node))
+
+ if (distance > LOCAL_DISTANCE)
+ zone_reclaim_mode = 1;
+
+ if (distance != node_distance(local_node, prev_node))
node_load[node] += load;
prev_node = node;
load--;
Index: linux-2.6.15-rc4/include/linux/numa.h
===================================================================
--- linux-2.6.15-rc4.orig/include/linux/numa.h 2005-11-30 22:25:15.000000000 -0800
+++ linux-2.6.15-rc4/include/linux/numa.h 2005-12-06 10:32:49.000000000 -0800
@@ -13,4 +13,8 @@
#define MAX_NUMNODES (1 << NODES_SHIFT)
+#ifndef LOCAL_DISTANCE
+#define LOCAL_DISTANCE 10
+#endif
+
#endif /* _LINUX_NUMA_H */
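The #ifndef leaves room for the per-arch tuning mentioned above: a platform
whose SLIT reports 10 for every node pair (the Opteron case) will never trip
the node_distance() check in build_zonelists(), but its boot code can still
force the mode on, the same way the ia64 hunk in the first patch does. A
minimal sketch (the hook name is illustrative, not part of the patch):

	/*
	 * Hypothetical arch boot hook: firmware claims LOCAL_DISTANCE for
	 * every node pair, so the generic check never fires -- switch
	 * zone reclaim on explicitly.
	 */
	void __init arch_enable_zone_reclaim(void)
	{
		zone_reclaim_mode = 1;
	}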