2009-03-23 01:05:38

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: [PATCH] fix vmscan to take care of nodemask

From: KAMEZAWA Hiroyuki <[email protected]>

try_to_free_pages() scans zonelist but doesn't take care of nodemask which is
passed to alloc_pages_nodemask(). This makes try_to_free_pages() less effective.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
fs/buffer.c | 2 +-
include/linux/swap.h | 2 +-
mm/page_alloc.c | 3 ++-
mm/vmscan.c | 14 ++++++++++++--
4 files changed, 16 insertions(+), 5 deletions(-)

Index: mmotm-2.6.29-Mar21/mm/vmscan.c
===================================================================
--- mmotm-2.6.29-Mar21.orig/mm/vmscan.c
+++ mmotm-2.6.29-Mar21/mm/vmscan.c
@@ -79,6 +79,9 @@ struct scan_control {
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;

+ /* Nodemask */
+ nodemask_t *nodemask;
+
/* Pluggable isolate pages callback */
unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
unsigned long *scanned, int order, int mode,
@@ -1544,7 +1547,9 @@ static void shrink_zones(int priority, s
struct zone *zone;

sc->all_unreclaimable = 1;
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ /* Note: sc->nodemask==NULL means scan all node */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+ sc->nodemask) {
if (!populated_zone(zone))
continue;
/*
@@ -1689,7 +1694,7 @@ out:
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
- gfp_t gfp_mask)
+ gfp_t gfp_mask, nodemask_t *nodemask)
{
struct scan_control sc = {
.gfp_mask = gfp_mask,
@@ -1700,6 +1705,7 @@ unsigned long try_to_free_pages(struct z
.order = order,
.mem_cgroup = NULL,
.isolate_pages = isolate_pages_global,
+ .nodemask = nodemask,
};

return do_try_to_free_pages(zonelist, &sc);
@@ -1720,6 +1726,7 @@ unsigned long try_to_free_mem_cgroup_pag
.order = 0,
.mem_cgroup = mem_cont,
.isolate_pages = mem_cgroup_isolate_pages,
+ .nodemask = NULL,
};
struct zonelist *zonelist;

@@ -1769,6 +1776,7 @@ static unsigned long balance_pgdat(pg_da
.order = order,
.mem_cgroup = NULL,
.isolate_pages = isolate_pages_global,
+ .nodemask = NULL,
};
/*
* temp_priority is used to remember the scanning priority at which
@@ -2112,6 +2120,7 @@ unsigned long shrink_all_memory(unsigned
.may_unmap = 0,
.may_writepage = 1,
.isolate_pages = isolate_pages_global,
+ .nodemask = NULL,
};

current->reclaim_state = &reclaim_state;
@@ -2298,6 +2307,7 @@ static int __zone_reclaim(struct zone *z
.swappiness = vm_swappiness,
.order = order,
.isolate_pages = isolate_pages_global,
+ .nodemask = NULL,
};
unsigned long slab_reclaimable;

Index: mmotm-2.6.29-Mar21/include/linux/swap.h
===================================================================
--- mmotm-2.6.29-Mar21.orig/include/linux/swap.h
+++ mmotm-2.6.29-Mar21/include/linux/swap.h
@@ -213,7 +213,7 @@ static inline void lru_cache_add_active_

/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
- gfp_t gfp_mask);
+ gfp_t gfp_mask, nodemask_t *mask);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness);
Index: mmotm-2.6.29-Mar21/mm/page_alloc.c
===================================================================
--- mmotm-2.6.29-Mar21.orig/mm/page_alloc.c
+++ mmotm-2.6.29-Mar21/mm/page_alloc.c
@@ -1598,7 +1598,8 @@ nofail_alloc:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist, order,
+ gfp_mask, nodemask);

p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
Index: mmotm-2.6.29-Mar21/fs/buffer.c
===================================================================
--- mmotm-2.6.29-Mar21.orig/fs/buffer.c
+++ mmotm-2.6.29-Mar21/fs/buffer.c
@@ -476,7 +476,7 @@ static void free_more_memory(void)
&zone);
if (zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
- GFP_NOFS);
+ GFP_NOFS, NULL);
}
}


2009-03-23 01:15:29

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH] fix vmscan to take care of nodemask

Kosaki pointed out that it's not necessary to initialize the struct member to NULL.
Removed it.

Regards,
-Kame
==
From: KAMEZAWA Hiroyuki <[email protected]>

try_to_free_pages() scans the zonelist but doesn't take care of the nodemask which is
given to alloc_pages_nodemask(). This makes try_to_free_pages() less effective.

Changelog: v1 -> v2
- removed unnecessary nodemask=NULL initialization.

Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
Index: mmotm-2.6.29-Mar21/mm/vmscan.c
===================================================================
--- mmotm-2.6.29-Mar21.orig/mm/vmscan.c
+++ mmotm-2.6.29-Mar21/mm/vmscan.c
@@ -79,6 +79,9 @@ struct scan_control {
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;

+ /* Nodemask */
+ nodemask_t *nodemask;
+
/* Pluggable isolate pages callback */
unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
unsigned long *scanned, int order, int mode,
@@ -1544,7 +1547,9 @@ static void shrink_zones(int priority, s
struct zone *zone;

sc->all_unreclaimable = 1;
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ /* Note: sc->nodemask==NULL means scan all node */
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+ sc->nodemask) {
if (!populated_zone(zone))
continue;
/*
@@ -1689,7 +1694,7 @@ out:
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
- gfp_t gfp_mask)
+ gfp_t gfp_mask, nodemask_t *nodemask)
{
struct scan_control sc = {
.gfp_mask = gfp_mask,
@@ -1700,6 +1705,7 @@ unsigned long try_to_free_pages(struct z
.order = order,
.mem_cgroup = NULL,
.isolate_pages = isolate_pages_global,
+ .nodemask = nodemask,
};

return do_try_to_free_pages(zonelist, &sc);
Index: mmotm-2.6.29-Mar21/include/linux/swap.h
===================================================================
--- mmotm-2.6.29-Mar21.orig/include/linux/swap.h
+++ mmotm-2.6.29-Mar21/include/linux/swap.h
@@ -213,7 +213,7 @@ static inline void lru_cache_add_active_

/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
- gfp_t gfp_mask);
+ gfp_t gfp_mask, nodemask_t *mask);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness);
Index: mmotm-2.6.29-Mar21/mm/page_alloc.c
===================================================================
--- mmotm-2.6.29-Mar21.orig/mm/page_alloc.c
+++ mmotm-2.6.29-Mar21/mm/page_alloc.c
@@ -1598,7 +1598,8 @@ nofail_alloc:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist, order,
+ gfp_mask, nodemask);

p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
Index: mmotm-2.6.29-Mar21/fs/buffer.c
===================================================================
--- mmotm-2.6.29-Mar21.orig/fs/buffer.c
+++ mmotm-2.6.29-Mar21/fs/buffer.c
@@ -476,7 +476,7 @@ static void free_more_memory(void)
&zone);
if (zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
- GFP_NOFS);
+ GFP_NOFS, NULL);
}
}

2009-03-23 01:20:34

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [PATCH] fix vmscan to take care of nodemask

> Kosaki pointed out it's not necessary to initialize struct member value by NULL.
> Remvoed it.
>
> Regards,
> -Kame
> ==
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> try_to_free_pages() scans zonelist but don't take care of nodemask which is
> given to alloc_pages_nodemask(). This makes try_to_free_pages() less effective.

Yes, ignoring the nodemask causes unnecessary reclaim, which decreases
try_to_free_pages() performance.

Note: currently, try_to_free_pages() stops processing reclaim after 32 pages have been
reclaimed. Unintentional node scanning can therefore cause a large performance degradation.

>
> Changelog: v1 -> v2
> - removed unnecessary nodemask=NULL initialization.
>
> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>

thanks.
Reviewed-by: KOSAKI Motohiro <[email protected]>


2009-03-23 11:48:28

by Mel Gorman

[permalink] [raw]
Subject: Re: [PATCH] fix vmscan to take care of nodemask

On Mon, Mar 23, 2009 at 10:03:56AM +0900, KAMEZAWA Hiroyuki wrote:
> From: KAMEZAWA Hiroyuki <[email protected]>
>
> try_to_free_pages() scans zonelist but doesn't take care of nodemask which is
> passed to alloc_pages_nodemask(). This makes try_to_free_pages() less effective.
>

Hmm, do you mind if I try expanding that changelog a bit?

====

try_to_free_pages() is used for the direct reclaim of up to
SWAP_CLUSTER_MAX pages when watermarks are low. The caller to
alloc_pages_nodemask() can specify a nodemask of nodes that are allowed
to be used but this is not passed to try_to_free_pages(). This can lead
to the unnecessary reclaim of pages that are unusable by the caller and
in the worst case lead to allocation failure as progress has not been
made where it is needed.

This patch passes the nodemask used for alloc_pages_nodemask() to
try_to_free_pages().

====

?

> Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> ---
> fs/buffer.c | 2 +-
> include/linux/swap.h | 2 +-
> mm/page_alloc.c | 3 ++-
> mm/vmscan.c | 14 ++++++++++++--
> 4 files changed, 16 insertions(+), 5 deletions(-)
>
> Index: mmotm-2.6.29-Mar21/mm/vmscan.c
> ===================================================================
> --- mmotm-2.6.29-Mar21.orig/mm/vmscan.c
> +++ mmotm-2.6.29-Mar21/mm/vmscan.c
> @@ -79,6 +79,9 @@ struct scan_control {
> /* Which cgroup do we reclaim from */
> struct mem_cgroup *mem_cgroup;
>
> + /* Nodemask */
> + nodemask_t *nodemask;
> +

That comment is not a whole pile of help

/*
* nodemask of nodes allowed by the caller. Note that nodemask==NULL
* means scan all nodes
*/

?

> /* Pluggable isolate pages callback */
> unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
> unsigned long *scanned, int order, int mode,
> @@ -1544,7 +1547,9 @@ static void shrink_zones(int priority, s
> struct zone *zone;
>
> sc->all_unreclaimable = 1;
> - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> + /* Note: sc->nodemask==NULL means scan all node */

Your choice, but I moved this comment to scan_control. I don't mind
where it is really.

> + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
> + sc->nodemask) {
> if (!populated_zone(zone))
> continue;
> /*
> @@ -1689,7 +1694,7 @@ out:
> }
>
> unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> - gfp_t gfp_mask)
> + gfp_t gfp_mask, nodemask_t *nodemask)
> {
> struct scan_control sc = {
> .gfp_mask = gfp_mask,
> @@ -1700,6 +1705,7 @@ unsigned long try_to_free_pages(struct z
> .order = order,
> .mem_cgroup = NULL,
> .isolate_pages = isolate_pages_global,
> + .nodemask = nodemask,
> };
>
> return do_try_to_free_pages(zonelist, &sc);
> @@ -1720,6 +1726,7 @@ unsigned long try_to_free_mem_cgroup_pag
> .order = 0,
> .mem_cgroup = mem_cont,
> .isolate_pages = mem_cgroup_isolate_pages,
> + .nodemask = NULL,

While strictly speaking, this is unnecessary, I prefer it because it
tells a reader that "yes, I really meant it to be NULL".

> };
> struct zonelist *zonelist;
>
> @@ -1769,6 +1776,7 @@ static unsigned long balance_pgdat(pg_da
> .order = order,
> .mem_cgroup = NULL,
> .isolate_pages = isolate_pages_global,
> + .nodemask = NULL,
> };
> /*
> * temp_priority is used to remember the scanning priority at which
> @@ -2112,6 +2120,7 @@ unsigned long shrink_all_memory(unsigned
> .may_unmap = 0,
> .may_writepage = 1,
> .isolate_pages = isolate_pages_global,
> + .nodemask = NULL,
> };
>
> current->reclaim_state = &reclaim_state;
> @@ -2298,6 +2307,7 @@ static int __zone_reclaim(struct zone *z
> .swappiness = vm_swappiness,
> .order = order,
> .isolate_pages = isolate_pages_global,
> + .nodemask = NULL,
> };
> unsigned long slab_reclaimable;
>
> Index: mmotm-2.6.29-Mar21/include/linux/swap.h
> ===================================================================
> --- mmotm-2.6.29-Mar21.orig/include/linux/swap.h
> +++ mmotm-2.6.29-Mar21/include/linux/swap.h
> @@ -213,7 +213,7 @@ static inline void lru_cache_add_active_
>
> /* linux/mm/vmscan.c */
> extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> - gfp_t gfp_mask);
> + gfp_t gfp_mask, nodemask_t *mask);
> extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
> gfp_t gfp_mask, bool noswap,
> unsigned int swappiness);
> Index: mmotm-2.6.29-Mar21/mm/page_alloc.c
> ===================================================================
> --- mmotm-2.6.29-Mar21.orig/mm/page_alloc.c
> +++ mmotm-2.6.29-Mar21/mm/page_alloc.c
> @@ -1598,7 +1598,8 @@ nofail_alloc:
> reclaim_state.reclaimed_slab = 0;
> p->reclaim_state = &reclaim_state;
>
> - did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
> + did_some_progress = try_to_free_pages(zonelist, order,
> + gfp_mask, nodemask);
>
> p->reclaim_state = NULL;
> lockdep_clear_current_reclaim_state();
> Index: mmotm-2.6.29-Mar21/fs/buffer.c
> ===================================================================
> --- mmotm-2.6.29-Mar21.orig/fs/buffer.c
> +++ mmotm-2.6.29-Mar21/fs/buffer.c
> @@ -476,7 +476,7 @@ static void free_more_memory(void)
> &zone);
> if (zone)
> try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
> - GFP_NOFS);
> + GFP_NOFS, NULL);
> }
> }
>

Otherwise looks good. With a better changelog and a comment other than "/*
nodemask */" explaining what nodemask is;

Acked-by: Mel Gorman <[email protected]>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab

2009-03-23 13:42:07

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] fix vmscan to take care of nodemask

On Mon, 23 Mar 2009, Mel Gorman wrote:

> try_to_free_pages() is used for the direct reclaim of up to
> SWAP_CLUSTER_MAX pages when watermarks are low. The caller to
> alloc_pages_nodemask() can specify a nodemask of nodes that are allowed
> to be used but this is not passed to try_to_free_pages(). This can lead
> to the unnecessary reclaim of pages that are unusable by the caller and
> in the worst case lead to allocation failure as progress was not been
> made where it is needed.
>
> This patch passes the nodemask used for alloc_pages_nodemask() to
> try_to_free_pages().


This is only useful for MPOL_BIND. Direct reclaim within a cpuset already
honors the boundaries of the cpuset.

2009-03-23 15:31:25

by KOSAKI Motohiro

[permalink] [raw]
Subject: Re: [PATCH] fix vmscan to take care of nodemask

2009/3/23 Christoph Lameter <[email protected]>:
> On Mon, 23 Mar 2009, Mel Gorman wrote:
>
>> try_to_free_pages() is used for the direct reclaim of up to
>> SWAP_CLUSTER_MAX pages when watermarks are low. The caller to
>> alloc_pages_nodemask() can specify a nodemask of nodes that are allowed
>> to be used but this is not passed to try_to_free_pages(). This can lead
>> to the unnecessary reclaim of pages that are unusable by the caller and
>> in the worst case lead to allocation failure as progress was not been
>> made where it is needed.
>>
>> This patch passes the nodemask used for alloc_pages_nodemask() to
>> try_to_free_pages().
>
> This is only useful for MPOL_BIND. Direct reclaim within a cpuset already
> honors the boundaries of the cpuset.

Do you mean nak or comment adding request?
I agree you. but I don't find any weak point of this patch.

2009-03-23 15:43:20

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH] fix vmscan to take care of nodemask

On Tue, 24 Mar 2009, KOSAKI Motohiro wrote:

> 2009/3/23 Christoph Lameter <[email protected]>:
> > On Mon, 23 Mar 2009, Mel Gorman wrote:
> >
> >> try_to_free_pages() is used for the direct reclaim of up to
> >> SWAP_CLUSTER_MAX pages when watermarks are low. The caller to
> >> alloc_pages_nodemask() can specify a nodemask of nodes that are allowed
> >> to be used but this is not passed to try_to_free_pages(). This can lead
> >> to the unnecessary reclaim of pages that are unusable by the caller and
> >> in the worst case lead to allocation failure as progress was not been
> >> made where it is needed.
> >>
> >> This patch passes the nodemask used for alloc_pages_nodemask() to
> >> try_to_free_pages().
> >
> > This is only useful for MPOL_BIND. Direct reclaim within a cpuset already
> > honors the boundaries of the cpuset.
>
> Do you mean nak or comment adding request?
> I agree you. but I don't find any weak point of this patch.

Just a comment.

2009-03-24 01:33:21

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: Re: [PATCH] fix vmscan to take care of nodemask

On Mon, 23 Mar 2009 11:48:14 +0000
Mel Gorman <[email protected]> wrote:

> On Mon, Mar 23, 2009 at 10:03:56AM +0900, KAMEZAWA Hiroyuki wrote:
> > From: KAMEZAWA Hiroyuki <[email protected]>
> >
> > try_to_free_pages() scans zonelist but doesn't take care of nodemask which is
> > passed to alloc_pages_nodemask(). This makes try_to_free_pages() less effective.
> >
>
> Hmm, do you mind if I try expanding that changelog a bit?
>
> ====
>
> try_to_free_pages() is used for the direct reclaim of up to
> SWAP_CLUSTER_MAX pages when watermarks are low. The caller to
> alloc_pages_nodemask() can specify a nodemask of nodes that are allowed
> to be used but this is not passed to try_to_free_pages(). This can lead
> to the unnecessary reclaim of pages that are unusable by the caller and
> in the worst case lead to allocation failure as progress was not been
> made where it is needed.
>
> This patch passes the nodemask used for alloc_pages_nodemask() to
> try_to_free_pages().
>
> ====
>
> ?

Thank you. I gladly use your text :)

>
> > Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
> > ---
> > fs/buffer.c | 2 +-
> > include/linux/swap.h | 2 +-
> > mm/page_alloc.c | 3 ++-
> > mm/vmscan.c | 14 ++++++++++++--
> > 4 files changed, 16 insertions(+), 5 deletions(-)
> >
> > Index: mmotm-2.6.29-Mar21/mm/vmscan.c
> > ===================================================================
> > --- mmotm-2.6.29-Mar21.orig/mm/vmscan.c
> > +++ mmotm-2.6.29-Mar21/mm/vmscan.c
> > @@ -79,6 +79,9 @@ struct scan_control {
> > /* Which cgroup do we reclaim from */
> > struct mem_cgroup *mem_cgroup;
> >
> > + /* Nodemask */
> > + nodemask_t *nodemask;
> > +
>
> That comment is not a whole pile of help
>
> /*
> * nodemask of nodes allowed by the caller. Note that nodemask==NULL
> * means scal all nods
> */
>
> ?
>
ok.

> > /* Pluggable isolate pages callback */
> > unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
> > unsigned long *scanned, int order, int mode,
> > @@ -1544,7 +1547,9 @@ static void shrink_zones(int priority, s
> > struct zone *zone;
> >
> > sc->all_unreclaimable = 1;
> > - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
> > + /* Note: sc->nodemask==NULL means scan all node */
>
> Your choice, but I moved this comment to scan_control. I don't mind
> where it is really.
>
ok.


> > + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
> > + sc->nodemask) {
> > if (!populated_zone(zone))
> > continue;
> > /*
> > @@ -1689,7 +1694,7 @@ out:
> > }
> >
> > unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> > - gfp_t gfp_mask)
> > + gfp_t gfp_mask, nodemask_t *nodemask)
> > {
> > struct scan_control sc = {
> > .gfp_mask = gfp_mask,
> > @@ -1700,6 +1705,7 @@ unsigned long try_to_free_pages(struct z
> > .order = order,
> > .mem_cgroup = NULL,
> > .isolate_pages = isolate_pages_global,
> > + .nodemask = nodemask,
> > };
> >
> > return do_try_to_free_pages(zonelist, &sc);
> > @@ -1720,6 +1726,7 @@ unsigned long try_to_free_mem_cgroup_pag
> > .order = 0,
> > .mem_cgroup = mem_cont,
> > .isolate_pages = mem_cgroup_isolate_pages,
> > + .nodemask = NULL,
>
> While strictly speaking, this is unnecessary, I prefer it because it
> tells a reader that "yes, I really meant it to be NULL".
>
> > };
> > struct zonelist *zonelist;
> >
> > @@ -1769,6 +1776,7 @@ static unsigned long balance_pgdat(pg_da
> > .order = order,
> > .mem_cgroup = NULL,
> > .isolate_pages = isolate_pages_global,
> > + .nodemask = NULL,
> > };
> > /*
> > * temp_priority is used to remember the scanning priority at which
> > @@ -2112,6 +2120,7 @@ unsigned long shrink_all_memory(unsigned
> > .may_unmap = 0,
> > .may_writepage = 1,
> > .isolate_pages = isolate_pages_global,
> > + .nodemask = NULL,
> > };
> >
> > current->reclaim_state = &reclaim_state;
> > @@ -2298,6 +2307,7 @@ static int __zone_reclaim(struct zone *z
> > .swappiness = vm_swappiness,
> > .order = order,
> > .isolate_pages = isolate_pages_global,
> > + .nodemask = NULL,
> > };
> > unsigned long slab_reclaimable;
> >
> > Index: mmotm-2.6.29-Mar21/include/linux/swap.h
> > ===================================================================
> > --- mmotm-2.6.29-Mar21.orig/include/linux/swap.h
> > +++ mmotm-2.6.29-Mar21/include/linux/swap.h
> > @@ -213,7 +213,7 @@ static inline void lru_cache_add_active_
> >
> > /* linux/mm/vmscan.c */
> > extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> > - gfp_t gfp_mask);
> > + gfp_t gfp_mask, nodemask_t *mask);
> > extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
> > gfp_t gfp_mask, bool noswap,
> > unsigned int swappiness);
> > Index: mmotm-2.6.29-Mar21/mm/page_alloc.c
> > ===================================================================
> > --- mmotm-2.6.29-Mar21.orig/mm/page_alloc.c
> > +++ mmotm-2.6.29-Mar21/mm/page_alloc.c
> > @@ -1598,7 +1598,8 @@ nofail_alloc:
> > reclaim_state.reclaimed_slab = 0;
> > p->reclaim_state = &reclaim_state;
> >
> > - did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
> > + did_some_progress = try_to_free_pages(zonelist, order,
> > + gfp_mask, nodemask);
> >
> > p->reclaim_state = NULL;
> > lockdep_clear_current_reclaim_state();
> > Index: mmotm-2.6.29-Mar21/fs/buffer.c
> > ===================================================================
> > --- mmotm-2.6.29-Mar21.orig/fs/buffer.c
> > +++ mmotm-2.6.29-Mar21/fs/buffer.c
> > @@ -476,7 +476,7 @@ static void free_more_memory(void)
> > &zone);
> > if (zone)
> > try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
> > - GFP_NOFS);
> > + GFP_NOFS, NULL);
> > }
> > }
> >
>
> Otherwise looks good. With a better changelog and a comment other than "/*
> nodemask */" explaining what nodemask is;
>
> Acked-by: Mel Gorman <[email protected]>
>

Thank you for review !

Regards,
-Kame


> --
> Mel Gorman
> Part-time Phd Student Linux Technology Center
> University of Limerick IBM Dublin Software Lab
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2009-03-24 02:28:22

by Kamezawa Hiroyuki

[permalink] [raw]
Subject: [PATCH] fix vmscan to take care of nodemask v3

Thank you for all comments :)
-Kame
==
From: KAMEZAWA Hiroyuki <[email protected]>

try_to_free_pages() is used for the direct reclaim of up to SWAP_CLUSTER_MAX
pages when watermarks are low. The caller to alloc_pages_nodemask() can
specify a nodemask of nodes that are allowed to be used but this is not
passed to try_to_free_pages(). This can lead to unnecessary reclaim of pages
that are unusable by the caller and in the worst case lead to allocation
failure as progress has not been made where it is needed.

This patch passes the nodemask used for alloc_pages_nodemask() to
try_to_free_pages().

Changelog: v2 -> v3
- rewrote the patch description.
- enhanced comment text.
- added .nodemask = NULL for try_to_free_mem_cgroup_pages() to show the
difference with try_to_free_pages().
Changelog: v1 -> v2
- removed unnecessary nodemask=NULL initialization.

Reviewed-by: KOSAKI Motohiro <[email protected]>
Acked-by: Mel Gorman <[email protected]>
Signed-off-by: KAMEZAWA Hiroyuki <[email protected]>
---
fs/buffer.c | 2 +-
include/linux/swap.h | 2 +-
mm/page_alloc.c | 3 ++-
mm/vmscan.c | 13 +++++++++++--
4 files changed, 15 insertions(+), 5 deletions(-)

Index: mmotm-2.6.29-Mar21/mm/vmscan.c
===================================================================
--- mmotm-2.6.29-Mar21.orig/mm/vmscan.c
+++ mmotm-2.6.29-Mar21/mm/vmscan.c
@@ -79,6 +79,12 @@ struct scan_control {
/* Which cgroup do we reclaim from */
struct mem_cgroup *mem_cgroup;

+ /*
+ * Nodemask of nodes allowed by the caller. If NULL, all nodes
+ * are scanned.
+ */
+ nodemask_t *nodemask;
+
/* Pluggable isolate pages callback */
unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
unsigned long *scanned, int order, int mode,
@@ -1544,7 +1550,8 @@ static void shrink_zones(int priority, s
struct zone *zone;

sc->all_unreclaimable = 1;
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+ sc->nodemask) {
if (!populated_zone(zone))
continue;
/*
@@ -1689,7 +1696,7 @@ out:
}

unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
- gfp_t gfp_mask)
+ gfp_t gfp_mask, nodemask_t *nodemask)
{
struct scan_control sc = {
.gfp_mask = gfp_mask,
@@ -1700,6 +1707,7 @@ unsigned long try_to_free_pages(struct z
.order = order,
.mem_cgroup = NULL,
.isolate_pages = isolate_pages_global,
+ .nodemask = nodemask,
};

return do_try_to_free_pages(zonelist, &sc);
@@ -1720,6 +1728,7 @@ unsigned long try_to_free_mem_cgroup_pag
.order = 0,
.mem_cgroup = mem_cont,
.isolate_pages = mem_cgroup_isolate_pages,
+ .nodemask = NULL, /* we don't care the placement */
};
struct zonelist *zonelist;

Index: mmotm-2.6.29-Mar21/include/linux/swap.h
===================================================================
--- mmotm-2.6.29-Mar21.orig/include/linux/swap.h
+++ mmotm-2.6.29-Mar21/include/linux/swap.h
@@ -213,7 +213,7 @@ static inline void lru_cache_add_active_

/* linux/mm/vmscan.c */
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
- gfp_t gfp_mask);
+ gfp_t gfp_mask, nodemask_t *mask);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness);
Index: mmotm-2.6.29-Mar21/mm/page_alloc.c
===================================================================
--- mmotm-2.6.29-Mar21.orig/mm/page_alloc.c
+++ mmotm-2.6.29-Mar21/mm/page_alloc.c
@@ -1598,7 +1598,8 @@ nofail_alloc:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist, order,
+ gfp_mask, nodemask);

p->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
Index: mmotm-2.6.29-Mar21/fs/buffer.c
===================================================================
--- mmotm-2.6.29-Mar21.orig/fs/buffer.c
+++ mmotm-2.6.29-Mar21/fs/buffer.c
@@ -476,7 +476,7 @@ static void free_more_memory(void)
&zone);
if (zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
- GFP_NOFS);
+ GFP_NOFS, NULL);
}
}