2005-10-01 19:00:38

by Rohit Seth

[permalink] [raw]
Subject: [PATCH]: Clean up of __alloc_pages


[PATCH]: Below is the cleaning up of __alloc_pages code. Few
things different from original version are

1: remove the initial direct reclaim logic
2: order zero pages are now first looked into pcp list upfront
3: GFP_HIGH pages are allowed to go little below low watermark sooner
4: Search for free pages unconditionally after direct reclaim

Signed-off-by: Rohit Seth <[email protected]>


--- linux-2.6.14-rc2-mm1.org/mm/page_alloc.c 2005-09-27 10:03:51.000000000 -0700
+++ linux-2.6.14-rc2-mm1/mm/page_alloc.c 2005-10-01 10:40:06.000000000 -0700
@@ -722,7 +722,8 @@
* or two.
*/
static struct page *
-buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
+buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags,
+ int replenish)
{
unsigned long flags;
struct page *page = NULL;
@@ -733,7 +734,7 @@

pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
- if (pcp->count <= pcp->low)
+ if ((pcp->count <= pcp->low) && replenish)
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
if (pcp->count) {
@@ -743,9 +744,7 @@
}
local_irq_restore(flags);
put_cpu();
- }
-
- if (page == NULL) {
+ } else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -858,6 +857,44 @@
}
#endif /* CONFIG_PAGE_OWNER */

+/* This function get_page_from_freeliest loops through all the possible zones
+ * to find out if it can allocate a page. can_try_harder can have following
+ * values:
+ * -1 => No need to check for the watermarks.
+ * 0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
+ * 1 => Go far below the low watermark. See zone_watermark_ok (RT TASK)
+ */
+
+static inline struct page *
+get_page_from_freelist(unsigned int __nocast gfp_mask, unsigned int order,
+ struct zone **zones, int can_try_harder)
+{
+ struct zone *z;
+ struct page *page = NULL;
+ int classzone_idx = zone_idx(zones[0]);
+ int i;
+
+ /*
+ * Go through the zonelist once, looking for a zone with enough free.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ if (!cpuset_zone_allowed(z, gfp_mask))
+ continue;
+
+ if ((can_try_harder >= 0) &&
+ (!zone_watermark_ok(z, order, z->pages_low,
+ classzone_idx, can_try_harder,
+ gfp_mask & __GFP_HIGH)))
+ continue;
+
+ page = buffered_rmqueue(z, order, gfp_mask, 1);
+ if (page)
+ break;
+ }
+ return page;
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -866,15 +903,13 @@
struct zonelist *zonelist)
{
const int wait = gfp_mask & __GFP_WAIT;
- struct zone **zones, *z;
+ struct zone **zones, *z = NULL;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
int i;
- int classzone_idx;
int do_retry;
int can_try_harder;
- int did_some_progress;

might_sleep_if(wait);

@@ -892,45 +927,23 @@
return NULL;
}

- classzone_idx = zone_idx(zones[0]);
-
-restart:
- /*
- * Go through the zonelist once, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ /* First check if we can easily take a page from any pcp list
+ * for 0 order pages. Don't replenish any pcp list at this time.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- /*
- * If the zone is to attempt early page reclaim then this loop
- * will try to reclaim pages and check the watermark a second
- * time before giving up and falling back to the next zone.
- */
-zone_reclaim_retry:
- if (!zone_watermark_ok(z, order, z->pages_low,
- classzone_idx, 0, 0)) {
- if (!do_reclaim)
- continue;
- else {
- zone_reclaim(z, gfp_mask, order);
- /* Only try reclaim once */
- do_reclaim = 0;
- goto zone_reclaim_retry;
- }
+ if (order == 0) {
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ page = buffered_rmqueue(z, 0, gfp_mask, 0);
+ if (page)
+ goto got_pg;
}
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
}
+restart:
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones, 0);
+ if (page)
+ goto got_pg;

for (i = 0; (z = zones[i]) != NULL; i++)
wakeup_kswapd(z, order);
-
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
* coming from realtime tasks to go deeper into reserves
@@ -939,19 +952,12 @@
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;

- if (wait && !cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ if (!wait)
+ page = get_page_from_freelist(gfp_mask, order, zones,
+ can_try_harder);
+ if (page)
+ goto got_pg;

/* This allocation should allow future memory freeing. */

@@ -959,13 +965,9 @@
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
/* go through the zonelist yet again, ignoring mins */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order, zones, -1);
+ if (page)
+ goto got_pg;
}
goto nopage;
}
@@ -982,47 +984,21 @@
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- did_some_progress = try_to_free_pages(zones, gfp_mask);
+ i = try_to_free_pages(zones, gfp_mask);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;

cond_resched();

- if (likely(did_some_progress)) {
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
- } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+ page = get_page_from_freelist(gfp_mask, order, zones,
+ can_try_harder);
+ if (page)
+ goto got_pg;
+ if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
/*
- * Go through the zonelist yet one more time, keep
- * very high watermark here, this is only to catch
- * a parallel oom killing, we must fail if we're still
- * under heavy pressure.
+ * Start the OOM for this scenario.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_high,
- classzone_idx, 0, 0))
- continue;
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
-
out_of_memory(gfp_mask, order);
goto restart;
}
@@ -1060,7 +1036,7 @@
#ifdef CONFIG_PAGE_OWNER
set_page_owner(page, order, gfp_mask);
#endif
- zone_statistics(zonelist, z);
+ zone_statistics(zonelist, page_zone(page));
return page;
}


2005-10-02 03:08:36

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

Seth, Rohit wrote:
> [PATCH]: Below is the cleaning up of __alloc_pages code. Few
> things different from original version are
>
> 1: remove the initial direct reclaim logic
> 2: order zero pages are now first looked into pcp list upfront
> 3: GFP_HIGH pages are allowed to go little below low watermark sooner
> 4: Search for free pages unconditionally after direct reclaim
>
> Signed-off-by: Rohit Seth <[email protected]>
>

Hi,

Seems pretty good at a quick glance.

Perhaps splitting it into 2 would be a good idea - ie. first
patch does the cleanup, second does the direct pcp list alloc.

Regarding the direct pcp list allocation - I think it is a good
idea, because we're currently already accounting pcp list pages
as being 'allocated' for the purposes of the reclaim watermarks.

Also, the structure is there to avoid touching cachelines whenever
possible so it makes sense to use it early here. Do you have any
performance numbers or allocation statistics (e.g. %pcp hits) to
show?

Also, I would really think about uninlining get_page_from_freelist,
and inlining buffered_rmqueue, so that the constant 'replenish'
argument can be propagated into buffered_rmqueue and should allow
for some nice optimisations. While not bloating the code too much
because your get_page_from_freelist becomes out of line.

Thanks,
Nick

--
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com

2005-10-03 15:34:16

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

On Sat, 1 Oct 2005, Seth, Rohit wrote:

> - goto zone_reclaim_retry;
> - }
> + if (order == 0) {
> + for (i = 0; (z = zones[i]) != NULL; i++) {
> + page = buffered_rmqueue(z, 0, gfp_mask, 0);
> + if (page)
> + goto got_pg;
> }
> -

This is checking all zones for pages on the pcp before going the more
expensive route?

Seems that this removes the logic intended to prefer local
allocations over remote pages present in the existing alloc_pages? There
is the danger that this modification will lead to the allocation of remote
pages even if local pages are available. Thus reducing performance.

I would suggest to just check the first zone's pcp instead of all zones.

2005-10-03 16:42:53

by Rohit Seth

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

On Sun, 2005-10-02 at 13:09 +1000, Nick Piggin wrote:

> Perhaps splitting it into 2 would be a good idea - ie. first
> patch does the cleanup, second does the direct pcp list alloc.
>
> Regarding the direct pcp list allocation - I think it is a good
> idea, because we're currently already accounting pcp list pages
> as being 'allocated' for the purposes of the reclaim watermarks.
>

Right.

> Also, the structure is there to avoid touching cachelines whenever
> possible so it makes sense to use it early here. Do you have any
> performance numbers or allocation statistics (e.g. %pcp hits) to
> show?
>

No, I don't have any data at this point to share.

> Also, I would really think about uninlining get_page_from_freelist,
> and inlining buffered_rmqueue, so that the constant 'replenish'
> argument can be propagated into buffered_rmqueue and should allow
> for some nice optimisations. While not bloating the code too much
> because your get_page_from_freelist becomes out of line.

I will do that.

Thanks for your feedback,
-rohit

2005-10-03 16:48:54

by Rohit Seth

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

On Mon, 2005-10-03 at 08:34 -0700, Christoph Lameter wrote:
> On Sat, 1 Oct 2005, Seth, Rohit wrote:
>
> > - goto zone_reclaim_retry;
> > - }
> > + if (order == 0) {
> > + for (i = 0; (z = zones[i]) != NULL; i++) {
> > + page = buffered_rmqueue(z, 0, gfp_mask, 0);
> > + if (page)
> > + goto got_pg;
> > }
> > -
>
> This is checking all zones for pages on the pcp before going the more
> expensive route?
>

That is right.

> Seems that this removes the logic intended to prefer local
> allocations over remote pages present in the existing alloc_pages? There
> is the danger that this modification will lead to the allocation of remote
> pages even if local pages are available. Thus reducing performance.
>

Good catch. I will up level the cpuset check in buffered_rmqueue rather
than doing it in get_page_from_freelist. That should retain the current
preferences for local pages.

> I would suggest to just check the first zone's pcp instead of all zones.
>

Na. This for most cases will be ZONE_DMA pcp list having nothing much
most of the time. And picking any other zone randomly will be exposed
to faulty behavior.

Thanks,
-rohit

2005-10-03 16:58:04

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

On Mon, 3 Oct 2005, Rohit Seth wrote:

> > Seems that this removes the logic intended to prefer local
> > allocations over remote pages present in the existing alloc_pages? There
> > is the danger that this modification will lead to the allocation of remote
> > pages even if local pages are available. Thus reducing performance.
> Good catch. I will up level the cpuset check in buffered_rmqueue rather
> then doing it in get_page_from_freelist. That should retain the current
> preferences for local pages.

This is not only the cpuset check. If there is memory available in an
earlier zone then it needs to be taken regardless of later pcp's in
the zonelist containing pages. Otherwise we did not take the pages nearest
to the requested node.

> > I would suggest to just check the first zone's pcp instead of all zones.
> >
>
> Na. This for most cases will be ZONE_DMA pcp list having nothing much
> most of the time. And picking any other zone randomly will be exposed
> to faulty behavior.

Maybe only check the first node?

2005-10-03 17:42:05

by Rohit Seth

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

On Mon, 2005-10-03 at 09:57 -0700, Christoph Lameter wrote:
> On Mon, 3 Oct 2005, Rohit Seth wrote:
>
> > > Seems that this removes the logic intended to prefer local
> > > allocations over remote pages present in the existing alloc_pages? There
> > > is the danger that this modification will lead to the allocation of remote
> > > pages even if local pages are available. Thus reducing performance.
> > Good catch. I will up level the cpuset check in buffered_rmqueue rather
> > then doing it in get_page_from_freelist. That should retain the current
> > preferences for local pages.
>
> This is not only the cpuset check. If there is memory available in an
> earlier zone then it needs to be taken regardless of later pcp's
> the zonelist containing pages. Otherwise we did not take the pages nearest
> to the requested node.
>

Ah. Okay.

> > > I would suggest to just check the first zone's pcp instead of all zones.
> > >
> >
> > Na. This for most cases will be ZONE_DMA pcp list having nothing much
> > most of the time. And picking any other zone randomly will be exposed
> > to faulty behavior.
>
> Maybe only check the first node?
>

I think conceptually this asks for a new flag __GFP_NODEONLY that
indicate allocations to come from current node only.

This definitely though means I will need to separate out the allocation
from pcp patch (as Nick suggested earlier).

-rohit

2005-10-04 13:27:17

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

Rohit Seth <[email protected]> writes:
>
> I think conceptually this ask for a new flag __GFP_NODEONLY that
> indicate allocations to come from current node only.
>
> This definitely though means I will need to separate out the allocation
> from pcp patch (as Nick suggested earlier).

This reminds me - the current logic is currently a bit suboptimal on
many NUMA systems. Often it would be better to be a bit more
aggressive at freeing memory (maybe do a very low overhead light try to
free pages) in the first node before falling back to other nodes. What
right now happens is that when you have even minor memory pressure
because e.g. your node is filled up with disk cache the local memory
affinity doesn't work too well anymore.

-Andi

2005-10-04 16:05:14

by Ray Bryant

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

On Tuesday 04 October 2005 08:27, Andi Kleen wrote:
> Rohit Seth <[email protected]> writes:
> > I think conceptually this ask for a new flag __GFP_NODEONLY that
> > indicate allocations to come from current node only.
> >
> > This definitely though means I will need to separate out the allocation
> > from pcp patch (as Nick suggested earlier).
>
> This reminds me - the current logic is currently a bit suboptimal on
> many NUMA systems. Often it would be better to be a bit more
> aggressive at freeing memory (maybe do a very low overhead light try to
> free pages) in the first node before falling back to other nodes. What
> right now happens is that when you have even minor memory pressure
> because e.g. you node is filled up with disk cache the local memory
> affinity doesn't work too well anymore.
>
> -Andi
>
That's exactly what Martin Hick's additions to __alloc_pages() were trying to
achieve. However, we've never figured out how to make the "very low
overhead light try to free pages" thing work with low enough overhead that it
can be left on all of the time. As soon as we make this the least bit more
expensive, then this hurts those workloads (file servers being one example)
who don't care about local, but who need the fastest possible allocations.

This problem is often a showstopper on larger NUMA systems, at least for HPC
type applications, where the inability to guarantee local storage allocation
when it is requested can make the application run significantly slower.
--
Ray Bryant
AMD Performance Labs Austin, Tx
512-602-0038 (o) 512-507-7807 (c)

2005-10-04 16:10:48

by Martin Bligh

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages



--Ray Bryant <[email protected]> wrote (on Tuesday, October 04, 2005 11:26:52 -0500):

> On Tuesday 04 October 2005 08:27, Andi Kleen wrote:
>> Rohit Seth <[email protected]> writes:
>> > I think conceptually this ask for a new flag __GFP_NODEONLY that
>> > indicate allocations to come from current node only.
>> >
>> > This definitely though means I will need to separate out the allocation
>> > from pcp patch (as Nick suggested earlier).
>>
>> This reminds me - the current logic is currently a bit suboptimal on
>> many NUMA systems. Often it would be better to be a bit more
>> aggressive at freeing memory (maybe do a very low overhead light try to
>> free pages) in the first node before falling back to other nodes. What
>> right now happens is that when you have even minor memory pressure
>> because e.g. you node is filled up with disk cache the local memory
>> affinity doesn't work too well anymore.
>>
>> -Andi
>>
> That's exactly what Martin Hick's additions to __alloc_pages() were trying to
> achieve. However, we've never figured out how to make the "very low
> overhead light try to free pages" thing work with low enough overhead that it
> can be left on all of the time. As soon as we make this the least bit more
> expensive, then this hurts those workloads (file servers being one example)
> who don't care about local, but who need the fastest possible allocations.
>
> This problem is often a showstopper on larger NUMA systems, at least for HPC
> type applications, where the inability to guarantee local storage allocation
> when it is requested can make the application run significantly slower.

Can we not do some migration / more targeted pressure balancing in kswapd?
Ie if we had a constant measure of per-node pressure, we could notice an
imbalance, and start migrating the least recently used pages from the node
under most pressure to the node under least ...

M.

2005-10-04 16:40:51

by Ray Bryant

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

On Tuesday 04 October 2005 11:10, Martin J. Bligh wrote:
> --Ray Bryant <[email protected]> wrote (on Tuesday, October 04, 2005
>
<snip>

> >
> > That's exactly what Martin Hick's additions to __alloc_pages() were
> > trying to achieve. However, we've never figured out how to make the
> > "very low overhead light try to free pages" thing work with low enough
> > overhead that it can be left on all of the time. As soon as we make
> > this the least bit more expensive, then this hurts those workloads (file
> > servers being one example) who don't care about local, but who need the
> > fastest possible allocations.
> >
> > This problem is often a showstopper on larger NUMA systems, at least for
> > HPC type applications, where the inability to guarantee local storage
> > allocation when it is requested can make the application run
> > significantly slower.
>
> Can we not do some migration / more targeted pressure balancing in kswapd?
> Ie if we had a constant measure of per-node pressure, we could notice an
> imbalance, and start migrating the least recently used pages from the node
> under most pressure to the node under least ...
>
> M.
>

Unfortunately, I think that anything that doesn't happen synchronously (with
the original allocation request) happens too late. In order to make this
work we'd have to not only migrate the excess page cache pages off of the
loaded nodes, but find the supposed-to-be-local pages that got allocated off
node and migrate them back. I'm not sure how to do all of that. :-)

(Another way to think about this is that if we truly balanced page cache
allocations across nodes via migration, then we could have equally as well
originally allocated the page cache pages round-robin across the nodes and
gotten the balancing for free.)

Another idea that I have been kicking around for some time is the notion of a
MPOL_LOCAL memory policy. MPOL_LOCAL would be kind of like a dynamic
MPOL_BIND in the sense that the policy would allow allocation only on a
single node, but that single node would be dynamically set to the current
node (rather than statically set ala MPOL_BIND). The result of such an
allocation policy would be that if a local allocation can't be made, then the
current task will end up in synchronous page reclaim until some memory
becomes free. That would force clean page cache pages to be discarded.
(This idea is similar, in spirit, to Rohit's suggestion for __GFP_NODEONLY...)

The intent of this is for an otherwise mempolicy unaware application to be run
under numactl and MPOL_LOCAL if it requires local storage allocations.
File server applications (or other applications that demand the fastest
storage allocation without regard to local) would chose not to be run in such
a fashion.

The down side of this is that a task that requests more memory than is
available on a node (and is running under MPOL_LOCAL) could invoke the OOM
handler. We'd have to special case this under the OOM handler, I would
imagine, although a similar problem may exist with MPOL_BIND at the present
time, anyway.


> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"[email protected]"> [email protected] </a>

--
Ray Bryant
AMD Performance Labs Austin, Tx
512-602-0038 (o) 512-507-7807 (c)