2005-10-29 01:33:43

by Rohit Seth

Subject: [PATCH]: Clean up of __alloc_pages

[PATCH]: Below is the cleanup of __alloc_pages code. As discussed earlier,
the only changes in this clean up are:

1- remove the initial direct reclaim logic
2- GFP_HIGH pages are allowed to go a little below the low watermark sooner
3- Search for free pages unconditionally after direct reclaim

I've not added the logic of looking into PCPs first in this rev of the patch. I will send a
separate patch for adding that support (needing extra logic for NUMA).

Signed-off-by: Rohit Seth <[email protected]>


diff -Naru linux-2.6.14.org/mm/page_alloc.c linux-2.6.14/mm/page_alloc.c
--- linux-2.6.14.org/mm/page_alloc.c 2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14/mm/page_alloc.c 2005-10-28 10:11:39.000000000 -0700
@@ -685,8 +685,8 @@
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static inline struct page *
+buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags, int replenish)
{
unsigned long flags;
struct page *page = NULL;
@@ -697,7 +697,7 @@

pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
- if (pcp->count <= pcp->low)
+ if ((pcp->count <= pcp->low) && replenish)
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
if (pcp->count) {
@@ -707,9 +707,7 @@
}
local_irq_restore(flags);
put_cpu();
- }
-
- if (page == NULL) {
+ } else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -770,6 +768,44 @@
return 1;
}

+/* get_page_from_freeliest loops through all the possible zones
+ * to find out if it can allocate a page. can_try_harder can have following
+ * values:
+ * -1 => No need to check for the watermarks.
+ * 0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
+ * 1 => Go far below the low watermark. See zone_watermark_ok (RT TASK)
+ */
+
+static struct page *
+get_page_from_freelist(unsigned int __nocast gfp_mask, unsigned int order,
+ struct zone **zones, int can_try_harder)
+{
+ struct zone *z;
+ struct page *page = NULL;
+ int classzone_idx = zone_idx(zones[0]);
+ int i;
+
+ /*
+ * Go through the zonelist once, looking for a zone with enough free.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ if (!cpuset_zone_allowed(z, gfp_mask))
+ continue;
+
+ if ((can_try_harder >= 0) &&
+ (!zone_watermark_ok(z, order, z->pages_low,
+ classzone_idx, can_try_harder,
+ gfp_mask & __GFP_HIGH)))
+ continue;
+
+ page = buffered_rmqueue(z, order, gfp_mask, 1);
+ if (page)
+ break;
+ }
+ return page;
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -778,15 +814,13 @@
struct zonelist *zonelist)
{
const int wait = gfp_mask & __GFP_WAIT;
- struct zone **zones, *z;
+ struct zone **zones, *z = NULL;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
int i;
- int classzone_idx;
int do_retry;
int can_try_harder;
- int did_some_progress;

might_sleep_if(wait);

@@ -803,42 +837,10 @@
/* Should this ever happen?? */
return NULL;
}
-
- classzone_idx = zone_idx(zones[0]);
-
restart:
- /*
- * Go through the zonelist once, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- /*
- * If the zone is to attempt early page reclaim then this loop
- * will try to reclaim pages and check the watermark a second
- * time before giving up and falling back to the next zone.
- */
-zone_reclaim_retry:
- if (!zone_watermark_ok(z, order, z->pages_low,
- classzone_idx, 0, 0)) {
- if (!do_reclaim)
- continue;
- else {
- zone_reclaim(z, gfp_mask, order);
- /* Only try reclaim once */
- do_reclaim = 0;
- goto zone_reclaim_retry;
- }
- }
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones, 0);
+ if (page)
+ goto got_pg;

for (i = 0; (z = zones[i]) != NULL; i++)
wakeup_kswapd(z, order);
@@ -851,19 +853,11 @@
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (wait && !cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ if (!wait)
+ page = get_page_from_freelist(gfp_mask, order, zones,
+ can_try_harder);
+ if (page)
+ goto got_pg;

/* This allocation should allow future memory freeing. */

@@ -871,13 +865,9 @@
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
/* go through the zonelist yet again, ignoring mins */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order, zones, -1);
+ if (page)
+ goto got_pg;
}
goto nopage;
}
@@ -894,47 +884,20 @@
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- did_some_progress = try_to_free_pages(zones, gfp_mask);
+ i = try_to_free_pages(zones, gfp_mask);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;

cond_resched();

- if (likely(did_some_progress)) {
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
- } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+ page = get_page_from_freelist(gfp_mask, order, zones, can_try_harder);
+ if (page)
+ goto got_pg;
+ if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
/*
- * Go through the zonelist yet one more time, keep
- * very high watermark here, this is only to catch
- * a parallel oom killing, we must fail if we're still
- * under heavy pressure.
+ * Start the OOM here.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_high,
- classzone_idx, 0, 0))
- continue;
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
-
out_of_memory(gfp_mask, order);
goto restart;
}
@@ -968,7 +931,7 @@
}
return NULL;
got_pg:
- zone_statistics(zonelist, z);
+ zone_statistics(zonelist, page_zone(page));
return page;
}


2005-10-29 02:32:11

by Nick Piggin

Subject: Re: [PATCH]: Clean up of __alloc_pages

Rohit Seth wrote:
> the only changes in this clean up are:
>

Looking good. I imagine it must be good for icache.
Man, the page allocator somehow turned unreadable since I last
looked at it! We will want this patch.

> 1- remove the initial direct reclaim logic
> 2- GFP_HIGH pages are allowed to go a little below the low watermark sooner

I don't think #2 is any good. The reason we don't check GFP_HIGH on
the first time round is because we simply want to kick kswapd at its
normal watermark - ie. it doesn't matter what kind of allocation this
is, kswapd should start at the same time no matter what.

If you don't do this, then a GFP_HIGH allocator can allocate right
down to its limit before it kicks kswapd, then it either will fail or
will have to do direct reclaim.

I would be inclined to simply add an int gfp_high argument to
get_page_from_freelist, which would also somewhat match zone_watermark_ok.
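
A sketch of the signature that suggestion implies (illustrative only, not a
posted patch; the body just mirrors the loop from Seth's patch above, with
gfp_high supplied by the caller):

	static struct page *
	get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
			struct zone **zones, int can_try_harder, int gfp_high)
	{
		struct zone *z;
		struct page *page = NULL;
		int classzone_idx = zone_idx(zones[0]);
		int i;

		for (i = 0; (z = zones[i]) != NULL; i++) {
			if (!cpuset_zone_allowed(z, gfp_mask))
				continue;
			/* caller decides gfp_high instead of deriving it from gfp_mask */
			if (can_try_harder >= 0 &&
			    !zone_watermark_ok(z, order, z->pages_low,
					classzone_idx, can_try_harder, gfp_high))
				continue;
			page = buffered_rmqueue(z, order, gfp_mask, 1);
			if (page)
				break;
		}
		return page;
	}

The first pass in __alloc_pages() would then pass gfp_high == 0, so kswapd is
still kicked at the normal watermark regardless of __GFP_HIGH.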

> 3- Search for free pages unconditionally after direct reclaim
>
> I've not added the logic of looking into PCPs first in this rev of the patch. I will send a
> separate patch for adding that support (needing extra logic for NUMA).
>
> Signed-off-by: Rohit Seth <[email protected]>
>

One other comment below:

> +
> +static struct page *
> +get_page_from_freelist(unsigned int __nocast gfp_mask, unsigned int order,
> + struct zone **zones, int can_try_harder)
> +{
> + struct zone *z;
> + struct page *page = NULL;
> + int classzone_idx = zone_idx(zones[0]);
> + int i;
> +
> + /*
> + * Go through the zonelist once, looking for a zone with enough free.
> + * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
> + */
> + for (i = 0; (z = zones[i]) != NULL; i++) {
> + if (!cpuset_zone_allowed(z, gfp_mask))
> + continue;
> +
> + if ((can_try_harder >= 0) &&
> + (!zone_watermark_ok(z, order, z->pages_low,
> + classzone_idx, can_try_harder,
> + gfp_mask & __GFP_HIGH)))
> + continue;
> +
> + page = buffered_rmqueue(z, order, gfp_mask, 1);
> + if (page)
> + break;
> + }
> + return page;
> +}

[snip]

> @@ -968,7 +931,7 @@
> }
> return NULL;
> got_pg:
> - zone_statistics(zonelist, z);
> + zone_statistics(zonelist, page_zone(page));
> return page;

How about moving the zone_statistics up into the 'if (page)'
test of get_page_from_freelist? This way we don't have to
evaluate page_zone().
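
(Illustratively, the allocation inside get_page_from_freelist()'s loop would
become something like the sketch below; note that, as comes up later in the
thread, the function is not currently passed the zonelist this needs.)

	page = buffered_rmqueue(z, order, gfp_mask, 1);
	if (page) {
		zone_statistics(zonelist, z);	/* z is at hand, no page_zone(page) lookup */
		break;
	}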

--
SUSE Labs, Novell Inc.


2005-10-30 00:16:46

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

Seth wrote:
> @@ -851,19 +853,11 @@
> * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
> * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
> */
> - for (i = 0; (z = zones[i]) != NULL; i++) {
> - if (!zone_watermark_ok(z, order, z->pages_min,
> - classzone_idx, can_try_harder,
> - gfp_mask & __GFP_HIGH))
> - continue;
> -
> - if (wait && !cpuset_zone_allowed(z, gfp_mask))
> - continue;
> -
> - page = buffered_rmqueue(z, order, gfp_mask);
> - if (page)
> - goto got_pg;
> - }
> + if (!wait)
> + page = get_page_from_freelist(gfp_mask, order, zones,
> + can_try_harder);

Thanks for the clean-up work. Good stuff.

I think you've changed the effect that the cpuset check has on the
above pass.

As you know, the above is the last chance we have for GFP_ATOMIC (can't
wait) allocations before getting into the oom_kill code. The code had
been written to ignore cpuset constraints for GFP_ATOMIC (that is,
"!wait") allocations. The intent is to allow taking GFP_ATOMIC memory
from any damn node we can find it on, rather than start killing.

Your change will call into get_page_from_freelist() in such cases,
where the cpuset check is still done.

I would be tempted instead to:
1) pass a 'can_try_harder' value of -1, instead of the local value
of 1 (which it certainly is, since we are in !wait code).
2) condition the cpuset check in get_page_from_freelist() on
can_try_harder being not equal to -1.

Item (2) -does- change the existing cpuset conditions as well,
allowing cpuset boundaries to be violated for the cases that would
"allow future memory freeing" (such as GFP_MEMALLOC or TIF_MEMDIE),
whereas until now, we did not allow violating cpuset conditions
for this. But that is arguably a good change.

The following patch, on top of yours, shows what I have in mind here:

--- 2.6.14-rc5-mm1.orig/mm/page_alloc.c 2005-10-29 14:45:07.000000000 -0700
+++ 2.6.14-rc5-mm1/mm/page_alloc.c 2005-10-29 16:35:55.000000000 -0700
@@ -777,7 +777,7 @@ get_page_from_freelist(unsigned int __no
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!cpuset_zone_allowed(z, gfp_mask))
+ if (can_try_harder != -1 && !cpuset_zone_allowed(z, gfp_mask))
continue;

if ((can_try_harder >= 0) &&
@@ -940,8 +940,7 @@ restart:
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
if (!wait)
- page = get_page_from_freelist(gfp_mask, order, zones,
- can_try_harder);
+ page = get_page_from_freelist(gfp_mask, order, zones, -1);
if (page)
goto got_pg;


However ...
1) The above would also change __GFP_HIGH and rt allocations to
ignore mins entirely, instead of just going deeper into reserves,
on this pass. That is likely not good.
2) I can't get my head wrapped around Nick's reply to this patch.

So my above patch is no doubt flawed in one or more ways.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-10-30 01:47:43

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

A couple more items:
1) Let's try for a consistent use of type "gfp_t" for gfp_mask.
2) The can_try_harder flag values were driving me nuts.
3) The "inline" you added to buffered_rmqueue() blew up my compile.
4) The return from try_to_free_pages() was put in "i" for no evident reason.
5) I have no clue what the replenish flag you added to buffered_rmqueue does.

Your patch has:
> can_try_harder can have following
> * values:
> * -1 => No need to check for the watermarks.
> * 0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
> * 1 => Go far below the low watermark. See zone_watermark_ok (RT TASK)

Later on, you have an inequality test on this value:
if ((can_try_harder >= 0)
and a non-zero test:
if (can_try_harder)

That's three magic values, not even in increasing order of "how hard
one should try", tested a couple of different ways that requires
absorbing the complete details of the three values and their ordering
before one can read the code.

I'd like to use an enum type, to put names to these 3 values, and I'd
like to put them in order of increasing desperation.

The following patch tweaks the gfp_mask type, makes can_try_harder
an enum, removes the inline from buffered_rmqueue(), doesn't stash
the return value of try_to_free_pages(), and removes buffered_rmqueue's
replenish flag. It applies on top of your patch. If you like it, I'd
be happiest if you picked it up and incorporated it into your work.

I compiled it once, never tested or booted.

Signed-off-by: Paul Jackson <[email protected]>

---

include/linux/mmzone.h | 9 ++++++++-
mm/page_alloc.c | 39 +++++++++++++++++++--------------------
mm/vmscan.c | 9 ++++++---
3 files changed, 33 insertions(+), 24 deletions(-)

--- 2.6.14-rc5-mm1-cpuset-patches.orig/include/linux/mmzone.h 2005-10-25 09:37:45.154588634 -0700
+++ 2.6.14-rc5-mm1-cpuset-patches/include/linux/mmzone.h 2005-10-29 17:58:28.433716118 -0700
@@ -330,8 +330,15 @@ void get_zone_counts(unsigned long *acti
unsigned long *free);
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone, int order);
+
+typedef enum {
+ dont_try_harder, /* Don't go much below low watermark (GFP_HIGH) */
+ try_harder, /* Go far below low watermark (RT TASK) */
+ try_really_hard /* Ignore all watermarks */
+} can_try_harder_t;
+
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int alloc_type, int can_try_harder, int gfp_high);
+ int alloc_type, can_try_harder_t can_try_harder, int gfp_high);

#ifdef CONFIG_HAVE_MEMORY_PRESENT
void memory_present(int nid, unsigned long start, unsigned long end);
--- 2.6.14-rc5-mm1-cpuset-patches.orig/mm/page_alloc.c 2005-10-29 14:44:55.371576062 -0700
+++ 2.6.14-rc5-mm1-cpuset-patches/mm/page_alloc.c 2005-10-29 18:40:41.358953388 -0700
@@ -713,8 +713,8 @@ static inline void prep_zero_page(struct
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-struct inline page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags, int replenish)
+struct page *
+buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
{
unsigned long flags;
struct page *page = NULL;
@@ -725,7 +725,7 @@ buffered_rmqueue(struct zone *zone, int

pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
local_irq_save(flags);
- if ((pcp->count <= pcp->low) && replenish)
+ if (pcp->count <= pcp->low)
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
if (pcp->count) {
@@ -756,16 +756,12 @@ buffered_rmqueue(struct zone *zone, int
}

/* get_page_from_freeliest loops through all the possible zones
- * to find out if it can allocate a page. can_try_harder can have following
- * values:
- * -1 => No need to check for the watermarks.
- * 0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
- * 1 => Go far below the low watermark. See zone_watermark_ok (RT TASK)
+ * to find out if it can allocate a page.
*/

static struct page *
-get_page_from_freelist(unsigned int __nocast gfp_mask, unsigned int order,
- struct zone **zones, int can_try_harder)
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+ struct zone **zones, can_try_harder_t can_try_harder)
{
struct zone *z;
struct page *page = NULL;
@@ -780,13 +776,13 @@ get_page_from_freelist(unsigned int __no
if (!cpuset_zone_allowed(z, gfp_mask))
continue;

- if ((can_try_harder >= 0) &&
+ if ((can_try_harder < try_really_hard) &&
(!zone_watermark_ok(z, order, z->pages_low,
classzone_idx, can_try_harder,
gfp_mask & __GFP_HIGH)))
continue;

- page = buffered_rmqueue(z, order, gfp_mask, 1);
+ page = buffered_rmqueue(z, order, gfp_mask);
if (page)
break;
}
@@ -798,7 +794,7 @@ get_page_from_freelist(unsigned int __no
* of the allocation.
*/
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int can_try_harder, int gfp_high)
+ int classzone_idx, can_try_harder_t can_try_harder, int gfp_high)
{
/* free_pages my go negative - that's OK */
long min = mark, free_pages = z->free_pages - (1 << order) + 1;
@@ -806,7 +802,7 @@ int zone_watermark_ok(struct zone *z, in

if (gfp_high)
min -= min / 2;
- if (can_try_harder)
+ if (can_try_harder >= try_harder)
min -= min / 4;

if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -878,7 +874,7 @@ static inline void __stack_trace(struct
}

static inline void set_page_owner(struct page *page,
- unsigned int order, unsigned int gfp_mask)
+ unsigned int order, gfp_t gfp_mask)
{
unsigned long address, bp;
#ifdef CONFIG_X86_64
@@ -906,7 +902,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
struct task_struct *p = current;
int i;
int do_retry;
- int can_try_harder;
+ can_try_harder_t can_try_harder = dont_try_harder;

might_sleep_if(wait);

@@ -915,7 +911,8 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
* cannot run direct reclaim, or is the caller has realtime scheduling
* policy
*/
- can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
+ if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+ can_try_harder = try_harder;

zones = zonelist->zones; /* the list of zones suitable for gfp_mask */

@@ -924,7 +921,8 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
return NULL;
}
restart:
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones, 0);
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones,
+ dont_try_harder);
if (page)
goto got_pg;

@@ -951,7 +949,8 @@ restart:
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
/* go through the zonelist yet again, ignoring mins */
- page = get_page_from_freelist(gfp_mask, order, zones, -1);
+ page = get_page_from_freelist(gfp_mask, order, zones,
+ try_really_hard);
if (page)
goto got_pg;
}
@@ -970,7 +969,7 @@ rebalance:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- i = try_to_free_pages(zones, gfp_mask);
+ try_to_free_pages(zones, gfp_mask);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
--- 2.6.14-rc5-mm1-cpuset-patches.orig/mm/vmscan.c 2005-10-25 09:38:47.225573361 -0700
+++ 2.6.14-rc5-mm1-cpuset-patches/mm/vmscan.c 2005-10-29 17:21:39.268886029 -0700
@@ -1082,7 +1082,8 @@ loop_again:
continue;

if (!zone_watermark_ok(zone, order,
- zone->pages_high, 0, 0, 0)) {
+ zone->pages_high, 0, 0,
+ dont_try_harder)) {
end_zone = i;
goto scan;
}
@@ -1119,7 +1120,8 @@ scan:

if (nr_pages == 0) { /* Not software suspend */
if (!zone_watermark_ok(zone, order,
- zone->pages_high, end_zone, 0, 0))
+ zone->pages_high, end_zone,
+ 0, dont_try_harder))
all_zones_ok = 0;
}
zone->temp_priority = priority;
@@ -1267,7 +1269,8 @@ void wakeup_kswapd(struct zone *zone, in
return;

pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0))
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0,
+ dont_try_harder))
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;


--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-10-30 02:00:12

by Nick Piggin

Subject: Re: [PATCH]: Clean up of __alloc_pages

Paul Jackson wrote:
> A couple more items:
> 1) Let's try for a consistent use of type "gfp_t" for gfp_mask.
> 2) The can_try_harder flag values were driving me nuts.

Please instead use a second argument 'gfp_high', which will nicely
match zone_watermark_ok, and use that consistently when converting
__alloc_pages code to use get_page_from_freelist. Ie. keep current
behaviour.

That would solve my issues with the patch.

> 3) The "inline" you added to buffered_rmqueue() blew up my compile.

How? Why? This should be solved because a future possible feature
(early allocation from pcp lists) will want inlining in order to
propagate the constant 'replenish' argument.
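
(The point is that with a constant argument and an inlined - or static,
single-caller - function, the compiler can drop the dead branch. A rough
illustration; the second call site is hypothetical, sketching the future
pcp-first lookup:)

	/* inside buffered_rmqueue(), from the posted patch: */
	if ((pcp->count <= pcp->low) && replenish)
		pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list);

	/* callers pass compile-time constants, so the test can fold away: */
	page = buffered_rmqueue(z, order, gfp_mask, 1);	/* normal path: refill pcp list */
	page = buffered_rmqueue(z, order, gfp_mask, 0);	/* hypothetical pcp-only probe */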

> 4) The return from try_to_free_pages() was put in "i" for no evident reason.
> 5) I have no clue what the replenish flag you added to buffered_rmqueue does.
>

Slight patch mis-split I guess. For the cleanup patch, you're right,
this should be removed.

--
SUSE Labs, Novell Inc.


2005-10-30 02:20:05

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

Nick, replying to pj:
> > 3) The "inline" you added to buffered_rmqueue() blew up my compile.
>
> How? Why? This should be solved because a future possible feature
> (early allocation from pcp lists) will want inlining in order to
> propagate the constant 'replenish' argument.

If I make the following change to a copy of mm/page_alloc.c:

=================================
--- 2.6.14-rc5-mm1-cpuset-patches.orig/mm/page_alloc.c 2005-10-29 19:04:13.745641793 -0700
+++ 2.6.14-rc5-mm1-cpuset-patches/mm/page_alloc.c 2005-10-29 19:04:03.085367810 -0700
@@ -713,7 +713,7 @@ static inline void prep_zero_page(struct
* we cheat by calling it from here, in the order > 0 path. Saves a branch
* or two.
*/
-struct page *
+struct inline page *
buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
{
unsigned long flags;
=================================

Then it goes from compiling OK to failing with 61 lines of
error output, beginning with:

mm/page_alloc.c:716: error: parse error before "inline"
mm/page_alloc.c:721: error: `gfp_flags' undeclared here (not in a function)
mm/page_alloc.c:723: error: parse error before "if"
mm/page_alloc.c:726: warning: type defaults to `int' in declaration of `pcp'
mm/page_alloc.c:726: error: `zone' undeclared here (not in a function)
mm/page_alloc.c:726: error: braced-group within expression allowed only inside a function
mm/page_alloc.c:726: warning: data definition has no type or storage class
mm/page_alloc.c:726: warning: type defaults to `int' in declaration of `debug_smp_processor_id'
mm/page_alloc.c:726: warning: function declaration isn't a prototype
mm/page_alloc.c:726: error: conflicting types for `debug_smp_processor_id'
include/linux/smp.h:131: error: previous declaration of `debug_smp_processor_id'

This is gcc 3.3.3, compiling sn2_defconfig on and for an SN2.

Perhaps "inline struct page *" would work better than "struct inline page *" ?
... yes ... that fixes my compiler complaints.

Also ... buffered_rmqueue() is a rather large function to be inlining.
And if it is inlined, then are you expecting to also have an out of
line copy, for use by the call to it from mm/swap_prefetch.c
prefetch_get_page()?

Adding the 'inline' keyword increases my kernel text size by
1448 bytes, for the extra copy of this code used inline from
the call to it from mm/page_alloc.c:get_page_from_freelist().
Is that really worth it?

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-10-30 02:26:19

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

> > 2) The can_try_harder flag values were driving me nuts.
>
> Please instead use a second argument 'gfp_high', which will nicely
> match zone_watermark_ok, and use that consistently when converting
> __alloc_pages code to use get_page_from_freelist. Ie. keep current
> behaviour.

Well ... I still don't understand what you're suggesting, so I
guess I will have to wait for an actual patch incorporating it.

Are you also objecting to converting "can_try_harder" to an
enum, and getting the values in order of desperation? If so,
I don't see why you object.

And there is still the issue that I don't think cpuset constraints
should be applied in the last attempt before oom_killing for
GFP_ATOMIC requests.

I will await the next version of the patch, and see if it meets
my concerns. I am missing a couple too many clues to add more
at this point.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-10-30 02:30:56

by Nick Piggin

Subject: Re: [PATCH]: Clean up of __alloc_pages

Paul Jackson wrote:
> Nick, replying to pj:
>
>>> 3) The "inline" you added to buffered_rmqueue() blew up my compile.
>>
>>How? Why? This should be solved because a future possible feature
>>(early allocation from pcp lists) will want inlining in order to
>>propagate the constant 'replenish' argument.
>
>
>
> Perhaps "inline struct page *" would work better than "struct inline page *" ?
> ... yes ... that fixes my compiler complaints.
>

Ah, yep.

> Also ... buffered_rmqueue() is a rather large function to be inlining.

It is; however, there would only be 2 calls, and one, I think, would
also have a constant 0 for "order".

Though yeah, it may be better split another way. For this patch,
it shouldn't matter because it is static and will only have one
callsite so should be inlined anyway.

> And if it is inlined, then are you expecting to also have an out of
> line copy, for use by the call to it from mm/swap_prefetch.c
> prefetch_get_page()?
>

No, that shouldn't be there though.

> Adding the 'inline' keyword increases my kernel text size by
> 1448 bytes, for the extra copy of this code used inline from
> the call to it from mm/page_alloc.c:get_page_from_freelist().
> Is that really worth it?
>

Hmm, where is the other callsite? (sorry I don't have a copy
of -mm handy so I'm just looking at 2.6).

--
SUSE Labs, Novell Inc.


2005-10-30 02:34:46

by Nick Piggin

Subject: Re: [PATCH]: Clean up of __alloc_pages

Paul Jackson wrote:
>>> 2) The can_try_harder flag values were driving me nuts.
>>
>>Please instead use a second argument 'gfp_high', which will nicely
>>match zone_watermark_ok, and use that consistently when converting
>>__alloc_pages code to use get_page_from_freelist. Ie. keep current
>>behaviour.
>
>
> Well ... I still don't understand what you're suggesting, so I
> guess I will have to wait for an actual patch incorporating it.
>

See how can_try_harder and gfp_high are used currently. They
are simple boolean values and are easily derived from parameters
passed into __alloc_pages.
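
(As visible in the patch context above, both are computed up front in
__alloc_pages(), roughly:)

	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
	gfp_high = gfp_mask & __GFP_HIGH;	/* what gets passed to zone_watermark_ok() */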

> Are you also objecting to converting "can_try_harder" to an
> enum, and getting the values in order of desperation? If so,
> I don't see why you object.
>

Because then to get current behaviour you would have to add
branches to get the correct enum value.

> And there is still the issue that I don't think cpuset constraints
> should be applied in the last attempt before oom_killing for
> GFP_ATOMIC requests.
>

Sure.

--
SUSE Labs, Novell Inc.

2005-10-30 03:06:45

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

Nick, replying to pj:
> > And if it is inlined, then are you expecting to also have an out of
> > line copy, for use by the call to it from mm/swap_prefetch.c
> > prefetch_get_page()?
> >
>
> No, that shouldn't be there though.
>
> > Adding the 'inline' keyword increases my kernel text size by
> > 1448 bytes, for the extra copy of this code used inline from
> > the call to it from mm/page_alloc.c:get_page_from_freelist().
> > Is that really worth it?
> >
>
> Hmm, where is the other callsite?

The other callsite is mm/swap_prefetch.c:prefetch_get_page(), from Con
Kolivas's mm-implement-swap-prefetching.patch patch in *-mm, dated
about six days ago.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-10-30 03:09:29

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

Nick wrote:
> See how can_try_harder and gfp_high are used currently.

Ah - by "current" you meant in Linus's or Andrew's tree,
not as in Seth's current patch. Since they are booleans,
rather than tri-values, using an enum is overkill. Ok.

Now I'm one less clue short of understanding. Thanks.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-10-30 03:51:48

by Nick Piggin

Subject: Re: [PATCH]: Clean up of __alloc_pages

Paul Jackson wrote:
> Nick, replying to pj:
>

>>Hmm, where is the other callsite?
>
>
> The other callsite is mm/swap_prefetch.c:prefetch_get_page(), from Con
> Kolivas's mm-implement-swap-prefetching.patch patch in *-mm, dated
> about six days ago.
>

OK, I haven't looked at those patches really. I think some of that
stuff should go into page_alloc.c and I'd prefer to keep
buffered_rmqueue static.

But no matter for the cleanup patch at hand: let's leave the inline
off, and the compiler will do the right thing if it is static and
there is just a single call site (and I think newer gccs will do
function versioning if there are constant arguments).

--
SUSE Labs, Novell Inc.


2005-10-30 03:54:15

by Nick Piggin

Subject: Re: [PATCH]: Clean up of __alloc_pages

Paul Jackson wrote:
> Nick wrote:
>
>>See how can_try_harder and gfp_high are used currently.
>
>
> Ah - by "current" you meant in Linus's or Andrew's tree,
> not as in Seth's current patch. Since they are booleans,
> rather than tri-values, using an enum is overkill. Ok.
>

Yup.

> Now I'm one less clue short of understanding. Thanks.
>

I'll be more constructive next time round, and provide an
actual patch to address any of my remaining concerns after
this latest round of feedback.

--
SUSE Labs, Novell Inc.


2005-10-30 04:11:47

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

> I'll be more constructive next time round ...

That's no problem. If Seth can figure this out and
produce a good next round of patch, that's sufficient.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-10-31 19:02:45

by Rohit Seth

Subject: Re: [PATCH]: Clean up of __alloc_pages

On Sat, 2005-10-29 at 17:16 -0700, Paul Jackson wrote:
> Seth wrote:
> > @@ -851,19 +853,11 @@
> > * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
> > * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
> > */
> > - for (i = 0; (z = zones[i]) != NULL; i++) {
> > - if (!zone_watermark_ok(z, order, z->pages_min,
> > - classzone_idx, can_try_harder,
> > - gfp_mask & __GFP_HIGH))
> > - continue;
> > -
> > - if (wait && !cpuset_zone_allowed(z, gfp_mask))
> > - continue;
> > -
> > - page = buffered_rmqueue(z, order, gfp_mask);
> > - if (page)
> > - goto got_pg;
> > - }
> > + if (!wait)
> > + page = get_page_from_freelist(gfp_mask, order, zones,
> > + can_try_harder);
>
> Thanks for the clean-up work. Good stuff.
>
> I think you've changed the effect that the cpuset check has on the
> above pass.
>
> As you know, the above is the last chance we have for GFP_ATOMIC (can't
> wait) allocations before getting into the oom_kill code. The code had
> been written to ignore cpuset constraints for GFP_ATOMIC (that is,
> "!wait") allocations. The intent is to allow taking GFP_ATOMIC memory
> from any damn node we can find it on, rather than start killing.
>
> Your change will call into get_page_from_freelist() in such cases,
> where the cpuset check is still done.


Shooo....I will fix it.

-rohit

2005-10-31 20:48:13

by Rohit Seth

Subject: Re: [PATCH]: Clean up of __alloc_pages

On Sat, 2005-10-29 at 12:33 +1000, Nick Piggin wrote:

> Rohit Seth wrote:
> > the only changes in this clean up are:
> >
>
> Looking good. I imagine it must be good for icache.
> Man, the page allocator somehow turned unreadable since I last
> looked at it! We will want this patch.
>

Thanks for your comments.

> > 1- remove the initial direct reclaim logic
> > 2- GFP_HIGH pages are allowed to go a little below the low watermark sooner
>
> I don't think #2 is any good. The reason we don't check GFP_HIGH on
> the first time round is because we simply want to kick kswapd at its
> normal watermark - ie. it doesn't matter what kind of allocation this
> is, kswapd should start at the same time no matter what.
>
> If you don't do this, then a GFP_HIGH allocator can allocate right
> down to its limit before it kicks kswapd, then it either will fail or
> will have to do direct reclaim.
>

You are right: if there are only GFP_HIGH requests coming in, then the
allocation will go down to (min - min/2) before kicking in kswapd.
Though if the requester is not ready to wait, there is another good shot
at the allocation succeeding before we get into direct reclaim (and this is
happening based on the can_try_harder flag).
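
(To put numbers on it: with the posted patch, the first pass checks
zone_watermark_ok(z, order, z->pages_low, ..., 0, gfp_mask & __GFP_HIGH).
If pages_low were 1000 pages, a normal request starts failing that check -
and so wakes kswapd - once free pages drop to about 1000, while a __GFP_HIGH
request keeps passing it down to about 1000 - 1000/2 = 500 free pages before
kswapd is woken. That later wakeup is the behaviour change Nick is objecting
to.)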

> >
> > got_pg:
> > - zone_statistics(zonelist, z);
> > + zone_statistics(zonelist, page_zone(page));
> > return page;
>
> How about moving the zone_statistics up into the 'if (page)'
> test of get_page_from_freelist? This way we don't have to
> evaluate page_zone().
>

Let us keep this as is for now. Will revisit once the
pcp_prefer_allocation patches are in place.

Thanks,
-rohit

2005-10-31 21:14:00

by Rohit Seth

Subject: Re: [PATCH]: Clean up of __alloc_pages

On Sat, 2005-10-29 at 18:47 -0700, Paul Jackson wrote:
> A couple more items:
> 1) Let's try for a consistent use of type "gfp_t" for gfp_mask.
> 2) The can_try_harder flag values were driving me nuts.

Not sure why? Let me see if some new values could better articulate the
meaning. Currently, if the value is < 0, then we don't check the
watermarks. When we do check the watermarks, a value of 1 indicates
that it can go below the minimum value.

> 3) The "inline" you added to buffered_rmqueue() blew up my compile.

I will remove the inline based on your and Nick's emails. Though my patch
had the inline before the struct....

-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static inline struct page *
+buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags, int
replenish)

...so that shouldn't have caused any problem.

> 4) The return from try_to_free_pages() was put in "i" for no evident reason.

Will be fixed.

> 5) I have no clue what the replenish flag you added to buffered_rmqueue does.
>

A bit futuristic. Will need it when pcp allocations get checked first
(as Nick also mentioned). Will remove it for now.

> Your patch has:
> > can_try_harder can have following
> > * values:
> > * -1 => No need to check for the watermarks.
> > * 0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
> > * 1 => Go far below the low watermark. See zone_watermark_ok (RT TASK)
>
> Later on, you have an inequality test on this value:
> if ((can_try_harder >= 0)
> and a non-zero test:
> if (can_try_harder)

The last line is from zone_watermark_ok. The first check is in
get_page_from_freelist. There is no (can_try_harder) check for this
flag in that function.

>
> That's three magic values, not even in increasing order of "how hard
> one should try", tested a couple of different ways that requires
> absorbing the complete details of the three values and their ordering
> before one can read the code.
>



2005-10-31 21:28:30

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

Rohit wrote:
> Not sure why? Let me see if some new values could better articulate the
> meaning.

See also Nick's comments, before going too far. He was advocating
just using binary flags and adding a gfp_high flag, or something
like that.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-01 01:14:12

by Nick Piggin

Subject: Re: [PATCH]: Clean up of __alloc_pages

Rohit Seth wrote:
> On Sat, 2005-10-29 at 12:33 +1000, Nick Piggin wrote:

>>If you don't do this, then a GFP_HIGH allocator can allocate right
>>down to its limit before it kicks kswapd, then it either will fail or
>>will have to do direct reclaim.
>>
>
>
> You are right: if there are only GFP_HIGH requests coming in, then the
> allocation will go down to (min - min/2) before kicking in kswapd.
> Though if the requester is not ready to wait, there is another good shot
> at the allocation succeeding before we get into direct reclaim (and this is
> happening based on the can_try_harder flag).
>

Still, it is a change in behaviour that I would rather not introduce
with a cleanup patch (and is something we don't want to introduce anyway).

So if you could fix that up it would be good.

>>How about moving the zone_statistics up into the 'if (page)'
>>test of get_page_from_freelist? This way we don't have to
>>evaluate page_zone().
>>
>
>
> Let us keep this as is for now. Will revisit once the
> pcp_prefer_allocation patches are in place.
>

Well page_zone is yet another cacheline that doesn't need to be touched,
and that is introduced by this patch. But the line is likely to be hot,
and get_page_from_freelist does not have the required 'zonelist' which
I didn't notice before.

So OK, revisit this later.

--
SUSE Labs, Novell Inc.


2005-11-04 18:08:27

by Rohit Seth

Subject: Re: [PATCH]: Clean up of __alloc_pages

On Tue, 2005-11-01 at 12:14 +1100, Nick Piggin wrote:
> Rohit Seth wrote:
> > On Sat, 2005-10-29 at 12:33 +1000, Nick Piggin wrote:
>
> >>If you don't do this, then a GFP_HIGH allocator can allocate right
> >>down to its limit before it kicks kswapd, then it either will fail or
> >>will have to do direct reclaim.
> >>
> >
> >
> > You are right: if there are only GFP_HIGH requests coming in, then the
> > allocation will go down to (min - min/2) before kicking in kswapd.
> > Though if the requester is not ready to wait, there is another good shot
> > at the allocation succeeding before we get into direct reclaim (and this is
> > happening based on the can_try_harder flag).
> >
>
> Still, it is a change in behaviour that I would rather not introduce
> with a cleanup patch (and is something we don't want to introduce anyway).
>
> So if you could fix that up it would be good.
>

Nick, sorry for not responding earlier.

I agree that it is a slight change in behavior from the original. I doubt,
though, that it will impact anyone in any negative way (maybe for some
higher-order allocations, if at all). On the slightly positive side, there
are less frequent calls to kswapd in some cases, and it clears up the code
a little bit.

But I really don't want to get stuck here. The pcp traversal and
flushing is where I want to go next.

Thanks,
-rohit

2005-11-04 23:58:19

by Nick Piggin

Subject: Re: [PATCH]: Clean up of __alloc_pages

Rohit Seth wrote:

>
> Nick, sorry for not responding earlier.
>

That's OK.

> I agree that it is a slight change in behavior from the original. I doubt,
> though, that it will impact anyone in any negative way (maybe for some
> higher-order allocations, if at all). On the slightly positive side, there
> are less frequent calls to kswapd in some cases, and it clears up the code
> a little bit.
>

I really don't want a change of behaviour going in with this,
especially not one which I would want to revert anyway. But
don't get hung up with it - when you post your latest patch
I will make a patch for the changes I would like to see for it
and synch things up.

> But I really don't want to get stuck here. The pcp traversal and
> flushing is where I want to go next.
>

Sure, hope it goes well!

Nick

--
SUSE Labs, Novell Inc.


2005-11-05 01:57:33

by Rohit Seth

Subject: RE: [PATCH]: Clean up of __alloc_pages


From: Nick Piggin [mailto:[email protected]]


>I really don't want a change of behaviour going in with this,
>especially not one which I would want to revert anyway. But
>don't get hung up with it - when you post your latest patch
>I will make a patch for the changes I would like to see for it
>and synch things up.


Well, those reverts are what I'm trying to avoid ;-)

2005-11-05 17:09:17

by Andi Kleen

Subject: Re: [PATCH]: Clean up of __alloc_pages

Paul Jackson <[email protected]> writes:


Regarding cpumemset and alloc_pages: I recently rechecked
the cpumemset hooks in there, and I must say they turned out
to be quite bad.

In hindsight it would have been better to use the "generate
zonelists for all possible nodes" approach you originally had
and which I rejected (sorry).

That would make the code much cleaner and faster.
Maybe it's not too late to switch to that?

If not, then the fast path definitely needs to be tuned a bit.

-Andi

2005-11-06 04:18:55

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

> Regarding cpumemset and alloc_pages: I recently rechecked
> the cpumemset hooks in there, and I must say they turned out
> to be quite bad.
>
> In hindsight it would have been better to use the "generate
> zonelists for all possible nodes" approach you originally had
> and which I rejected (sorry).
>
> That would make the code much cleaner and faster.
> Maybe it's not too late to switch to that?
>
> If not, then the fast path definitely needs to be tuned a bit.


(What Andi calls cpumemsets are what others call cpusets;
I faked him out with a threatened name change that never
happened. -pj)


Sure, this could be changed to the other scheme if we wanted. Only the
internals of the cpuset, mempolicy, and page_alloc code are affected.

I see only small differences in performance, between the two choices,
in most cases.

The current code in the kernel does the following:
1) The cpuset_update_current_mems_allowed() calls in the
various alloc_page*() paths in mm/mempolicy.c:
* take the task_lock spinlock on the current task
* compare the task's mems_generation to that in its cpuset
2) The first cpuset_zone_allowed() call or two, near the top
of mm/page_alloc.c:__alloc_pages():
* check in_interrupt()
* check if the zone's node is set in task->mems_allowed

This task_lock spinlock, or some performance equivalent, is, I think,
unavoidable.

An essential difference between mempolicy and cpusets is that cpusets
supports outside manipulation of a task's memory placement. Sooner or
later, the task has to synchronize with these outside changes, and a
task_lock(current) in the path to __alloc_pages() is the lowest cost
way I could find to do this.

The one functional difference I know of affects mempolicy MPOL_BIND,
not cpusets. This subset zonelist proposal provided higher quality
zonelists to MPOL_BIND, properly sorted to start the search on the
faulting node and working outward.

===

The alternative approach that Andi refers to, along with some analysis
of its tradeoffs, is visible at:

http://lkml.org/lkml/2004/8/2/256
Date: Mon, 2 Aug 2004 16:35:06 -0700 (PDT)
Subject: [PATCH] subset zonelists and big numa friendly mempolicy MPOL_MBIND

Instead of passing a zonelist to __alloc_pages() with all the
system's nodes, and skipping over those zones on nodes not allowed,
this alternative would build custom zonelists for each cpuset and
each MPOL_BIND mempolicy, containing just the allowed nodes. A set
of zonelists is needed, one for each node in the cpuset, each one
sorted differently so that we search for memory on the nodes closest
to the faulting node first. This set of zonelists has to be packed
in, otherwise they could eat a lot of memory on large systems,
since they are N-squared in size in the number of nodes in the set.
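
(One purely illustrative shape for such a packed set - not taken from the
2004 patch - with one nearest-first ordering per node in the set:)

	/* hypothetical sketch of a per-cpuset/per-policy subset zonelist set */
	struct subset_zonelists {
		int nr_nodes;			/* nodes in this cpuset or MPOL_BIND policy */
		struct zonelist *zonelists;	/* nr_nodes lists; zonelists[i] holds only
						 * the allowed zones, sorted nearest-first
						 * from node i */
	};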

This should make Christoph Lameter happier too. He has been complaining
that the zonelists used by MPOL_BIND are not properly sorted to
look on the nodes nearest the current one first. The fancy subset
zonelists of my proposal provide properly sorted zonelists for
MPOL_BIND policies.

Work that would need to be done for this alternative:

* recode __GFP_HARDWALL to replace the initial short zonelist
with a longer zonelist if need be, and restart __alloc_pages.
* convert some code in this old patch from bitmaps to nodemasks
* add the cpuset hooks to these subset zonelists (the above patch
just has the infrastructure and MPOL_BIND hooks)

Comparative benefits between the current implementation and this
alternative:

Neutral:
Systems with just one or a few memory nodes won't notice much.

Neutral:
When there's enough free memory on the current node to meet the
request, it won't make much difference. A free page on the
current node is found quickly enough, either way.

Alternative wins:
The limit case of searching across many nodes on a large system
that is short of memory should save a few CPU cycles with this
alternative.

Alternative wins:
The instruction cache footprint of __alloc_pages, when one is
short of memory, should be a little smaller with this alternative.

Current way wins:
The kernel text size would likely grow a bit, as the code to pack
zonelists is a couple pages.

Alternative wins:
It provides properly sorted zonelists for MPOL_BIND.

Current way wins:
It's less work - we've already got it.

Alternative wins:
Code esthetics - I haven't coded it, but I suspect that the
alternative has a smaller, prettier impact on __alloc_pages().
From the above Aug 2004 patch, one can see that the alternative
definitely reduces the code size of mm/mempolicy.c, by removing
some specialized MPOL_BIND code.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-06 17:36:07

by Andi Kleen

Subject: Re: [PATCH]: Clean up of __alloc_pages

On Sunday 06 November 2005 05:18, Paul Jackson wrote:

> The current code in the kernel does the following:
> 1) The cpuset_update_current_mems_allowed() calls in the
> various alloc_page*() paths in mm/mempolicy.c:
> * take the task_lock spinlock on the current task

That needs to go imho. At least for the common "cpusets compiled in, but not
used" case. We already have too many locks. Even with cpusets - why
can't you test that generation lockless?

> * compare the task's mems_generation to that in its cpuset

> 2) The first cpuset_zone_allowed() call or two, near the top
> of mm/page_alloc.c:__alloc_pages():
> * check in_interrupt()
> * check if the zone's node is set in task->mems_allowed

It's also too slow for the common "compiled in but not used" case.

I did a simple patch for that - at least skip all the loops when there
is no cpuset - but it got lost in a disk crash.

> This task_lock spinlock, or some performance equivalent, is, I think,
> unavoidable.

why?

>
> An essential difference between mempolicy and cpusets is that cpusets
> supports outside manipulation of a task's memory placement.

Yes, that is their big problem (there is a reason I'm always complaining
about attempts to change mempolicy externally)

But actually some mempolicies can already be changed outside the task -
using VMA policy.

> Sooner or
> later, the task has to synchronize with these outside changes, and a
> task_lock(current) in the path to __alloc_pages() is the lowest cost
> way I could find to do this.

Why can't it just test that generation number lockless after testing
if there is a cpuset at all?
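
(Illustratively, something like the sketch below; the task-side cached
generation field name is made up here, and, as discussed below, the bare
dereference is exactly what turns out to be unsafe without RCU:)

	struct cpuset *cs = current->cpuset;		/* read without task_lock */

	if (cs && cs->mems_generation != current->last_mems_generation)
		refresh_mems();		/* slow path: take locks, resync mems_allowed */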

-Andi

2005-11-06 20:50:04

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

Andi wrote:
> > The current code in the kernel does the following:
> > 1) The cpuset_update_current_mems_allowed() calls in the
> > various alloc_page*() paths in mm/mempolicy.c:
> > * take the task_lock spinlock on the current task
>
> That needs to go imho.

The comment for refresh_mems(), where this is happening, explains
why this lock is needed:

* The task_lock() is required to dereference current->cpuset safely.
* Without it, we could pick up the pointer value of current->cpuset
* in one instruction, and then attach_task could give us a different
* cpuset, and then the cpuset we had could be removed and freed,
* and then on our next instruction, we could dereference a no longer
* valid cpuset pointer to get its mems_generation field.

Hmmm ... on second thought ... damn ... you're right.

I can just flat out remove that task_lock - without penalty.

It's *OK* if I dereference a no longer valid cpuset pointer to get
its (used to be) mems_generation field. Either that field will have
already changed, or it won't.

If it has changed to some other value (doesn't matter what value)
I will realize (quite correctly) that my cpuset has changed out
from under me, and go into the slow path code to lock things down
and update things as need be.

If it has not changed yet (by far the more likely case) then I will
have just missed by the skin of my teeth catching this cpuset
change this time around. Which is just fine. I will catch it next
time this task allocates memory, for sure, as I will be using the
new cpuset pointer value the next time, for sure.

Patch coming soon to remove that task_lock/task_unlock.

Thanks.


> At least for the common "cpusets compiled in, but not
> used" case.

Hmmm ... that comment got me to thinking too. I could have a global
kernel flag "cpusets_have_been_used", initialized to zero (0), set to
one (1) anytime someone creates or modifies a cpuset. Then most of my
hooks, in places like fork and exit, could collapse to really trivial
inline lockless code if cpusets have not been used yet since the system
booted.

Would you be interested in seeing such a patch?

This should even apply to the more interesting case of
cpuset_zone_allowed(), which is called for each iteration of each
zone loop in __alloc_pages(). That would be a nice win for those
systems making no active use of cpusets.
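
(A minimal sketch of that short circuit; the __cpuset_zone_allowed() helper
name is invented here just to stand for the existing checks:)

	int cpusets_have_been_used;	/* set once any cpuset is created or modified */

	static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
	{
		if (!cpusets_have_been_used)
			return 1;	/* no cpusets in play: any zone is allowed */
		return __cpuset_zone_allowed(z, gfp_mask);
	}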

===

What about the other part of your initial question - replacing the
cpuset_zone_allowed() hooks in __alloc_pages() with code to build
custom zonelists?

I did my best in my previous reply to spell out the pluses and minuses
of that change and what work would be involved.

What's your recommendation on whether to do that or not?

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-07 02:55:44

by Nick Piggin

Subject: Re: [PATCH]: Clean up of __alloc_pages

Paul Jackson wrote:
> Andi wrote:
>
>>>The current code in the kernel does the following:
>>> 1) The cpuset_update_current_mems_allowed() calls in the
>>> various alloc_page*() paths in mm/mempolicy.c:
>>> * take the task_lock spinlock on the current task
>>
>>That needs to go imho.
>
>
> The comment for refresh_mems(), where this is happening, explains
> why this lock is needed:
>
> * The task_lock() is required to dereference current->cpuset safely.
> * Without it, we could pick up the pointer value of current->cpuset
> * in one instruction, and then attach_task could give us a different
> * cpuset, and then the cpuset we had could be removed and freed,
> * and then on our next instruction, we could dereference a no longer
> * valid cpuset pointer to get its mems_generation field.
>
> Hmmm ... on second thought ... damn ... you're right.
>
> I can just flat out remove that task_lock - without penalty.
>
> It's *OK* if I dereference a no longer valid cpuset pointer to get
> its (used to be) mems_generation field. Either that field will have
> already changed, or it won't.
>

I don't think so because if the cpuset can be freed, then its page
might be unmapped from the kernel address space if use-after-free
debugging is turned on. And this is a use after free :)

Also, it may be reused for something else far into the future without
having its value changed - is this OK?

Anyway, I think the first problem is a showstopper. I'd look into
Hugh's SLAB_DESTROY_BY_RCU for this, which sounds like a good fit
if you need to go down this path (although I only had a quick skim
over the cpusets code).

--
SUSE Labs, Novell Inc.


2005-11-07 03:43:15

by Andi Kleen

Subject: Re: [PATCH]: Clean up of __alloc_pages

On Monday 07 November 2005 03:57, Nick Piggin wrote:

>
> I don't think so because if the cpuset can be freed, then its page
> might be unmapped from the kernel address space if use-after-free
> debugging is turned on. And this is a use after free :)

RCU could be used to avoid that. Just only free it in a RCU callback.

-Andi

2005-11-07 03:44:38

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

Nick wrote:
> I don't think so because if the cpuset can be freed, then its page
> might be unmapped from the kernel address space if use-after-free
> debugging is turned on. And this is a use after free :)

Yup - that is a showstopper. If dereferencing a stale pointer, even if
one doesn't really care what is read, is a no-no, then this is a no-no.

Thanks, Nick, for catching this.

This puts more value on the other idea I had - a global kernel flag
"cpusets_have_been_used", that could be used to short circuit all the
cpuset hooks on systems that never mucked with cpusets.

For any lurkers wondering why I am chasing stale pointers when I don't
care what I read, it's like this. Essentially, the task doing this
read is looking for an asynchronous incoming level-triggered signal
(going from the two mems_generations being equal to them being unequal),
that in this case is coming in at about the same time we are sampling
for it. Whether we realize this time that the signal came in, or
don't realize it until the next time we sample, doesn't really matter
to us. One way or the other, we'll see it, for sure the next sample if
not this one. So the details of what happened on this read (so long as
no one got annoyed that we tried to chase a stale pointer) don't really
matter. Unfortunately, Nick reminds us that someone will get annoyed.
Oh well.

> Also, it may be reused for something else far into the future without
> having its value changed - is this OK?

That part would be ok. If I failed to realize that the underlying
cpuset had changed this time through __alloc_pages(), I would see it
next time, when I picked up a fresh and useful copy of my task->cpuset
pointer, having long forgotten my stale copy. My stale cpuset pointer
only had a lifetime of a couple machine instructions.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-07 04:37:28

by Paul Jackson

Subject: Re: [PATCH]: Clean up of __alloc_pages

Nick wrote:
> Anyway, I think the first problem is a showstopper. I'd look into
> Hugh's SLAB_DESTROY_BY_RCU for this ...

Andi wrote:
> RCU could be used to avoid that. Just only free it in a RCU callback.


... looking at mm/slab.h and rcupdate.h for the first time ...

Would this mean that I had to put the cpuset structures on their own
slab cache, marked SLAB_DESTROY_BY_RCU?

And is the pair of operators:
task_lock(current), task_unlock(current)
really that much worse than the pair of operators
rcu_read_lock, rcu_read_unlock
which apparently reduce to:
preempt_disable, preempt_enable

Would this work something like the following? Say task A, on processor
AP, is trying to dereference its cpuset pointer, while task B, on
processor BP, is trying hard to destroy that cpuset. Then if task A
wraps its reference in <rcu_read_lock, rcu_read_unlock>, this will keep
the RCU freeing of that memory from completing, until preemption on AP
is re-enabled.

For that matter, if I just put cpuset structs in their own slab
cache, would that be sufficient?

Nick - Does use-after-free debugging even catch use of objects
returned to their slab cache?

What about the other suggestions, Andi:
1) subset zonelists (which you asked to reconsider)
2) a kernel flag "cpusets_have_been_used" to short-circuit
cpuset logic on systems not using cpusets.



--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-07 06:05:55

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

Paul Jackson wrote:
> Nick wrote:
>
>>Anyway, I think the first problem is a showstopper. I'd look into
>>Hugh's SLAB_DESTROY_BY_RCU for this ...
>
>
> Andi wrote:
>
>>RCU could be used to avoid that. Just only free it in a RCU callback.
>
>
>
> ... looking at mm/slab.h and rcupdate.h for the first time ...
>

Yeah, take a look at rmap.c as well, and some of the comments in
changelogs if you need a better feel for it.

Basically SLAB_DESTROY_BY_RCU will allow the entries to be freed
back to the slab for reuse, but will not allow the slab caches to
be freed back to the page allocator inside rcu readside.

So your cpusets may be reused, but only as new cpusets. This should
be no problem at all for you.
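
Concretely, the dedicated cache would be set up roughly like this, using the kmem_cache_create() signature of this era (the cache pointer and init function names are illustrative):

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/slab.h>

static kmem_cache_t *cpuset_cachep;

static int __init cpuset_cache_init(void)
{
	/*
	 * SLAB_DESTROY_BY_RCU: freed objects may be handed out again as
	 * new cpusets right away, but the pages backing the cache are
	 * only returned to the page allocator after an RCU grace period,
	 * so rcu_read_lock() readers never touch an unmapped page.
	 */
	cpuset_cachep = kmem_cache_create("cpuset", sizeof(struct cpuset),
					  0, SLAB_DESTROY_BY_RCU, NULL, NULL);
	return cpuset_cachep ? 0 : -ENOMEM;
}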

> Would this mean that I had to put the cpuset structures on their own
> slab cache, marked SLAB_DESTROY_BY_RCU?
>
> And is the pair of operators:
> task_lock(current), task_unlock(current)
> really that much worse than the pair of operators
> rcu_read_lock, rcu_read_unlock
> which apparently reduce to:
> preempt_disable, preempt_enable
>

You may also have to be careful about memory ordering when setting
a pointer which may be concurrently dereferenced by another CPU so
that stale data doesn't get picked up.

The set side needs an rcu_assign_pointer, and the dereference side
needs rcu_dereference. Unless you either don't care about races, or
already have the correct barriers in place. But it is better to be
safe.
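
In code, the pairing would be on the order of the following sketch (the updater function is made up; task->cpuset and mems_generation follow the names used in this thread):

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Updater: publish a fully initialized cpuset. */
static void set_task_cpuset(struct task_struct *tsk, struct cpuset *cs)
{
	rcu_assign_pointer(tsk->cpuset, cs);	/* write barrier before publication */
}

/* Reader: pick the pointer up with the matching primitive. */
static int read_mems_generation(struct task_struct *tsk)
{
	struct cpuset *cs;
	int gen;

	rcu_read_lock();
	cs = rcu_dereference(tsk->cpuset);
	gen = cs->mems_generation;
	rcu_read_unlock();
	return gen;
}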

> Would this work something like the following? Say task A, on processor
> AP, is trying to dereference its cpuset pointer, while task B, on
> processor BP, is trying hard to destroy that cpuset. Then if task A
> wraps its reference in <rcu_read_lock, rcu_read_unlock>, this will keep
> the RCU freeing of that memory from completing, until interrupts on AP
> are re-enabled.
>

Sounds like it should work.

> For that matter, if I just put cpuset structs in their own slab
> cache, would that be sufficient.
>

No, because the slab caches can get freed back into the general
page allocator at any time.

> Nick - Does use-after-free debugging even catch use of objects
> returned to their slab cache?
>

Yes (slab debugging catches write-after-free at least, I believe);
however, there are exceptions made for RCU-freed slab caches. That
is: it is acceptable to access a freed RCU slab object, especially
if you only read it (writes need to be more careful, but they're
possible in some situations).

> What about the other suggestions, Andi:
> 1) subset zonelists (which you asked to reconsider)
> 2) a kernel flag "cpusets_have_been_used" to short-circuit
> cpuset logic on systems not using cpusets.
>

Not too sure at present. I think #1 might be a good idea though
it would be a bigger change. #2 again might be a good hack for
the time being, although it would be nice to try to get the same
performance from the normal cpuset fastpath.

My RCU suggestion was mainly an idea to get around your immediate
problem with a lockless fastpath, rather than advocating it over
any of the alternatives.

Thanks,
Nick

--
SUSE Labs, Novell Inc.


2005-11-07 09:47:14

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

Nick wrote:
> Yeah, take a look at rmap.c as well, and some of the comments in
> changelogs if you need a better feel for it.

Ok - thanks.


> So your cpusets may be reused, but only as new cpusets. This should
> be no problem at all for you.

Correct - should be no problem.


> > And is the pair of operators:
> > task_lock(current), task_unlock(current)
> > really that much worse than the pair of operators
> > ...
> > preempt_disable, preempt_enable

That part still surprises me a little. Is there enough difference in
the performance between:

1) task_lock, which is a spinlock on current->alloc_lock and
2) rcu_read_lock, which is .preempt_count++; barrier()

to justify a separate slab cache for cpusets and a little more code?

For all I know (not much) the task_lock might actually be cheaper ;).
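
For reference, the two fastpath variants being weighed look roughly like this (a sketch, not the actual cpuset code; field names as in the thread):

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* 1) Pin the pointer with the task's alloc_lock. */
static int sample_generation_locked(void)
{
	int gen;

	task_lock(current);
	gen = current->cpuset->mems_generation;
	task_unlock(current);
	return gen;
}

/* 2) RCU read side: on kernels of this era, preempt_disable()/enable(). */
static int sample_generation_rcu(void)
{
	int gen;

	rcu_read_lock();
	gen = rcu_dereference(current->cpuset)->mems_generation;
	rcu_read_unlock();
	return gen;
}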


> You may also have to be careful about memory ordering when setting
> a pointer which may be concurrently dereferenced by another CPU so
> that stale data doesn't get picked up.
>
> The set side needs an rcu_assign_pointer, and the dereference side
> needs rcu_dereference. Unless you either don't care about races,

I don't think I care ... I'm just sampling task->cpuset->mems_generation,
looking for it to change. Sooner or later, after it changes, I will get
an accurate read of it, realize it changed, and immediately down a
cpuset semaphore and reread all values of interest.

The semaphore down means doing an atomic_dec_return(), which imposes
a memory barrier, right?
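
The resync step described above would be something like the sketch below; cpuset_sem, the per-task generation field, and the function name are assumed here for illustration:

#include <linux/sched.h>
#include <asm/semaphore.h>

static DECLARE_MUTEX(cpuset_sem);	/* illustrative: the cpuset-wide semaphore */

static void cpuset_resync_current(void)
{
	/*
	 * down() performs an atomic decrement that implies a memory
	 * barrier, so the rereads below cannot use values cached from
	 * before the semaphore was taken.
	 */
	down(&cpuset_sem);
	current->mems_allowed = current->cpuset->mems_allowed;
	current->cpuset_mems_generation = current->cpuset->mems_generation;
	up(&cpuset_sem);
}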


> My RCU suggestion was mainly an idea to get around your immediate
> problem with a lockless fastpath, rather than advocating it over
> any of the alternatives.

Understood. Thanks for your comments on the alternatives - they
seem reasonable.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-07 10:15:27

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

Paul Jackson wrote:
> Nick wrote:

>>>And is the pair of operators:
>>> task_lock(current), task_unlock(current)
>>>really that much worse than the pair of operators
>>> ...
>>> preempt_disable, preempt_enable
>
>
> That part still surprises me a little. Is there enough difference in
> the performance between:
>
> 1) task_lock, which is a spinlock on current->alloc_lock and
> 2) rcu_read_lock, which is .preempt_count++; barrier()
>
> to justify a separate slab cache for cpusets and a little more code?
>
> For all I know (not much) the task_lock might actually be cheaper ;).
>

But on a preempt kernel the spinlock must disable preempt as well!

Not to mention that a spinlock is an atomic op (though that is getting
cheaper these days) + 2 memory barriers (getting more expensive).

> The semaphore down means doing an atomic_dec_return(), which imposes
> a memory barrier, right?
>

Yep.

--
SUSE Labs, Novell Inc.

2005-11-07 14:41:56

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Clean up of __alloc_pages

Ok ... so a spinlock (task_lock) costs one barrier and one atomic more
than an rcu_read_lock (on a preempt kernel where both spinlock and
rcu_read_lock cost a preempt), or something like that.

Thanks, Nick.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401