2005-11-08 01:44:04

by Rohit Seth

[permalink] [raw]
Subject: [PATCH]: Cleanup of __alloc_pages

[PATCH]: Clean up of __alloc_pages. Couple of difference from original behavior:
1- remove the initial reclaim logic
2- GFP_HIGH pages are allowed to go little below watermark sooner.
3- Search for free pages unconditionally after direct reclaim.

Signed-off-by: Rohit Seth <[email protected]>


--- linux-2.6.14.org/mm/page_alloc.c 2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.14/mm/page_alloc.c 2005-11-07 09:37:45.000000000 -0800
@@ -707,9 +707,7 @@
}
local_irq_restore(flags);
put_cpu();
- }
-
- if (page == NULL) {
+ } else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -770,6 +768,45 @@
return 1;
}

+/* get_page_from_freeliest loops through all the possible zones
+ * to find out if it can allocate a page. can_try_harder can have following
+ * values:
+ * -1 => No need to check for the watermarks.
+ * 0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
+ * 1 => Go far below the low watermark. See zone_watermark_ok (RT TASK)
+ *
+ * cpuset check is not performed when the skip_cpuset_chk flag is set.
+ */
+
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zone **zones,
+ int can_try_harder, int skip_cpuset_chk)
+{
+ struct zone *z;
+ struct page *page = NULL;
+ int classzone_idx = zone_idx(zones[0]);
+ int i;
+
+ /*
+ * Go through the zonelist once, looking for a zone with enough free.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ */
+ for (i = 0; (z = zones[i]) != NULL; i++) {
+ if (!skip_cpuset_chk && (!cpuset_zone_allowed(z, gfp_mask)))
+ continue;
+ if ((can_try_harder >= 0) &&
+ (!zone_watermark_ok(z, order, z->pages_low,
+ classzone_idx, can_try_harder,
+ gfp_mask & __GFP_HIGH)))
+ continue;
+
+ page = buffered_rmqueue(z, order, gfp_mask);
+ if (page)
+ break;
+ }
+ return page;
+}
+
/*
* This is the 'heart' of the zoned buddy allocator.
*/
@@ -778,15 +815,13 @@
struct zonelist *zonelist)
{
const int wait = gfp_mask & __GFP_WAIT;
- struct zone **zones, *z;
+ struct zone **zones, *z = NULL;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
int i;
- int classzone_idx;
int do_retry;
int can_try_harder;
- int did_some_progress;

might_sleep_if(wait);

@@ -803,42 +838,10 @@
/* Should this ever happen?? */
return NULL;
}
-
- classzone_idx = zone_idx(zones[0]);
-
restart:
- /*
- * Go through the zonelist once, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- /*
- * If the zone is to attempt early page reclaim then this loop
- * will try to reclaim pages and check the watermark a second
- * time before giving up and falling back to the next zone.
- */
-zone_reclaim_retry:
- if (!zone_watermark_ok(z, order, z->pages_low,
- classzone_idx, 0, 0)) {
- if (!do_reclaim)
- continue;
- else {
- zone_reclaim(z, gfp_mask, order);
- /* Only try reclaim once */
- do_reclaim = 0;
- goto zone_reclaim_retry;
- }
- }
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zones, 0, 0);
+ if (page)
+ goto got_pg;

for (i = 0; (z = zones[i]) != NULL; i++)
wakeup_kswapd(z, order);
@@ -851,19 +854,11 @@
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (wait && !cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ if (!wait)
+ page = get_page_from_freelist(gfp_mask, order, zones,
+ can_try_harder, 1);
+ if (page)
+ goto got_pg;

/* This allocation should allow future memory freeing. */

@@ -871,13 +866,9 @@
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
/* go through the zonelist yet again, ignoring mins */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order, zones, -1, 1);
+ if (page)
+ goto got_pg;
}
goto nopage;
}
@@ -894,47 +885,20 @@
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- did_some_progress = try_to_free_pages(zones, gfp_mask);
+ try_to_free_pages(zones, gfp_mask);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;

cond_resched();

- if (likely(did_some_progress)) {
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
- } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+ page = get_page_from_freelist(gfp_mask, order, zones, can_try_harder, 0);
+ if (page)
+ goto got_pg;
+ if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
/*
- * Go through the zonelist yet one more time, keep
- * very high watermark here, this is only to catch
- * a parallel oom killing, we must fail if we're still
- * under heavy pressure.
+ * Start the OOM here.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_high,
- classzone_idx, 0, 0))
- continue;
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
-
out_of_memory(gfp_mask, order);
goto restart;
}
@@ -968,7 +932,7 @@
}
return NULL;
got_pg:
- zone_statistics(zonelist, z);
+ zone_statistics(zonelist, page_zone(page));
return page;
}


2005-11-08 01:54:11

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

"Rohit, Seth" <[email protected]> wrote:
>
> [PATCH]: Clean up of __alloc_pages. Couple of difference from original behavior:
> 1- remove the initial reclaim logic
> 2- GFP_HIGH pages are allowed to go little below watermark sooner.
> 3- Search for free pages unconditionally after direct reclaim.

Would it be possible to break these into three separate patches? The
cleanup part should be #1.

> +/* get_page_from_freeliest loops through all the possible zones
> + * to find out if it can allocate a page. can_try_harder can have following
> + * values:
> + * -1 => No need to check for the watermarks.
> + * 0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
> + * 1 => Go far below the low watermark. See zone_watermark_ok (RT TASK)
> + *
> + * cpuset check is not performed when the skip_cpuset_chk flag is set.
> + */
> +
> +static struct page *
> +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zone **zones,
> + int can_try_harder, int skip_cpuset_chk)
> +{
> + struct zone *z;
> + struct page *page = NULL;
> + int classzone_idx = zone_idx(zones[0]);
> + int i;
> +
> + /*
> + * Go through the zonelist once, looking for a zone with enough free.
> + * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
> + */
> + for (i = 0; (z = zones[i]) != NULL; i++) {
> + if (!skip_cpuset_chk && (!cpuset_zone_allowed(z, gfp_mask)))

It'd be nice to not have the `skip_cpuset_chk' flag there. a) it gives
Linus conniptions and b) it's a little extra overhead for !CONFIG_CPUSETS
kernels.

> - zone_statistics(zonelist, z);
> + zone_statistics(zonelist, page_zone(page));

Evaluating page_zone() is not completely trivial. Can we avoid the above?

2005-11-08 02:09:52

by Rohit Seth

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

On Mon, 2005-11-07 at 17:53 -0800, Andrew Morton wrote:
> "Rohit, Seth" <[email protected]> wrote:
> >
> > [PATCH]: Clean up of __alloc_pages. Couple of difference from original behavior:
> > 1- remove the initial reclaim logic
> > 2- GFP_HIGH pages are allowed to go little below watermark sooner.
> > 3- Search for free pages unconditionally after direct reclaim.
>
> Would it be possible to break these into three separate patches? The
> cleanup part should be #1.
>

Doing the above three things as part of this clean up patch makes the
code look extra clean... Is there any specific issue coming out of 2 & 3
above.

> > + if (!skip_cpuset_chk && (!cpuset_zone_allowed(z, gfp_mask)))
>
> It'd be nice to not have the `skip_cpuset_chk' flag there. a) it gives
> Linus conniptions and b) it's a little extra overhead for !CONFIG_CPUSETS
> kernels.
>

I think it will be easier to do this change as a follow on patch as that
will change the header file, function definition and such. Can we defer
this to separate follow on patch.

> > - zone_statistics(zonelist, z);
> > + zone_statistics(zonelist, page_zone(page));
>
> Evaluating page_zone() is not completely trivial. Can we avoid the above?

Okay. Last time Nick also mentioned this but agreed to keep it here. I
will uplevel so that I don't go through the page_zone.

-rohit

2005-11-08 02:44:20

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

Rohit Seth <[email protected]> wrote:
>
> On Mon, 2005-11-07 at 17:53 -0800, Andrew Morton wrote:
> > "Rohit, Seth" <[email protected]> wrote:
> > >
> > > [PATCH]: Clean up of __alloc_pages. Couple of difference from original behavior:
> > > 1- remove the initial reclaim logic
> > > 2- GFP_HIGH pages are allowed to go little below watermark sooner.
> > > 3- Search for free pages unconditionally after direct reclaim.
> >
> > Would it be possible to break these into three separate patches? The
> > cleanup part should be #1.
> >
>
> Doing the above three things as part of this clean up patch makes the
> code look extra clean...

With separate patches the changes can be better understood, and they can be
selectively dropped, and people looking for regressions with `git bisect'
will be able to pinpoint the source more accurately.

> Is there any specific issue coming out of 2 & 3
> above.

I haven't looked yet - all the changes are mixed together ;)

2005-11-08 03:07:28

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

Seth wrote:
> +/* get_page_from_freeliest loops through all the possible zones
> + * to find out if it can allocate a page. can_try_harder can have following
> + * values:
> + * -1 => No need to check for the watermarks.
> + * 0 => Don't go too low down in deeps below the low watermark (GFP_HIGH)
> + * 1 => Go far below the low watermark. See zone_watermark_ok (RT TASK)

Argh.

These magic numbers, where in terms of how hard to try, 0 is less than
1 is less than -1, but where the order -does- matter for parsing such
tests as "if ((can_try_harder >= 0)" and where one has to read the
entire code to guess that, continue to give me conniptions.

I thought Nick had an alternative proposal, involving just boolean
flags. Why didn't you ever consider that?


> + * cpuset check is not performed when the skip_cpuset_chk flag is set.
> + */
> +
> +static struct page *
> +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zone **zones,
> + int can_try_harder, int skip_cpuset_chk)

Well - thanks for thinking of me ;). Though, as I suggested in my
reply last time, including a pseudo patch, I thought that the existing
flags such as can_try_harder had enough information to determine when
to do the cpuset check, without yet another flag for that. Having now
two magic 1's and 0's at the end of the calling argument lists is even
less readable.


Seth wrote in a later message, responding to Andrew:
> I think it will be easier to do this change as a follow on patch as that
> will change the header file, function definition and such. Can we defer
> this to separate follow on patch.

I have no clue what patch you have in mind here. Guess I'd have to see it.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-08 03:45:17

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

From: Rohit Seth <[email protected]>

Clean up of __alloc_pages.

Signed-off-by: Rohit Seth <[email protected]>

Restoration of previous behaviour, plus further cleanups
by introducing an 'alloc_flags', removing the last of
should_reclaim_zone.

Signed-off-by: Nick Piggin <[email protected]>


Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c 2005-11-08 14:39:56.000000000 +1100
+++ linux-2.6/mm/page_alloc.c 2005-11-08 14:42:11.000000000 +1100
@@ -707,9 +707,7 @@ buffered_rmqueue(struct zone *zone, int
}
local_irq_restore(flags);
put_cpu();
- }
-
- if (page == NULL) {
+ } else {
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -729,20 +727,25 @@ buffered_rmqueue(struct zone *zone, int
return page;
}

+#define ALLOC_WATERMARKS 0x01 /* check watermarks */
+#define ALLOC_HARDER 0x02 /* try to alloc harder */
+#define ALLOC_HIGH 0x04 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x08 /* check for correct cpuset */
+
/*
* Return 1 if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int classzone_idx, int can_try_harder, int gfp_high)
+ int classzone_idx, int alloc_flags)
{
/* free_pages my go negative - that's OK */
long min = mark, free_pages = z->free_pages - (1 << order) + 1;
int o;

- if (gfp_high)
+ if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
- if (can_try_harder)
+ if (alloc_flags & ALLOC_HARDER)
min -= min / 4;

if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -760,14 +763,41 @@ int zone_watermark_ok(struct zone *z, in
return 1;
}

-static inline int
-should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
-{
- if (!z->reclaim_pages)
- return 0;
- if (gfp_mask & __GFP_NORECLAIM)
- return 0;
- return 1;
+/*
+ * get_page_from_freeliest goes through the zonelist trying to allocate
+ * a page.
+ */
+static struct page *
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+ struct zonelist *zonelist, int alloc_flags)
+{
+ struct zone **z = zonelist->zones;
+ struct page *page = NULL;
+ int classzone_idx = zone_idx(*z);
+
+ /*
+ * Go through the zonelist once, looking for a zone with enough free.
+ * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ */
+ do {
+ if ((alloc_flags & ALLOC_CPUSET) &&
+ !cpuset_zone_allowed(*z, gfp_mask))
+ continue;
+
+ if (alloc_flags & ALLOC_WATERMARKS) {
+ if (!zone_watermark_ok(*z, order, (*z)->pages_low,
+ classzone_idx, alloc_flags))
+ continue;
+ }
+
+ page = buffered_rmqueue(*z, order, gfp_mask);
+ if (page) {
+ zone_statistics(zonelist, *z);
+ break;
+ }
+ z++;
+ } while (*z != NULL);
+ return page;
}

/*
@@ -778,70 +808,49 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
struct zonelist *zonelist)
{
const int wait = gfp_mask & __GFP_WAIT;
- struct zone **zones, *z;
+ struct zone **z;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
- int i;
- int classzone_idx;
int do_retry;
- int can_try_harder;
+ int alloc_flags;
int did_some_progress;

might_sleep_if(wait);

- /*
- * The caller may dip into page reserves a bit more if the caller
- * cannot run direct reclaim, or is the caller has realtime scheduling
- * policy
- */
- can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
-
- zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
+ z = zonelist->zones; /* the list of zones suitable for gfp_mask */

- if (unlikely(zones[0] == NULL)) {
+ if (unlikely(*z == NULL)) {
/* Should this ever happen?? */
return NULL;
}
+restart:
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+ zonelist, ALLOC_WATERMARKS|ALLOC_CPUSET);
+ if (page)
+ goto got_pg;

- classzone_idx = zone_idx(zones[0]);
+ do {
+ wakeup_kswapd(*z, order);
+ z++;
+ } while (*z != NULL);

-restart:
/*
- * Go through the zonelist once, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+ * OK, we're below the kswapd watermark and have kicked background
+ * reclaim. Now things get more complex, so st up alloc_flags according
+ * to how we want to proceed.
+ *
+ * The caller may dip into page reserves a bit more if the caller
+ * cannot run direct reclaim, or is the caller has realtime scheduling
+ * policy.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- int do_reclaim = should_reclaim_zone(z, gfp_mask);
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- /*
- * If the zone is to attempt early page reclaim then this loop
- * will try to reclaim pages and check the watermark a second
- * time before giving up and falling back to the next zone.
- */
-zone_reclaim_retry:
- if (!zone_watermark_ok(z, order, z->pages_low,
- classzone_idx, 0, 0)) {
- if (!do_reclaim)
- continue;
- else {
- zone_reclaim(z, gfp_mask, order);
- /* Only try reclaim once */
- do_reclaim = 0;
- goto zone_reclaim_retry;
- }
- }
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
-
- for (i = 0; (z = zones[i]) != NULL; i++)
- wakeup_kswapd(z, order);
+ alloc_flags = 0;
+ if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
+ alloc_flags |= ALLOC_HARDER;
+ if (gfp_mask & __GFP_HIGH)
+ alloc_flags |= ALLOC_HIGH;
+ if (wait)
+ alloc_flags |= ALLOC_CPUSET;

/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -851,19 +860,9 @@ zone_reclaim_retry:
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (wait && !cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+ if (page)
+ goto got_pg;

/* This allocation should allow future memory freeing. */

@@ -871,13 +870,10 @@ zone_reclaim_retry:
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
/* go through the zonelist yet again, ignoring mins */
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order,
+ zonelist, ALLOC_CPUSET);
+ if (page)
+ goto got_pg;
}
goto nopage;
}
@@ -894,7 +890,7 @@ rebalance:
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;

- did_some_progress = try_to_free_pages(zones, gfp_mask);
+ did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);

p->reclaim_state = NULL;
p->flags &= ~PF_MEMALLOC;
@@ -902,19 +898,10 @@ rebalance:
cond_resched();

if (likely(did_some_progress)) {
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_min,
- classzone_idx, can_try_harder,
- gfp_mask & __GFP_HIGH))
- continue;
-
- if (!cpuset_zone_allowed(z, gfp_mask))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask, order,
+ zonelist, alloc_flags);
+ if (page)
+ goto got_pg;
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
/*
* Go through the zonelist yet one more time, keep
@@ -922,18 +909,10 @@ rebalance:
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
- for (i = 0; (z = zones[i]) != NULL; i++) {
- if (!zone_watermark_ok(z, order, z->pages_high,
- classzone_idx, 0, 0))
- continue;
-
- if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
- continue;
-
- page = buffered_rmqueue(z, order, gfp_mask);
- if (page)
- goto got_pg;
- }
+ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+ zonelist, ALLOC_WATERMARKS|ALLOC_CPUSET);
+ if (page)
+ goto got_pg;

out_of_memory(gfp_mask, order);
goto restart;
@@ -966,9 +945,7 @@ nopage:
dump_stack();
show_mem();
}
- return NULL;
got_pg:
- zone_statistics(zonelist, z);
return page;
}

Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h 2005-11-08 14:39:56.000000000 +1100
+++ linux-2.6/include/linux/mmzone.h 2005-11-08 14:41:26.000000000 +1100
@@ -302,7 +302,7 @@ void get_zone_counts(unsigned long *acti
void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone, int order);
int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
- int alloc_type, int can_try_harder, int gfp_high);
+ int classzone_idx, int alloc_flags);

#ifdef CONFIG_HAVE_MEMORY_PRESENT
void memory_present(int nid, unsigned long start, unsigned long end);
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c 2005-11-08 14:39:56.000000000 +1100
+++ linux-2.6/mm/vmscan.c 2005-11-08 14:41:26.000000000 +1100
@@ -1069,7 +1069,7 @@ loop_again:

if (nr_pages == 0) { /* Not software suspend */
if (zone_watermark_ok(zone, order,
- zone->pages_high, first_low_zone, 0, 0))
+ zone->pages_high, first_low_zone, 0))
continue;

all_zones_ok = 0;
@@ -1222,7 +1222,7 @@ void wakeup_kswapd(struct zone *zone, in
return;

pgdat = zone->zone_pgdat;
- if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0))
+ if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;


Attachments:
__alloc_pages-cleanup.patch (9.81 kB)

2005-11-08 05:29:58

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

Paul Jackson wrote:

> I thought Nick had an alternative proposal, involving just boolean

Hi Paul,

I sent the cleanup patch plus some modifications to lkml in a
subthread but forgot to CC you on it.

Let me know if you have any comments or suggestions on it.

--
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com

2005-11-08 05:44:35

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

Nick wrote:
> The compiler will constant fold this out if it is halfway smart.

How could that happen - when get_page_from_freelist() is called twice,
once with skip_cpuset_chk == 0 and once with skip_cpuset_chk == 1?


> +#define ALLOC_WATERMARKS 0x01 /* check watermarks */
> +#define ALLOC_HARDER 0x02 /* try to alloc harder */
> +#define ALLOC_HIGH 0x04 /* __GFP_HIGH set */
> +#define ALLOC_CPUSET 0x08 /* check for correct cpuset */

Names - bless you.

If these names were in a header, then calls to zone_watermark_ok()
from mm/vmscan.c could use them too?


> + * reclaim. Now things get more complex, so st up alloc_flags according

Typo: s/st/set/


At first glance, I think you've expressed the cpuset flags correctly.
Well, correctly maintained their current meaning. Read on, and you
will see that I think that is not right.

I'm just reading the raw patch, so likely I missed something here.
But it seems to me that zone_watermark_ok() is called from __alloc_pages()
only if the ALLOC_WATERMARKS flag is set, and it seems that the two
alloc_flags values ALLOC_HARDER and ALLOC_HIGH are only of use if
zone_watermark() is called. So what use is it setting ALLOC_HARDER
or ALLOC_HIGH if ALLOC_WATERMARKS is not set? If the get_page_from_freelist()
check:
if (alloc_flags & ALLOC_WATERMARKS)
was instead:
if (alloc_flags & ALLOC_WATERMARKS|ALLOC_HARDER|ALLOC_HIGH)
then this would make more sense to me. Or changing ALLOC_WATERMARKS
to ALLOC_EASY, and make it behave similarly to the HARDER & HIGH flags.
Or maybe if the initialization of alloc_flags:
> + alloc_flags = 0;
was instead:
+ alloc_flags = ALLOC_WATERMARKS;

As a possible future change, I wonder if there is a way to avoid having
the ALLOC_CPUSET flag separate, and instead collapse the logic (even if
it means modifying the conditions a little when cpusets are honored) to
make the cpuset_zone_allowed() check based on some combination of these
other WATERMARKS/HARDER/HIGH values. For example, it might make sense
to check cpuset_zone_allowed() under the same conditions that it made
sense to call zone_watermark_ok() from get_page_from_freelist(), or
perhaps to check cpusets unless we are HIGH or not checking
watermarks. To the best of my knowledge, subtle variations between
when we check some level of watermarks and when we check cpusets are
not worth it - not worth the extra conditional jump in the machine
code, and not worth the extra bit of logic and flags in the source code.

The cpuset check in the 'ignoring mins' code shortly after this for the
PF_MEMALLOC or TIF_MEMDIE cases seems bogus. This is the case where we
should be most willing to use memory, regardless of where we find it.
That cpuset check should be removed.

My current inclination - check cpusets in the WATERMARKS or HARDER
or (HIGH && wait) cases, but ignore cpusets in the (HIGH && !wait) or
'ignoring mins' cases. Can "HIGH && wait" even happen ?? Are
allocations either GFP_ATOMIC (aka GFP_HIGH) or (exclusive or)
GFP_WAIT, never both? Perhaps GFP_HIGH should be permanently
deleted (another cleanup) in favor of the more popular and expressive
GFP_ATOMIC, and __GFP_WAIT retired, in favor of !GFP_ATOMIC.

However, I appreciate your preference to separate cleanup from semantic
change. Perhaps this means leaving the ALLOC_CPUSET flag in your
cleanup patch, then one of us following on top of that with a patch to
simplify and fix the cpuset invocation semantics and a second cleanup
patch to remove ALLOC_CPUSET as a separate flag.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-08 05:58:21

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

Paul Jackson wrote:
> Nick wrote:
>
>>The compiler will constant fold this out if it is halfway smart.
>
>
> How could that happen - when get_page_from_freelist() is called twice,
> once with skip_cpuset_chk == 0 and once with skip_cpuset_chk == 1?
>

Because it is on the other side of an &&, which evaulates to a
constant zero when !CONFIG_CPUSETS.

>
>
>>+#define ALLOC_WATERMARKS 0x01 /* check watermarks */
>>+#define ALLOC_HARDER 0x02 /* try to alloc harder */
>>+#define ALLOC_HIGH 0x04 /* __GFP_HIGH set */
>>+#define ALLOC_CPUSET 0x08 /* check for correct cpuset */
>
>
> Names - bless you.
>
> If these names were in a header, then calls to zone_watermark_ok()
> from mm/vmscan.c could use them too?
>
>
>
>>+ * reclaim. Now things get more complex, so st up alloc_flags according
>
>
> Typo: s/st/set/
>
>
> At first glance, I think you've expressed the cpuset flags correctly.
> Well, correctly maintained their current meaning. Read on, and you
> will see that I think that is not right.
>
> I'm just reading the raw patch, so likely I missed something here.
> But it seems to me that zone_watermark_ok() is called from __alloc_pages()
> only if the ALLOC_WATERMARKS flag is set, and it seems that the two
> alloc_flags values ALLOC_HARDER and ALLOC_HIGH are only of use if
> zone_watermark() is called. So what use is it setting ALLOC_HARDER
> or ALLOC_HIGH if ALLOC_WATERMARKS is not set? If the get_page_from_freelist()
> check:
> if (alloc_flags & ALLOC_WATERMARKS)
> was instead:
> if (alloc_flags & ALLOC_WATERMARKS|ALLOC_HARDER|ALLOC_HIGH)
> then this would make more sense to me. Or changing ALLOC_WATERMARKS
> to ALLOC_EASY, and make it behave similarly to the HARDER & HIGH flags.
> Or maybe if the initialization of alloc_flags:
>
>>+ alloc_flags = 0;
>
> was instead:
> + alloc_flags = ALLOC_WATERMARKS;
>

Yep that's a bug. Thanks. Maybe instead we should have a specific
flag for ALLOC_NO_WATERMARKS because that is the unusual case. The
use of the flag there would be a good annotation too.



> The cpuset check in the 'ignoring mins' code shortly after this for the
> PF_MEMALLOC or TIF_MEMDIE cases seems bogus. This is the case where we
> should be most willing to use memory, regardless of where we find it.
> That cpuset check should be removed.
>

OK that would be fine, but let's do that (and your suggested possible
consolidation of ALLOC_CPUSET) in another patch?

> My current inclination - check cpusets in the WATERMARKS or HARDER
> or (HIGH && wait) cases, but ignore cpusets in the (HIGH && !wait) or
> 'ignoring mins' cases. Can "HIGH && wait" even happen ?? Are

Yes there is nothing preventing it.

> allocations either GFP_ATOMIC (aka GFP_HIGH) or (exclusive or)
> GFP_WAIT, never both? Perhaps GFP_HIGH should be permanently
> deleted (another cleanup) in favor of the more popular and expressive
> GFP_ATOMIC, and __GFP_WAIT retired, in favor of !GFP_ATOMIC.
>

Having __GFP_HIGH as its own flag gives some more flexibility. I
don't think it has a downside?

> However, I appreciate your preference to separate cleanup from semantic
> change. Perhaps this means leaving the ALLOC_CPUSET flag in your
> cleanup patch, then one of us following on top of that with a patch to
> simplify and fix the cpuset invocation semantics and a second cleanup
> patch to remove ALLOC_CPUSET as a separate flag.
>

That would be good. I'll send off a fresh patch with the
ALLOC_WATERMARKS fixed after Rohit gets around to looking over
it.

--
SUSE Labs, Novell Inc.

Send instant messages to your online friends http://au.messenger.yahoo.com

2005-11-08 06:22:44

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

Nick wrote:
> Because it is on the other side of an &&, which evaulates to a
> constant zero when !CONFIG_CPUSETS.

Ah so.

> Having __GFP_HIGH as its own flag gives some more flexibility. I
> don't think it has a downside?

With respect to GFP_ATOMIC, __GFP_HIGH has no flexibility, as they are
#defined to be the same thing.

With respect to __GFP_WAIT, if we only ever use it exactly when
we don't use __GFP_HIGH aka GFP_ATOMIC, then there is a definite
downside. My old brain doesn't fold constants nearly as reliably or
rapidly as a compiler. Every apparent degree of freedom that is unused
wastes a few of my remaining precious neurons understanding it.
It directly leads to such bugs as the one I noted in my last reply,
when I realized that checking cpusets in the 'ignoring mins' case
was bogus.

__GFP_HIGH has a second cost - it is easily confused with __GFP_HIGHMEM.

> That would be good. I'll send off a fresh patch with the
> ALLOC_WATERMARKS fixed after Rohit gets around to looking over
> it.

Good.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-08 18:11:14

by Rohit Seth

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

On Tue, 2005-11-08 at 17:00 +1100, Nick Piggin wrote:

>
> > However, I appreciate your preference to separate cleanup from semantic
> > change. Perhaps this means leaving the ALLOC_CPUSET flag in your
> > cleanup patch, then one of us following on top of that with a patch to
> > simplify and fix the cpuset invocation semantics and a second cleanup
> > patch to remove ALLOC_CPUSET as a separate flag.
> >
>
> That would be good. I'll send off a fresh patch with the
> ALLOC_WATERMARKS fixed after Rohit gets around to looking over
> it.
>

Nick, your changes have really come out good. Thanks. I think it is
definitely a good starting point as it maintains all of existing
behavior.

I guess now I can argue about why we should keep the watermark low for
GFP_HIGH ;-)

Paul, sorry for troubling you with those magic numbers again in the
original patch...

-rohit

2005-11-08 19:55:04

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

Rohit wrote:
> Paul, sorry for troubling you with those magic numbers again in the
> original patch...

That's no problem. I enjoyed the opportunity to protest it again ;).

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-09 00:17:51

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

If you're going to remove the early reclaim logic, then
lets also nuke the related apparatus: should_reclaim_zone()
and __GFP_NORECLAIM (which is used in a couple of pagemap.h
macros as well)?

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-09 02:53:05

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

Rohit Seth wrote:

>On Tue, 2005-11-08 at 17:00 +1100, Nick Piggin wrote:
>
>
>>That would be good. I'll send off a fresh patch with the
>>ALLOC_WATERMARKS fixed after Rohit gets around to looking over
>>it.
>>
>>
>
>Nick, your changes have really come out good. Thanks. I think it is
>definitely a good starting point as it maintains all of existing
>behavior.
>
>

Great, glad you agree. I'll send the revised copy upstream.

>I guess now I can argue about why we should keep the watermark low for
>GFP_HIGH ;-)
>
>

Yep, I would be happy to discuss this with you and linux-mm :)


Send instant messages to your online friends http://au.messenger.yahoo.com

2005-11-13 05:09:41

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

The __GFP_HIGH, GFP_ATOMIC, __GFP_WAIT flags are still driving me bonkers.

It seems to me that:
1) __GFP_WAIT is supposed to mean can wait, and __alloc_pages()
keys off that bit to set its "wait" variable. Good so far.
2) __GFP_HIGH is supposed to mean can access emergency pools
(use lower watermarks), and __alloc_pages() does that. Also
good so far.
3) But gfp.h defines GFP_ATOMIC to be an alias for __GFP_HIGH,
and many callers through out the kernel use GFP_ATOMIC to mean
"can't sleep" or "can't wait" or some such. These folks are
not getting the service they expect - they are asking for the
most aggressive form of allocation (short perhaps of the
special case for allocations that will net free more memory
than they require, such as exiting), and they get the half way
improvement instead, with the possibility of sleeping (!).

The confusion even extends to the comments in __alloc_pages(),
such as in the lines:

/* Atomic allocations - we can't balance anything */
if (!wait)
goto nopage;

The "!wait" condition is --not-- GFP_ATOMIC, which is what
one might think was meant by "Atomic allocations", and likely
what the many users of GFP_ATOMIC were expecting - a nopage
response in such cases.

Perhaps GFP_ATOMIC should be its own __GFP_ATOMIC bit, with a BUG_ON
if both __GFP_ATOMIC and __GFP_WAIT are set at the same time,
leaving __GFP_HIGH for the few uses where people were just asking
to go a bit lower in the reserves.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-13 05:14:38

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

An even stranger line:

fs/xfs/linux-2.6/xfs_buf.c has:
aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);

Given the gfp.h line:
#define GFP_ATOMIC (__GFP_VALID | __GFP_HIGH)

that xfs_buf line makes no sense. There is almost no chance
that the author of that xfs_buf.c line was aware they were
spelling the empty gfp flag __GFP_VALID.

--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401

2005-11-13 07:00:58

by Nathan Scott

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

On Sat, Nov 12, 2005 at 09:14:29PM -0800, Paul Jackson wrote:
> An even stranger line:
>
> fs/xfs/linux-2.6/xfs_buf.c has:
> aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);
>
> Given the gfp.h line:
> #define GFP_ATOMIC (__GFP_VALID | __GFP_HIGH)
>
> that xfs_buf line makes no sense. There is almost no chance
> that the author of that xfs_buf.c line was aware they were
> spelling the empty gfp flag __GFP_VALID.

Actually, there is a very good chance of that - it was
Andrew's recommendation a few months back... ;)

cheers.

--
Nathan

2005-11-13 07:12:35

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

Paul Jackson <[email protected]> wrote:
>
> fs/xfs/linux-2.6/xfs_buf.c has:
> aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH);

That's a reasonable thing to do, actually.

GFP_ATOMIC means "don't sleep" (!__GFP_WAIT) and "use emergency pools"
(__GFP_HIGH).

XFS is saying "don't sleep" and "don't use the emergency pools".

Yes, the fact that GFP_ATOMIC also implies "use the emergency pool" is
unfortunate, and perhaps the two should always have been separated out, at
least to make the programmer think about whether the code really needs
access to the emergency pools. Usually it does.

But I haven't seen much sign that it's causing any problems.

2005-11-13 07:47:22

by Paul Jackson

[permalink] [raw]
Subject: Re: [PATCH]: Cleanup of __alloc_pages

> Yes, the fact that GFP_ATOMIC also implies "use the emergency pool" is
> unfortunate, and perhaps the two should always have been separated out, at
> least to make the programmer think about whether the code really needs
> access to the emergency pools. Usually it does.

Ah - now it makes more sense.

The key invisible fact in the gfp.h line:

#define GFP_ATOMIC (__GFP_VALID | __GFP_HIGH)

is that __GFP_WAIT is *not* set (making it mean don't sleep).
All the other commonly used GFP_* flags do have __GFP_WAIT.

I have no issue with ATOMIC also meaning "use emergency pool".
That's an appropriate simplication, that fits the usage well.

I just had a mental block on the invisible unset __GFP_WAIT bit.

Would you look kindly on a patch that did:


--- 2.6.14-mm2.orig/include/linux/gfp.h 2005-11-12 23:36:57.258103418 -0800
+++ 2.6.14-mm2/include/linux/gfp.h 2005-11-12 23:42:35.287219455 -0800
@@ -58,6 +58,7 @@ struct vm_area_struct;
__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
__GFP_NOMEMALLOC|__GFP_HARDWALL)

+/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
#define GFP_ATOMIC (__GFP_VALID | __GFP_HIGH)
#define GFP_NOIO (__GFP_VALID | __GFP_WAIT)
#define GFP_NOFS (__GFP_VALID | __GFP_WAIT | __GFP_IO)


--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <[email protected]> 1.925.600.0401