2004-06-05 01:44:15

by Andi Kleen

Subject: [PATCH] Use numa policy API for boot time policy


Suggested by Manfred Spraul.

__get_free_pages had a hack to do node interleaving allocation at boot time.
This patch sets an interleave process policy using the NUMA API for init
and the idle threads instead. Before entering the user space init the policy
is reset to default again. Result is the same.

Advantage is less code and the removal of a check from a fast path.

Removes more code than it adds.

I verified that the memory distribution after boot is roughly the same.

diff -u linux-2.6.7rc2-work/include/linux/mempolicy.h-o linux-2.6.7rc2-work/include/linux/mempolicy.h
--- linux-2.6.7rc2-work/include/linux/mempolicy.h-o 2004-05-31 23:22:36.000000000 +0200
+++ linux-2.6.7rc2-work/include/linux/mempolicy.h 2004-06-05 00:40:54.000000000 +0200
@@ -153,6 +153,9 @@
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
unsigned long idx);

+extern void numa_default_policy(void);
+extern void numa_policy_init(void);
+
#else

struct mempolicy {};
@@ -215,6 +218,14 @@
#define vma_policy(vma) NULL
#define vma_set_policy(vma, pol) do {} while(0)

+static inline void numa_policy_init(void)
+{
+}
+
+static inline void numa_default_policy(void)
+{
+}
+
#endif /* CONFIG_NUMA */
#endif /* __KERNEL__ */

diff -u linux-2.6.7rc2-work/mm/page_alloc.c-o linux-2.6.7rc2-work/mm/page_alloc.c
--- linux-2.6.7rc2-work/mm/page_alloc.c-o 2004-05-31 23:22:37.000000000 +0200
+++ linux-2.6.7rc2-work/mm/page_alloc.c 2004-05-31 23:35:32.000000000 +0200
@@ -732,53 +732,12 @@

EXPORT_SYMBOL(__alloc_pages);

-#ifdef CONFIG_NUMA
-/* Early boot: Everything is done by one cpu, but the data structures will be
- * used by all cpus - spread them on all nodes.
- */
-static __init unsigned long get_boot_pages(unsigned int gfp_mask, unsigned int order)
-{
-static int nodenr;
- int i = nodenr;
- struct page *page;
-
- for (;;) {
- if (i > nodenr + numnodes)
- return 0;
- if (node_present_pages(i%numnodes)) {
- struct zone **z;
- /* The node contains memory. Check that there is
- * memory in the intended zonelist.
- */
- z = NODE_DATA(i%numnodes)->node_zonelists[gfp_mask & GFP_ZONEMASK].zones;
- while (*z) {
- if ( (*z)->free_pages > (1UL<<order))
- goto found_node;
- z++;
- }
- }
- i++;
- }
-found_node:
- nodenr = i+1;
- page = alloc_pages_node(i%numnodes, gfp_mask, order);
- if (!page)
- return 0;
- return (unsigned long) page_address(page);
-}
-#endif
-
/*
* Common helper functions.
*/
fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
struct page * page;
-
-#ifdef CONFIG_NUMA
- if (unlikely(system_state == SYSTEM_BOOTING))
- return get_boot_pages(gfp_mask, order);
-#endif
page = alloc_pages(gfp_mask, order);
if (!page)
return 0;
diff -u linux-2.6.7rc2-work/mm/mempolicy.c-o linux-2.6.7rc2-work/mm/mempolicy.c
--- linux-2.6.7rc2-work/mm/mempolicy.c-o 2004-05-31 23:22:55.000000000 +0200
+++ linux-2.6.7rc2-work/mm/mempolicy.c 2004-06-05 00:50:50.000000000 +0200
@@ -1001,7 +1001,8 @@
up(&p->sem);
}

-static __init int numa_policy_init(void)
+/* assumes fs == KERNEL_DS */
+void __init numa_policy_init(void)
{
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
@@ -1010,6 +1011,17 @@
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
0, SLAB_PANIC, NULL, NULL);
- return 0;
+
+ /* Set interleaving policy for system init. This way not all
+ the data structures allocated at system boot end up in node zero. */
+
+ if (sys_set_mempolicy(MPOL_INTERLEAVE, &node_online_map, MAX_NUMNODES) < 0)
+ printk("numa_policy_init: interleaving failed\n");
+}
+
+/* Reset policy of current process to default.
+ * Assumes fs == KERNEL_DS */
+void __init numa_default_policy(void)
+{
+ sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
}
-module_init(numa_policy_init);
diff -u linux-2.6.7rc2-work/init/main.c-o linux-2.6.7rc2-work/init/main.c
--- linux-2.6.7rc2-work/init/main.c-o 2004-05-31 23:22:55.000000000 +0200
+++ linux-2.6.7rc2-work/init/main.c 2004-06-02 03:45:14.000000000 +0200
@@ -43,6 +43,7 @@
#include <linux/efi.h>
#include <linux/unistd.h>
#include <linux/rmap.h>
+#include <linux/mempolicy.h>

#include <asm/io.h>
#include <asm/bugs.h>
@@ -385,6 +386,7 @@
static void noinline rest_init(void)
{
kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);
+ numa_default_policy();
unlock_kernel();
cpu_idle();
}
@@ -456,6 +458,7 @@
#endif
mem_init();
kmem_cache_init();
+ numa_policy_init();
if (late_time_init)
late_time_init();
calibrate_delay();
@@ -645,6 +648,7 @@
free_initmem();
unlock_kernel();
system_state = SYSTEM_RUNNING;
+ numa_default_policy();

if (sys_open("/dev/console", O_RDWR, 0) < 0)
printk("Warning: unable to open an initial console.\n");


2004-06-05 01:57:13

by Manfred Spraul

Subject: Re: [PATCH] Use numa policy API for boot time policy

Andi Kleen wrote:

>Suggested by Manfred Spraul.
>
>__get_free_pages had a hack to do node interleaving allocation at boot time.
>This patch sets an interleave process policy using the NUMA API for init
>and the idle threads instead. Before entering the user space init the policy
>is reset to default again. Result is the same.
>
>Advantage is less code and the removal of a check from a fast path.
>
>Removes more code than it adds.
>
>I verified that the memory distribution after boot is roughly the same.
>
>
>
Does it work for order != 0 allocations? It's important that the big
hash tables do not end up all in node 0. AFAICS alloc_pages_current()
calls interleave_nodes() only for order==0 allocs.

--
Manfred
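
(For reference, the check Manfred is pointing at lives in alloc_pages_current();
roughly, simplified from 2.6.7-era mm/mempolicy.c, task policy interleaving
is applied one page at a time:)

struct page *alloc_pages_current(unsigned gfp, unsigned order)
{
        struct mempolicy *pol = current->mempolicy;

        /* No task policy, or in interrupt context: use the system default. */
        if (!pol || in_interrupt())
                pol = &default_policy;
        /* Interleaving only covers single-page allocations ... */
        if (pol->policy == MPOL_INTERLEAVE && order == 0)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));
        /* ... order > 0 falls through to the normal, non-interleaved path. */
        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
}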

2004-06-05 02:18:21

by Andi Kleen

Subject: Re: [PATCH] Use numa policy API for boot time policy

On Sat, 05 Jun 2004 03:56:53 +0200
Manfred Spraul <[email protected]> wrote:

> Andi Kleen wrote:
>
> >Suggested by Manfred Spraul.
> >
> >__get_free_pages had a hack to do node interleaving allocation at boot time.
> >This patch sets an interleave process policy using the NUMA API for init
> >and the idle threads instead. Before entering the user space init the policy
> >is reset to default again. Result is the same.
> >
> >Advantage is less code and the removal of a check from a fast path.
> >
> >Removes more code than it adds.
> >
> >I verified that the memory distribution after boot is roughly the same.
> >
> >
> >
> Does it work for order != 0 allocations? It's important that the big
> hash tables do not end up all in node 0. AFAICS alloc_pages_current()
> calls interleave_nodes() only for order==0 allocs.

That's correct. It will only work for order 0 allocations.

But it sounds quite bogus anyway to move the complete hash tables
to another node. It would probably be better to use vmalloc()
and an interleaved mapping for it. Then you would get the NUMA bandwidth
benefit even when accessing a single table.

-Andi

2004-06-05 02:34:38

by Anton Blanchard

Subject: Re: [PATCH] Use numa policy API for boot time policy


Hi,

> That's correct. It will only work for order 0 allocations.
>
> But it sounds quite bogus anyway to move the complete hash tables
> to another node. It would probably be better to use vmalloc()
> and an interleaved mapping for it. Then you would get the NUMA bandwidth
> benefit even when accessing a single table.

I posted some before and after numbers when we merged Manfred's patch;
it would be interesting to see the same thing with your patch applied.

I'm not only worried about NUMA bandwidth but also about keeping the
amount of memory left in all the nodes reasonably even. Allocating all
the big hashes on node 0 will upset that balance.

Anton

2004-06-05 10:21:07

by Manfred Spraul

Subject: Re: [PATCH] Use numa policy API for boot time policy

Andi Kleen wrote:

>On Sat, 05 Jun 2004 03:56:53 +0200
>Manfred Spraul <[email protected]> wrote:
>
>
>>Does it work for order != 0 allocations? It's important that the big
>>hash tables do not end up all in node 0. AFAICS alloc_pages_current()
>>calls interleave_nodes() only for order==0 allocs.
>>
>>
>
>That's correct. It will only work for order 0 allocations.
>
>
>
What's the purpose of the "&& order == 0" test for MPOL_INTERLEAVE in
alloc_pages_current?
What would break if it's removed?

And what about in_interrupt() allocations? During boot everything should
be interleaved - I'd modify default_policy to MPOL_INTERLEAVE instead of
setting process affinity.

--
Manfred
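
(For context: default_policy is a static struct mempolicy in mm/mempolicy.c
that covers tasks without a policy as well as in_interrupt() allocations.
A purely hypothetical sketch of Manfred's suggestion, not code from any
tree; the nodemask handling in particular is an assumption:)

/* Hypothetical: widen the shared default policy during boot so even
 * allocations without a task policy get interleaved, then restore it
 * before user space starts. */
void __init numa_policy_init(void)
{
        /* ... kmem_cache_create() calls as in Andi's patch ... */
        default_policy.policy = MPOL_INTERLEAVE;
        bitmap_copy(default_policy.v.nodes, &node_online_map, MAX_NUMNODES);
}

void __init numa_default_policy(void)
{
        default_policy.policy = MPOL_DEFAULT;
}

(Because default_policy is one shared object consulted even from interrupt
context, flipping it at run time changes every allocation in the system at
once.)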

2004-06-05 10:24:55

by Andi Kleen

Subject: Re: [PATCH] Use numa policy API for boot time policy

On Sat, 5 Jun 2004 12:32:12 +1000
Anton Blanchard <[email protected]> wrote:

>
> Hi,
>
> > That's correct. It will only work for order 0 allocations.
> >
> > But it sounds quite bogus anyway to move the complete hash tables
> > to another node. It would probably be better to use vmalloc()
> > and an interleaved mapping for it. Then you would get the NUMA bandwidth
> > benefit even when accessing a single table.
>
> I posted some before and after numbers when we merged Manfred's patch;
> it would be interesting to see the same thing with your patch applied.
>
> I'm not only worried about NUMA bandwidth but also about keeping the
> amount of memory left in all the nodes reasonably even. Allocating all
> the big hashes on node 0 will upset that balance.

It would be a one-liner change to allow process policy interleaving
for orders > 0 in mempolicy. But I'm not sure how useful it is, since
the granularity would be really bad.

Have you ever tried implementing a vmalloc_interleave() for these
tables instead? My bet is that it will perform better.

-Andi
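
(The one-liner Andi mentions would presumably just drop the order
restriction in alloc_pages_current(); a sketch, not a tested patch:)

-       if (pol->policy == MPOL_INTERLEAVE && order == 0)
+       if (pol->policy == MPOL_INTERLEAVE)
                return alloc_page_interleave(gfp, order, interleave_nodes(pol));

(Each allocation would still land on a single node as one contiguous
2^order block, advancing the round-robin node once per call; that is the
coarse granularity he is referring to.)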

2004-06-05 10:35:33

by Andi Kleen

Subject: Re: [PATCH] Use numa policy API for boot time policy

On Sat, 05 Jun 2004 12:20:53 +0200
Manfred Spraul <[email protected]> wrote:

> Andi Kleen wrote:
>
> >On Sat, 05 Jun 2004 03:56:53 +0200
> >Manfred Spraul <[email protected]> wrote:
> >
> >
> >>Does it work for order != 0 allocations? It's important that the big
> >>hash tables do not end up all in node 0. AFAICS alloc_pages_current()
> >>calls interleave_nodes() only for order==0 allocs.
> >>
> >>
> >
> >That's correct. It will only work for order 0 allocations.
> >
> >
> >
> What's the purpose of the "&& order == 0" test for MPOL_INTERLEAVE in
> alloc_pages_current?
> What would break if it's removed?

Nothing. The interleaving just will not be very good.
Only the vma interleaving relies on order 0 right now.

But I would really try to use vmalloc() for this. In fact you don't
even need vmalloc_interleaved(), because the normal vmalloc allocation
together with the interleave policy should do the right thing.

>
> And what about in_interrupt() allocations? During boot everything should
> be interleaved - I'd modify default_policy to MPOL_INTERLEAVE instead of
> setting process affinity.

Better not to do that. It may break some subtle assumptions.

I guess the in_interrupt() allocations will have to live with that.
They should be relatively rare.

In theory you could add a system_state == SYSTEM_BOOTING check again,
but polluting the fast path for this would IMHO be overkill.

-Andi

2004-06-05 10:50:02

by Andi Kleen

Subject: Re: [PATCH] Use numa policy API for boot time policy

On Sat, 5 Jun 2004 12:22:39 +0200
Andi Kleen <[email protected]> wrote:


> Have you ever tried to switch to implement a vmalloc_interleave() for these
> tables instead? My bet is that it will perform better.

Actually vmalloc_interleaved() is not needed. With a process interleaving
policy an ordinary vmalloc() should do that already.

-Andi
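
(Why this works: __vmalloc() builds the mapping from individual order-0
pages, so under a task-wide MPOL_INTERLEAVE policy each page round-robins
across nodes. Simplified from 2.6.7-era mm/vmalloc.c:)

        for (i = 0; i < area->nr_pages; i++) {
                /* One order-0 page per iteration; each call goes through
                 * alloc_pages_current() and thus the interleave policy. */
                area->pages[i] = alloc_page(gfp_mask);
                if (area->pages[i] == NULL)
                        goto fail;
        }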

2004-06-09 15:48:02

by Anton Blanchard

Subject: Re: [PATCH] Use numa policy API for boot time policy


> It would be a one-liner change to allow process policy interleaving
> for orders > 0 in mempolicy. But I'm not sure how useful it is, since
> the granularity would be really bad.

OK. I'd like to take a quick look at order > 0 allocations during boot
to see if it's worth it. The ppc64 page size is small and we might be
doing a significant number of order 1 allocations.

> Have you ever tried implementing a vmalloc_interleave() for these
> tables instead? My bet is that it will perform better.

I'm warming to this idea. We would need a per-arch override, since there
is a trade-off here between interleaving and TLB usage.

We also have a problem in 2.6 on our bigger machines where our dcache
hash and inode hash are limited to MAX_ORDER (16MB on ppc64). Using
vmalloc would allow us to interleave the memory and allocate more
than 16MB for those hashes.

Anton
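
(A sketch of the kind of fallback Anton describes; the helper name and the
GFP choice are illustrative only, not from any tree:)

/* Illustrative: use the buddy allocator while the table fits in a single
 * MAX_ORDER block, otherwise fall back to vmalloc(), which has no
 * MAX_ORDER limit and whose order-0 backing pages are interleaved under
 * the boot-time policy. */
static void * __init alloc_large_hash(unsigned long size)
{
        if (get_order(size) < MAX_ORDER)
                return (void *)__get_free_pages(GFP_KERNEL, get_order(size));
        return vmalloc(size);
}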

2004-06-09 15:58:20

by Andi Kleen

Subject: Re: [PATCH] Use numa policy API for boot time policy

On Thu, 10 Jun 2004 01:44:29 +1000
Anton Blanchard <[email protected]> wrote:

>
> > It would be a one-liner change to allow process policy interleaving
> > for orders > 0 in mempolicy. But I'm not sure how useful it is, since
> > the granularity would be really bad.
>
> OK. I'd like to take a quick look at order > 0 allocations during boot
> to see if it's worth it. The ppc64 page size is small and we might be
> doing a significant number of order 1 allocations.

For what?

> > Have you ever tried implementing a vmalloc_interleave() for these
> > tables instead? My bet is that it will perform better.
>
> Im warming to this idea. We would need a per arch override, since there
> is a trade off here between interleaving and TLB usage.

Actually just standard vmalloc is enough. The interleave policy in alloc_pages
will transparently interleave the order 0 pages allocated by vmalloc.

When I find some time I will try that on Opteron too.

>
> We also have a problem in 2.6 on our bigger machines where our dcache
> hash and inode hash cache are limited to MAX_ORDER (16MB on ppc64). By
> using vmalloc would allow us to interleave the memory and allocate more
> than 16MB for those hashes.

IMHO a 16MB hash table for a kernel structure is madness. A different data
structure is probably needed if it's really a problem
(is your dcache that big?). Or maybe just limit the dcache more aggressively
to keep the max number of entries smaller.

-Andi

2004-06-09 16:13:30

by Anton Blanchard

Subject: Re: [PATCH] Use numa policy API for boot time policy


> For what?

No idea; I just want to convince myself that there aren't any out there.

> IMHO 16MB hash table for a kernel structure is madness. A different data
> structure is probably needed if it's really a problem
> (is your dcache that big?). Or maybe just limit the dcache more aggressively
> to keep the max number of entries smaller.

Yep, specSFS (an NFS benchmark) shows this up quite badly. I think Jose
and Martin were looking at strategies for keeping the dcache under
control.

This was on a machine with only 64GB of RAM; if we had an NFS server
with more memory then it's reasonable to want more memory dedicated to
dentries. At that point we either need to increase the hash size or look
at using another structure.

Anton