2007-05-17 02:39:20

by Zou, Nanhai

[permalink] [raw]
Subject: [Patch] Allocate sparsemem memmap above 4G on X86_64

On systems with a huge amount of physical memory, the VFS cache and the
memmap may eat all available system memory under 4G, and the system may
then fail to allocate the swiotlb bounce buffer.

There was a fix in arch/x86_64/mm/numa.c, but that fix does not cover
the sparsemem model.
This patch adds the fix to the sparsemem model.

Signed-off-by: Zou Nan hai <[email protected]>
Acked-by: Siddha, Suresh <[email protected]>
---
include/asm-x86_64/mmzone.h | 5 +++++
include/linux/bootmem.h | 3 +++
mm/sparse.c | 5 +++++
3 files changed, 13 insertions(+)

diff -Nraup a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
--- a/include/asm-x86_64/mmzone.h 2007-05-17 09:38:02.000000000 +0800
+++ b/include/asm-x86_64/mmzone.h 2007-05-17 09:54:10.000000000 +0800
@@ -52,5 +52,10 @@ extern int pfn_valid(unsigned long pfn);
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1uL))
#endif

+#define ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE 1
+#define alloc_bootmem_high_node(pgdat,size) \
+({__alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);})
+
+
#endif
#endif
diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
--- a/include/linux/bootmem.h 2007-05-17 09:38:02.000000000 +0800
+++ b/include/linux/bootmem.h 2007-05-17 09:37:00.000000000 +0800
@@ -131,5 +131,8 @@ extern void *alloc_large_system_hash(con
#endif
extern int hashdist; /* Distribute hashes across NUMA nodes? */

+#ifndef ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE
+#define alloc_bootmem_high_node(pgdat, size) ({NULL;})
+#endif

#endif /* _LINUX_BOOTMEM_H */
diff -Nraup a/mm/sparse.c b/mm/sparse.c
--- a/mm/sparse.c 2007-05-17 09:38:03.000000000 +0800
+++ b/mm/sparse.c 2007-05-17 09:54:27.000000000 +0800
@@ -219,6 +219,11 @@ static struct page __init *sparse_early_
if (map)
return map;

+ map = alloc_bootmem_high_node(NODE_DATA(nid),
+ sizeof(struct page) * PAGES_PER_SECTION);
+ if (map)
+ return map;
+
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)





2007-05-17 19:34:52

by Andrew Morton

[permalink] [raw]
Subject: Re: [Patch] Allocate sparsemem memmap above 4G on X86_64

On 17 May 2007 10:40:07 +0800
Zou Nan hai <[email protected]> wrote:

> On systems with a huge amount of physical memory, the VFS cache and the
> memmap may eat all available system memory under 4G, and the system may
> then fail to allocate the swiotlb bounce buffer.
>
> There was a fix in arch/x86_64/mm/numa.c, but that fix does not cover
> the sparsemem model.
> This patch adds the fix to the sparsemem model.
>

This seems like something we need in 2.6.22, but this implementation is a
bit ugly-looking.

>
> diff -Nraup a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
> --- a/include/asm-x86_64/mmzone.h 2007-05-17 09:38:02.000000000 +0800
> +++ b/include/asm-x86_64/mmzone.h 2007-05-17 09:54:10.000000000 +0800
> @@ -52,5 +52,10 @@ extern int pfn_valid(unsigned long pfn);
> #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1uL))
> #endif
>
> +#define ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE 1
> +#define alloc_bootmem_high_node(pgdat,size) \
> +({__alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);})
> +
> +
> #endif
> #endif
> diff -Nraup a/include/linux/bootmem.h b/include/linux/bootmem.h
> --- a/include/linux/bootmem.h 2007-05-17 09:38:02.000000000 +0800
> +++ b/include/linux/bootmem.h 2007-05-17 09:37:00.000000000 +0800
> @@ -131,5 +131,8 @@ extern void *alloc_large_system_hash(con
> #endif
> extern int hashdist; /* Distribute hashes across NUMA nodes? */
>
> +#ifndef ARCH_HAS_ALLOC_BOOTMEM_HIGH_NODE
> +#define alloc_bootmem_high_node(pgdat, size) ({NULL;})
> +#endif
>
> #endif /* _LINUX_BOOTMEM_H */
> diff -Nraup a/mm/sparse.c b/mm/sparse.c
> --- a/mm/sparse.c 2007-05-17 09:38:03.000000000 +0800
> +++ b/mm/sparse.c 2007-05-17 09:54:27.000000000 +0800
> @@ -219,6 +219,11 @@ static struct page __init *sparse_early_
> if (map)
> return map;
>
> + map = alloc_bootmem_high_node(NODE_DATA(nid),
> + sizeof(struct page) * PAGES_PER_SECTION);
> + if (map)
> + return map;
> +

Please use tabs, not spaces

> map = alloc_bootmem_node(NODE_DATA(nid),
> sizeof(struct page) * PAGES_PER_SECTION);
> if (map)
>

Please always prefer to use static inline functions rather than macros.
They are more readable, they are more likely to have comments attached to
them and they provide typechecking.

Please prefer to uninline functions by default. One reason for this is
that adding inlines to headers increases include complexity. This code is
all __init anyway, so the possible few bytes of text will get removed.


Try to avoid using the ARCH_HAS_FOO thing. We have two alternatives:

a) use __attribute__((weak))

b) do:

extern void foo(void);
#define foo foo

then, elsewhere,

#ifndef foo
#define foo() bar()
#endif

Both tricks avoid the introduction of two new symbols into the global
namespace to solve a single problem.

2007-05-18 02:52:17

by Zou, Nanhai

[permalink] [raw]
Subject: Re: [Patch] Allocate sparsemem memmap above 4G on X86_64

On Fri, 2007-05-18 at 03:32, Andrew Morton wrote:
> On 17 May 2007 10:40:07 +0800
> Zou Nan hai <[email protected]> wrote:
>
> >
> Please always prefer to use static inline functions rather than macros.
> They are more readable, they are more likely to have comments attached to
> them and they provide typechecking.
>
> Please prefer to uninline functions by default. One reason for this is
> that adding inlines to headers increases include complexity. This code is
> all __init anyway, so the possible few bytes of text will get removed.
>
>
> Try to avoid using the ARCH_HAS_FOO thing. We have two alternatives:
>
> a) use __attribute__((weak))
>
> b) do:
>
> extern void foo(void);
> #define foo foo
>
> then, elsewhere,
>
> #ifndef foo
> #define foo() bar()
> #endif
>
> Both tricks avoid the introduction of two new symbols into the global
> namespace to solve a single problem.
On systems with a huge amount of physical memory, the VFS cache and the
memmap may eat all available system memory under 4G, and the system may
then fail to allocate the swiotlb bounce buffer.
There was a fix for this issue in arch/x86_64/mm/numa.c, but that fix
does not cover the sparsemem model.
This patch adds the fix to the sparsemem model by first trying to
allocate the memmap above 4G.

Signed-off-by: Zou Nan hai <[email protected]>
Acked-by: Suresh Siddha <[email protected]>
---
arch/x86_64/mm/init.c | 6 ++++++
mm/sparse.c | 11 +++++++++++
2 files changed, 17 insertions(+)

diff -Nraup a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
--- a/arch/x86_64/mm/init.c 2007-05-19 16:54:46.000000000 +0800
+++ b/arch/x86_64/mm/init.c 2007-05-19 17:43:47.000000000 +0800
@@ -761,3 +761,9 @@ int in_gate_area_no_task(unsigned long a
{
return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
+
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+ return __alloc_bootmem_core(pgdat->bdata, size,
+ SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+}
diff -Nraup a/mm/sparse.c b/mm/sparse.c
--- a/mm/sparse.c 2007-05-19 16:54:48.000000000 +0800
+++ b/mm/sparse.c 2007-05-19 17:44:01.000000000 +0800
@@ -209,6 +209,12 @@ static int __meminit sparse_init_one_sec
return 1;
}

+__attribute__((weak))
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+ return NULL;
+}
+
static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
@@ -219,6 +225,11 @@ static struct page __init *sparse_early_
if (map)
return map;

+ map = alloc_bootmem_high_node(NODE_DATA(nid),
+ sizeof(struct page) * PAGES_PER_SECTION);
+ if (map)
+ return map;
+
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)

2007-05-18 03:24:49

by Andrew Morton

[permalink] [raw]
Subject: Re: [Patch] Allocate sparsemem memmap above 4G on X86_64

On 18 May 2007 10:52:57 +0800 Zou Nan hai <[email protected]> wrote:

> On Fri, 2007-05-18 at 03:32, Andrew Morton wrote:
> > On 17 May 2007 10:40:07 +0800
> > Zou Nan hai <[email protected]> wrote:
> >
> > >
> > Please always prefer to use static inline functions rather than macros.
> > They are more readable, they are more likely to have comments attached to
> > them and they provide typechecking.
> >
> > Please prefer to uninline functions by default. One reason for this is
> > that adding inlines to headers increases include complexity. This code is
> > all __init anyway, so the possible few bytes of text will get removed.
> >
> >
> > Try to avoid using the ARCH_HAS_FOO thing. We have two alternatives:
> >
> > a) use __attribute__((weak))
> >
> > b) do:
> >
> > extern void foo(void);
> > #define foo foo
> >
> > then, elsewhere,
> >
> > #ifndef foo
> > #define foo() bar()
> > #endif
> >
> > Both tricks avoid the introduction of two new symbols into the global
> > namespace to solve a single problem.
> On systems with a huge amount of physical memory, the VFS cache and the
> memmap may eat all available system memory under 4G, and the system may
> then fail to allocate the swiotlb bounce buffer.
> There was a fix for this issue in arch/x86_64/mm/numa.c, but that fix
> does not cover the sparsemem model.
> This patch adds the fix to the sparsemem model by first trying to
> allocate the memmap above 4G.
>
> Signed-off-by: Zou Nan hai <[email protected]>
> Acked-by: Suresh Siddha <[email protected]>
> ---
> arch/x86_64/mm/init.c | 6 ++++++
> mm/sparse.c | 11 +++++++++++
> 2 files changed, 17 insertions(+)
>
> diff -Nraup a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
> --- a/arch/x86_64/mm/init.c 2007-05-19 16:54:46.000000000 +0800
> +++ b/arch/x86_64/mm/init.c 2007-05-19 17:43:47.000000000 +0800
> @@ -761,3 +761,9 @@ int in_gate_area_no_task(unsigned long a
> {
> return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
> }
> +
> +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
> +{
> + return __alloc_bootmem_core(pgdat->bdata, size,
> + SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
> +}
> diff -Nraup a/mm/sparse.c b/mm/sparse.c
> --- a/mm/sparse.c 2007-05-19 16:54:48.000000000 +0800
> +++ b/mm/sparse.c 2007-05-19 17:44:01.000000000 +0800
> @@ -209,6 +209,12 @@ static int __meminit sparse_init_one_sec
> return 1;
> }
>
> +__attribute__((weak))
> +void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
> +{
> + return NULL;
> +}
> +
> static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
> {
> struct page *map;
> @@ -219,6 +225,11 @@ static struct page __init *sparse_early_
> if (map)
> return map;
>
> + map = alloc_bootmem_high_node(NODE_DATA(nid),
> + sizeof(struct page) * PAGES_PER_SECTION);
> + if (map)
> + return map;
> +
> map = alloc_bootmem_node(NODE_DATA(nid),
> sizeof(struct page) * PAGES_PER_SECTION);
> if (map)

Fair enough. But we should ensure that there's a prototype in a header
file which is included by both definition sites and by all callers. So the
compiler checks that everything is consistent:

--- a/include/linux/bootmem.h~x86_64-allocate-sparsemem-memmap-above-4g-fix
+++ a/include/linux/bootmem.h
@@ -59,6 +59,7 @@ extern void *__alloc_bootmem_core(struct
unsigned long align,
unsigned long goal,
unsigned long limit);
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size);

#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
extern void reserve_bootmem(unsigned long addr, unsigned long size);
_


Andi, does this all look OK for 2.6.22 and for 2.6.21?