2013-03-06 21:50:26

by Cliff Wickman

[permalink] [raw]
Subject: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

From: Cliff Wickman <[email protected]>

Allocating a large number of 1GB hugetlbfs pages at boot takes a
very long time.

Large system sites would at times like to allocate a very large amount of
memory as 1GB pages. They would put this on the kernel boot line:
default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
start_kernel
kernel_init
do_pre_smp_initcalls
hugetlb_init
hugetlb_init_hstates
hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory proceeds at ~1GB/sec (and most memory is offnode
on large numa systems).
This estimate is approximate (it depends on core frequency & number of hops
to remote memory) but should be within a factor of 2 on most systems.
A benchmark attempting to reserve a TB for 1GB pages would thus require
~1000 seconds of boot time just for this allocation. 32TB would take 8 hours.

I propose passing a flag to the early allocator to indicate that no zeroing
of a page should be done. The 'no zeroing' flag would have to be passed
down this code path:

hugetlb_hstate_alloc_pages
alloc_bootmem_huge_page
__alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c)
__alloc_memory_core_early NO_ZERO
if (!(flags & NO_ZERO))
memset(ptr, 0, size);

Or this path if CONFIG_NO_BOOTMEM is not set:

hugetlb_hstate_alloc_pages
alloc_bootmem_huge_page
__alloc_bootmem_node_nopanic NO_ZERO (bootmem.c)
alloc_bootmem_core NO_ZERO
if (!(flags & NO_ZERO))
memset(region, 0, size);
__alloc_bootmem_nopanic NO_ZERO
___alloc_bootmem_nopanic NO_ZERO
alloc_bootmem_core NO_ZERO
if (!(flags & NO_ZERO))
memset(region, 0, size);

Signed-off-by: Cliff Wickman <[email protected]>

---
arch/x86/kernel/setup_percpu.c | 4 ++--
include/linux/bootmem.h | 23 ++++++++++++++++-------
mm/bootmem.c | 12 +++++++-----
mm/hugetlb.c | 3 ++-
mm/nobootmem.c | 41 +++++++++++++++++++++++------------------
mm/page_cgroup.c | 2 +-
mm/sparse.c | 2 +-
7 files changed, 52 insertions(+), 35 deletions(-)

Index: linux/include/linux/bootmem.h
===================================================================
--- linux.orig/include/linux/bootmem.h
+++ linux/include/linux/bootmem.h
@@ -8,6 +8,11 @@
#include <asm/dma.h>

/*
+ * allocation flags
+ */
+#define NO_ZERO 0x00000001
+
+/*
* simple boot-time physical memory area allocator.
*/

@@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
unsigned long goal);
extern void *__alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal,
+ u32 flags);
extern void *__alloc_bootmem_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
@@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
- unsigned long goal);
+ unsigned long goal,
+ u32 flags);
void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ u32 flags);
extern void *__alloc_bootmem_low(unsigned long size,
unsigned long align,
unsigned long goal);
@@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
#define alloc_bootmem_align(x, align) \
__alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_nopanic(x) \
- __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+ __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
#define alloc_bootmem_pages(x) \
__alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_pages_nopanic(x) \
- __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+ __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
#define alloc_bootmem_node(pgdat, x) \
__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_node_nopanic(pgdat, x) \
- __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
+ __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
+ BOOTMEM_LOW_LIMIT, 0)
#define alloc_bootmem_pages_node(pgdat, x) \
__alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
- __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
+ __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)

#define alloc_bootmem_low(x) \
__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
Index: linux/arch/x86/kernel/setup_percpu.c
===================================================================
--- linux.orig/arch/x86/kernel/setup_percpu.c
+++ linux/arch/x86/kernel/setup_percpu.c
@@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
void *ptr;

if (!node_online(node) || !NODE_DATA(node)) {
- ptr = __alloc_bootmem_nopanic(size, align, goal);
+ ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
pr_info("cpu %d has no node %d or node-local memory\n",
cpu, node);
pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
cpu, size, __pa(ptr));
} else {
ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
- size, align, goal);
+ size, align, goal, 0);
pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
cpu, size, node, __pa(ptr));
}
Index: linux/mm/nobootmem.c
===================================================================
--- linux.orig/mm/nobootmem.c
+++ linux/mm/nobootmem.c
@@ -33,7 +33,7 @@ unsigned long min_low_pfn;
unsigned long max_pfn;

static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit)
+ u64 goal, u64 limit, u32 flags)
{
void *ptr;
u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
return NULL;

ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
+ if (!(flags & NO_ZERO))
+ memset(ptr, 0, size);
memblock_reserve(addr, size);
/*
* The min_count is set to 0 so that bootmem allocated blocks
@@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
static void * __init ___alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit)
+ unsigned long limit,
+ u32 flags)
{
void *ptr;

@@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no

restart:

- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
+ ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
+ limit, 0);

if (ptr)
return ptr;
@@ -244,17 +247,17 @@ restart:
* Returns NULL on failure.
*/
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
- unsigned long goal)
+ unsigned long goal, u32 flags)
{
unsigned long limit = -1UL;

- return ___alloc_bootmem_nopanic(size, align, goal, limit);
+ return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
}

static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
+ unsigned long goal, unsigned long limit, u32 flags)
{
- void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
+ void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);

if (mem)
return mem;
@@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
{
unsigned long limit = -1UL;

- return ___alloc_bootmem(size, align, goal, limit);
+ return ___alloc_bootmem(size, align, goal, limit, 0);
}

void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit)
+ unsigned long limit,
+ u32 flags)
{
void *ptr;

again:
ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, limit);
+ goal, limit, flags);
if (ptr)
return ptr;

ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, limit);
+ goal, limit, flags);
if (ptr)
return ptr;

@@ -315,12 +319,13 @@ again:
}

void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
+ unsigned long align, unsigned long goal, u32 flags)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);

- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
+ 0, flags);
}

void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
{
void *ptr;

- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
if (ptr)
return ptr;

@@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
* The function panics if the request can not be satisfied.
*/
void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
+ unsigned long align, unsigned long goal)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
@@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
unsigned long goal)
{
- return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
+ return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
}

void * __init __alloc_bootmem_low_nopanic(unsigned long size,
@@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
unsigned long goal)
{
return ___alloc_bootmem_nopanic(size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
+ ARCH_LOW_ADDRESS_LIMIT, 0);
}

/**
Index: linux/mm/sparse.c
===================================================================
--- linux.orig/mm/sparse.c
+++ linux/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
- SMP_CACHE_BYTES, goal, limit);
+ SMP_CACHE_BYTES, goal, limit, 0);
if (!p && limit) {
limit = 0;
goto again;
Index: linux/mm/hugetlb.c
===================================================================
--- linux.orig/mm/hugetlb.c
+++ linux/mm/hugetlb.c
@@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
addr = __alloc_bootmem_node_nopanic(
NODE_DATA(hstate_next_node_to_alloc(h,
&node_states[N_MEMORY])),
- huge_page_size(h), huge_page_size(h), 0);
+ huge_page_size(h), huge_page_size(h),
+ 0, NO_ZERO);

if (addr) {
/*
Index: linux/mm/bootmem.c
===================================================================
--- linux.orig/mm/bootmem.c
+++ linux/mm/bootmem.c
@@ -660,7 +660,7 @@ restart:
* Returns NULL on failure.
*/
void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
- unsigned long goal)
+ unsigned long goal, u32 flags)
{
unsigned long limit = 0;

@@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l

void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
+ unsigned long goal, unsigned long limit,
+ u32 flags)
{
void *ptr;

@@ -734,12 +735,13 @@ again:
}

void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
+ unsigned long align, unsigned long goal, u32 flags)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);

- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
+ 0, flags);
}

void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
{
void *ptr;

- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
if (ptr)
return ptr;

Index: linux/mm/page_cgroup.c
===================================================================
--- linux.orig/mm/page_cgroup.c
+++ linux/mm/page_cgroup.c
@@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
table_size = sizeof(struct page_cgroup) * nr_pages;

base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
- table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
if (!base)
return -ENOMEM;
NODE_DATA(nid)->node_page_cgroup = base;


2013-03-10 05:55:14

by Hillf Danton

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman <[email protected]> wrote:
> From: Cliff Wickman <[email protected]>
>
> Allocating a large number of 1GB hugetlbfs pages at boot takes a
> very long time.
>
> Large system sites would at times like to allocate a very large amount of
> memory as 1GB pages. They would put this on the kernel boot line:
> default_hugepagesz=1G hugepagesz=1G hugepages=4096
> [Dynamic allocation of 1G pages is not an option, as zone pages only go
> up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>
> Each page is zeroed as it is allocated, and all allocation is done by
> cpu 0, as this path is early in boot:
> start_kernel
> kernel_init
> do_pre_smp_initcalls
> hugetlb_init
> hugetlb_init_hstates
> hugetlb_hstate_alloc_pages
>
> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> on large numa systems).
> This estimate is approximate (it depends on core frequency & number of hops
> to remote memory) but should be within a factor of 2 on most systems.
> A benchmark attempting to reserve a TB for 1GB pages would thus require
> ~1000 seconds of boot time just for this allocating. 32TB would take 8 hours.
>
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done. The 'no zeroing' flag would have to be passed
> down this code path:
>

FYI: huge pages are cleared just after allocated, for instance,
clear_huge_page() in hugetlb_no_page()

Hillf
> hugetlb_hstate_alloc_pages
> alloc_bootmem_huge_page
> __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c)
> __alloc_memory_core_early NO_ZERO
> if (!(flags & NO_ZERO))
> memset(ptr, 0, size);
>
> Or this path if CONFIG_NO_BOOTMEM is not set:
>
> hugetlb_hstate_alloc_pages
> alloc_bootmem_huge_page
> __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c)
> alloc_bootmem_core NO_ZERO
> if (!(flags & NO_ZERO))
> memset(region, 0, size);
> __alloc_bootmem_nopanic NO_ZERO
> ___alloc_bootmem_nopanic NO_ZERO
> alloc_bootmem_core NO_ZERO
> if (!(flags & NO_ZERO))
> memset(region, 0, size);
>
> Signed-off-by: Cliff Wickman <[email protected]>
>
> ---
> arch/x86/kernel/setup_percpu.c | 4 ++--
> include/linux/bootmem.h | 23 ++++++++++++++++-------
> mm/bootmem.c | 12 +++++++-----
> mm/hugetlb.c | 3 ++-
> mm/nobootmem.c | 41 +++++++++++++++++++++++------------------
> mm/page_cgroup.c | 2 +-
> mm/sparse.c | 2 +-
> 7 files changed, 52 insertions(+), 35 deletions(-)
>
> Index: linux/include/linux/bootmem.h
> ===================================================================
> --- linux.orig/include/linux/bootmem.h
> +++ linux/include/linux/bootmem.h
> @@ -8,6 +8,11 @@
> #include <asm/dma.h>
>
> /*
> + * allocation flags
> + */
> +#define NO_ZERO 0x00000001
> +
> +/*
> * simple boot-time physical memory area allocator.
> */
>
> @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
> unsigned long goal);
> extern void *__alloc_bootmem_nopanic(unsigned long size,
> unsigned long align,
> - unsigned long goal);
> + unsigned long goal,
> + u32 flags);
> extern void *__alloc_bootmem_node(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
> extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> - unsigned long goal);
> + unsigned long goal,
> + u32 flags);
> void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit);
> + unsigned long limit,
> + u32 flags);
> extern void *__alloc_bootmem_low(unsigned long size,
> unsigned long align,
> unsigned long goal);
> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
> #define alloc_bootmem_align(x, align) \
> __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_nopanic(x) \
> - __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
> #define alloc_bootmem_pages(x) \
> __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_pages_nopanic(x) \
> - __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> #define alloc_bootmem_node(pgdat, x) \
> __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_node_nopanic(pgdat, x) \
> - __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> + BOOTMEM_LOW_LIMIT, 0)
> #define alloc_bootmem_pages_node(pgdat, x) \
> __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> - __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>
> #define alloc_bootmem_low(x) \
> __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> Index: linux/arch/x86/kernel/setup_percpu.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_percpu.c
> +++ linux/arch/x86/kernel/setup_percpu.c
> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
> void *ptr;
>
> if (!node_online(node) || !NODE_DATA(node)) {
> - ptr = __alloc_bootmem_nopanic(size, align, goal);
> + ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
> pr_info("cpu %d has no node %d or node-local memory\n",
> cpu, node);
> pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
> cpu, size, __pa(ptr));
> } else {
> ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> - size, align, goal);
> + size, align, goal, 0);
> pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
> cpu, size, node, __pa(ptr));
> }
> Index: linux/mm/nobootmem.c
> ===================================================================
> --- linux.orig/mm/nobootmem.c
> +++ linux/mm/nobootmem.c
> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
> unsigned long max_pfn;
>
> static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> - u64 goal, u64 limit)
> + u64 goal, u64 limit, u32 flags)
> {
> void *ptr;
> u64 addr;
> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
> return NULL;
>
> ptr = phys_to_virt(addr);
> - memset(ptr, 0, size);
> + if (!(flags & NO_ZERO))
> + memset(ptr, 0, size);
> memblock_reserve(addr, size);
> /*
> * The min_count is set to 0 so that bootmem allocated blocks
> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
> static void * __init ___alloc_bootmem_nopanic(unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit)
> + unsigned long limit,
> + u32 flags)
> {
> void *ptr;
>
> @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>
> restart:
>
> - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> + limit, 0);
>
> if (ptr)
> return ptr;
> @@ -244,17 +247,17 @@ restart:
> * Returns NULL on failure.
> */
> void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> - unsigned long goal)
> + unsigned long goal, u32 flags)
> {
> unsigned long limit = -1UL;
>
> - return ___alloc_bootmem_nopanic(size, align, goal, limit);
> + return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> }
>
> static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> - unsigned long goal, unsigned long limit)
> + unsigned long goal, unsigned long limit, u32 flags)
> {
> - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>
> if (mem)
> return mem;
> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
> {
> unsigned long limit = -1UL;
>
> - return ___alloc_bootmem(size, align, goal, limit);
> + return ___alloc_bootmem(size, align, goal, limit, 0);
> }
>
> void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit)
> + unsigned long limit,
> + u32 flags)
> {
> void *ptr;
>
> again:
> ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> - goal, limit);
> + goal, limit, flags);
> if (ptr)
> return ptr;
>
> ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> - goal, limit);
> + goal, limit, flags);
> if (ptr)
> return ptr;
>
> @@ -315,12 +319,13 @@ again:
> }
>
> void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> - unsigned long align, unsigned long goal)
> + unsigned long align, unsigned long goal, u32 flags)
> {
> if (WARN_ON_ONCE(slab_is_available()))
> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> + 0, flags);
> }
>
> void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
> {
> void *ptr;
>
> - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
> if (ptr)
> return ptr;
>
> @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
> * The function panics if the request can not be satisfied.
> */
> void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> - unsigned long align, unsigned long goal)
> + unsigned long align, unsigned long goal)
> {
> if (WARN_ON_ONCE(slab_is_available()))
> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
> void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
> unsigned long goal)
> {
> - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
> }
>
> void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
> unsigned long goal)
> {
> return ___alloc_bootmem_nopanic(size, align, goal,
> - ARCH_LOW_ADDRESS_LIMIT);
> + ARCH_LOW_ADDRESS_LIMIT, 0);
> }
>
> /**
> Index: linux/mm/sparse.c
> ===================================================================
> --- linux.orig/mm/sparse.c
> +++ linux/mm/sparse.c
> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
> nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> again:
> p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> - SMP_CACHE_BYTES, goal, limit);
> + SMP_CACHE_BYTES, goal, limit, 0);
> if (!p && limit) {
> limit = 0;
> goto again;
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
> addr = __alloc_bootmem_node_nopanic(
> NODE_DATA(hstate_next_node_to_alloc(h,
> &node_states[N_MEMORY])),
> - huge_page_size(h), huge_page_size(h), 0);
> + huge_page_size(h), huge_page_size(h),
> + 0, NO_ZERO);
>
> if (addr) {
> /*
> Index: linux/mm/bootmem.c
> ===================================================================
> --- linux.orig/mm/bootmem.c
> +++ linux/mm/bootmem.c
> @@ -660,7 +660,7 @@ restart:
> * Returns NULL on failure.
> */
> void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> - unsigned long goal)
> + unsigned long goal, u32 flags)
> {
> unsigned long limit = 0;
>
> @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>
> void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size, unsigned long align,
> - unsigned long goal, unsigned long limit)
> + unsigned long goal, unsigned long limit,
> + u32 flags)
> {
> void *ptr;
>
> @@ -734,12 +735,13 @@ again:
> }
>
> void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> - unsigned long align, unsigned long goal)
> + unsigned long align, unsigned long goal, u32 flags)
> {
> if (WARN_ON_ONCE(slab_is_available()))
> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> + 0, flags);
> }
>
> void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
> {
> void *ptr;
>
> - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
> if (ptr)
> return ptr;
>
> Index: linux/mm/page_cgroup.c
> ===================================================================
> --- linux.orig/mm/page_cgroup.c
> +++ linux/mm/page_cgroup.c
> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
> table_size = sizeof(struct page_cgroup) * nr_pages;
>
> base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
> if (!base)
> return -ENOMEM;
> NODE_DATA(nid)->node_page_cgroup = base;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
>

2013-03-11 12:32:44

by Cliff Wickman

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

On Sun, Mar 10, 2013 at 01:55:10PM +0800, Hillf Danton wrote:
> On Thu, Mar 7, 2013 at 5:50 AM, Cliff Wickman <[email protected]> wrote:
> > From: Cliff Wickman <[email protected]>
> >
> > Allocating a large number of 1GB hugetlbfs pages at boot takes a
> > very long time.
> >
> > Large system sites would at times like to allocate a very large amount of
> > memory as 1GB pages. They would put this on the kernel boot line:
> > default_hugepagesz=1G hugepagesz=1G hugepages=4096
> > [Dynamic allocation of 1G pages is not an option, as zone pages only go
> > up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
> >
> > Each page is zeroed as it is allocated, and all allocation is done by
> > cpu 0, as this path is early in boot:
> > start_kernel
> > kernel_init
> > do_pre_smp_initcalls
> > hugetlb_init
> > hugetlb_init_hstates
> > hugetlb_hstate_alloc_pages
> >
> > Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> > on large numa systems).
> > This estimate is approximate (it depends on core frequency & number of hops
> > to remote memory) but should be within a factor of 2 on most systems.
> > A benchmark attempting to reserve a TB for 1GB pages would thus require
> > ~1000 seconds of boot time just for this allocating. 32TB would take 8 hours.
> >
> > I propose passing a flag to the early allocator to indicate that no zeroing
> > of a page should be done. The 'no zeroing' flag would have to be passed
> > down this code path:
> >
>
> FYI: huge pages are cleared just after allocated, for instance,
> clear_huge_page() in hugetlb_no_page()
>
> Hillf

Yes, I should have added that comment to the changelog. And because
this is true there is no need to clear a huge page at boot time.

-Cliff
> > hugetlb_hstate_alloc_pages
> > alloc_bootmem_huge_page
> > __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c)
> > __alloc_memory_core_early NO_ZERO
> > if (!(flags & NO_ZERO))
> > memset(ptr, 0, size);
> >
> > Or this path if CONFIG_NO_BOOTMEM is not set:
> >
> > hugetlb_hstate_alloc_pages
> > alloc_bootmem_huge_page
> > __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c)
> > alloc_bootmem_core NO_ZERO
> > if (!(flags & NO_ZERO))
> > memset(region, 0, size);
> > __alloc_bootmem_nopanic NO_ZERO
> > ___alloc_bootmem_nopanic NO_ZERO
> > alloc_bootmem_core NO_ZERO
> > if (!(flags & NO_ZERO))
> > memset(region, 0, size);
> >
> > Signed-off-by: Cliff Wickman <[email protected]>
> >
> > ---
> > arch/x86/kernel/setup_percpu.c | 4 ++--
> > include/linux/bootmem.h | 23 ++++++++++++++++-------
> > mm/bootmem.c | 12 +++++++-----
> > mm/hugetlb.c | 3 ++-
> > mm/nobootmem.c | 41 +++++++++++++++++++++++------------------
> > mm/page_cgroup.c | 2 +-
> > mm/sparse.c | 2 +-
> > 7 files changed, 52 insertions(+), 35 deletions(-)
> >
> > Index: linux/include/linux/bootmem.h
> > ===================================================================
> > --- linux.orig/include/linux/bootmem.h
> > +++ linux/include/linux/bootmem.h
> > @@ -8,6 +8,11 @@
> > #include <asm/dma.h>
> >
> > /*
> > + * allocation flags
> > + */
> > +#define NO_ZERO 0x00000001
> > +
> > +/*
> > * simple boot-time physical memory area allocator.
> > */
> >
> > @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
> > unsigned long goal);
> > extern void *__alloc_bootmem_nopanic(unsigned long size,
> > unsigned long align,
> > - unsigned long goal);
> > + unsigned long goal,
> > + u32 flags);
> > extern void *__alloc_bootmem_node(pg_data_t *pgdat,
> > unsigned long size,
> > unsigned long align,
> > @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
> > extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> > unsigned long size,
> > unsigned long align,
> > - unsigned long goal);
> > + unsigned long goal,
> > + u32 flags);
> > void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> > unsigned long size,
> > unsigned long align,
> > unsigned long goal,
> > - unsigned long limit);
> > + unsigned long limit,
> > + u32 flags);
> > extern void *__alloc_bootmem_low(unsigned long size,
> > unsigned long align,
> > unsigned long goal);
> > @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
> > #define alloc_bootmem_align(x, align) \
> > __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
> > #define alloc_bootmem_nopanic(x) \
> > - __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> > + __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
> > #define alloc_bootmem_pages(x) \
> > __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > #define alloc_bootmem_pages_nopanic(x) \
> > - __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > + __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> > #define alloc_bootmem_node(pgdat, x) \
> > __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> > #define alloc_bootmem_node_nopanic(pgdat, x) \
> > - __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> > + __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> > + BOOTMEM_LOW_LIMIT, 0)
> > #define alloc_bootmem_pages_node(pgdat, x) \
> > __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> > - __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> > + __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> >
> > #define alloc_bootmem_low(x) \
> > __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> > Index: linux/arch/x86/kernel/setup_percpu.c
> > ===================================================================
> > --- linux.orig/arch/x86/kernel/setup_percpu.c
> > +++ linux/arch/x86/kernel/setup_percpu.c
> > @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
> > void *ptr;
> >
> > if (!node_online(node) || !NODE_DATA(node)) {
> > - ptr = __alloc_bootmem_nopanic(size, align, goal);
> > + ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
> > pr_info("cpu %d has no node %d or node-local memory\n",
> > cpu, node);
> > pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
> > cpu, size, __pa(ptr));
> > } else {
> > ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> > - size, align, goal);
> > + size, align, goal, 0);
> > pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
> > cpu, size, node, __pa(ptr));
> > }
> > Index: linux/mm/nobootmem.c
> > ===================================================================
> > --- linux.orig/mm/nobootmem.c
> > +++ linux/mm/nobootmem.c
> > @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
> > unsigned long max_pfn;
> >
> > static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> > - u64 goal, u64 limit)
> > + u64 goal, u64 limit, u32 flags)
> > {
> > void *ptr;
> > u64 addr;
> > @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
> > return NULL;
> >
> > ptr = phys_to_virt(addr);
> > - memset(ptr, 0, size);
> > + if (!(flags & NO_ZERO))
> > + memset(ptr, 0, size);
> > memblock_reserve(addr, size);
> > /*
> > * The min_count is set to 0 so that bootmem allocated blocks
> > @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
> > static void * __init ___alloc_bootmem_nopanic(unsigned long size,
> > unsigned long align,
> > unsigned long goal,
> > - unsigned long limit)
> > + unsigned long limit,
> > + u32 flags)
> > {
> > void *ptr;
> >
> > @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
> >
> > restart:
> >
> > - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> > + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> > + limit, 0);
> >
> > if (ptr)
> > return ptr;
> > @@ -244,17 +247,17 @@ restart:
> > * Returns NULL on failure.
> > */
> > void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> > - unsigned long goal)
> > + unsigned long goal, u32 flags)
> > {
> > unsigned long limit = -1UL;
> >
> > - return ___alloc_bootmem_nopanic(size, align, goal, limit);
> > + return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> > }
> >
> > static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> > - unsigned long goal, unsigned long limit)
> > + unsigned long goal, unsigned long limit, u32 flags)
> > {
> > - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> > + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> >
> > if (mem)
> > return mem;
> > @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
> > {
> > unsigned long limit = -1UL;
> >
> > - return ___alloc_bootmem(size, align, goal, limit);
> > + return ___alloc_bootmem(size, align, goal, limit, 0);
> > }
> >
> > void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> > unsigned long size,
> > unsigned long align,
> > unsigned long goal,
> > - unsigned long limit)
> > + unsigned long limit,
> > + u32 flags)
> > {
> > void *ptr;
> >
> > again:
> > ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> > - goal, limit);
> > + goal, limit, flags);
> > if (ptr)
> > return ptr;
> >
> > ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> > - goal, limit);
> > + goal, limit, flags);
> > if (ptr)
> > return ptr;
> >
> > @@ -315,12 +319,13 @@ again:
> > }
> >
> > void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> > - unsigned long align, unsigned long goal)
> > + unsigned long align, unsigned long goal, u32 flags)
> > {
> > if (WARN_ON_ONCE(slab_is_available()))
> > return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> >
> > - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> > + 0, flags);
> > }
> >
> > void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
> > {
> > void *ptr;
> >
> > - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> > + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
> > if (ptr)
> > return ptr;
> >
> > @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
> > * The function panics if the request can not be satisfied.
> > */
> > void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > - unsigned long align, unsigned long goal)
> > + unsigned long align, unsigned long goal)
> > {
> > if (WARN_ON_ONCE(slab_is_available()))
> > return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> > @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
> > void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
> > unsigned long goal)
> > {
> > - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> > + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
> > }
> >
> > void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> > @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
> > unsigned long goal)
> > {
> > return ___alloc_bootmem_nopanic(size, align, goal,
> > - ARCH_LOW_ADDRESS_LIMIT);
> > + ARCH_LOW_ADDRESS_LIMIT, 0);
> > }
> >
> > /**
> > Index: linux/mm/sparse.c
> > ===================================================================
> > --- linux.orig/mm/sparse.c
> > +++ linux/mm/sparse.c
> > @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
> > nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> > again:
> > p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> > - SMP_CACHE_BYTES, goal, limit);
> > + SMP_CACHE_BYTES, goal, limit, 0);
> > if (!p && limit) {
> > limit = 0;
> > goto again;
> > Index: linux/mm/hugetlb.c
> > ===================================================================
> > --- linux.orig/mm/hugetlb.c
> > +++ linux/mm/hugetlb.c
> > @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
> > addr = __alloc_bootmem_node_nopanic(
> > NODE_DATA(hstate_next_node_to_alloc(h,
> > &node_states[N_MEMORY])),
> > - huge_page_size(h), huge_page_size(h), 0);
> > + huge_page_size(h), huge_page_size(h),
> > + 0, NO_ZERO);
> >
> > if (addr) {
> > /*
> > Index: linux/mm/bootmem.c
> > ===================================================================
> > --- linux.orig/mm/bootmem.c
> > +++ linux/mm/bootmem.c
> > @@ -660,7 +660,7 @@ restart:
> > * Returns NULL on failure.
> > */
> > void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> > - unsigned long goal)
> > + unsigned long goal, u32 flags)
> > {
> > unsigned long limit = 0;
> >
> > @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
> >
> > void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> > unsigned long size, unsigned long align,
> > - unsigned long goal, unsigned long limit)
> > + unsigned long goal, unsigned long limit,
> > + u32 flags)
> > {
> > void *ptr;
> >
> > @@ -734,12 +735,13 @@ again:
> > }
> >
> > void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> > - unsigned long align, unsigned long goal)
> > + unsigned long align, unsigned long goal, u32 flags)
> > {
> > if (WARN_ON_ONCE(slab_is_available()))
> > return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> >
> > - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> > + 0, flags);
> > }
> >
> > void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> > @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
> > {
> > void *ptr;
> >
> > - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> > + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
> > if (ptr)
> > return ptr;
> >
> > Index: linux/mm/page_cgroup.c
> > ===================================================================
> > --- linux.orig/mm/page_cgroup.c
> > +++ linux/mm/page_cgroup.c
> > @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
> > table_size = sizeof(struct page_cgroup) * nr_pages;
> >
> > base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> > - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> > + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
> > if (!base)
> > return -ENOMEM;
> > NODE_DATA(nid)->node_page_cgroup = base;
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at http://www.tux.org/lkml/
> >
> >

--
Cliff Wickman
SGI
[email protected]
(651) 683-3824

2013-03-14 08:51:43

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

On Wed 06-03-13 15:50:20, Cliff Wickman wrote:
[...]
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done. The 'no zeroing' flag would have to be passed
> down this code path:
>
> hugetlb_hstate_alloc_pages
> alloc_bootmem_huge_page
> __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c)
> __alloc_memory_core_early NO_ZERO
> if (!(flags & NO_ZERO))
> memset(ptr, 0, size);
>
> Or this path if CONFIG_NO_BOOTMEM is not set:
>
> hugetlb_hstate_alloc_pages
> alloc_bootmem_huge_page
> __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c)
> alloc_bootmem_core NO_ZERO
> if (!(flags & NO_ZERO))
> memset(region, 0, size);
> __alloc_bootmem_nopanic NO_ZERO
> ___alloc_bootmem_nopanic NO_ZERO
> alloc_bootmem_core NO_ZERO
> if (!(flags & NO_ZERO))
> memset(region, 0, size);

Yes, the patch makes sense. I just think it make unnecessary churn.
Can we just add __alloc_bootmem_node_nopanic_nozero and hide the flag
downwards the call chain so that we do not have to touch all
__alloc_bootmem_node_nopanic callers?

Thanks

> Signed-off-by: Cliff Wickman <[email protected]>
>
> ---
> arch/x86/kernel/setup_percpu.c | 4 ++--
> include/linux/bootmem.h | 23 ++++++++++++++++-------
> mm/bootmem.c | 12 +++++++-----
> mm/hugetlb.c | 3 ++-
> mm/nobootmem.c | 41 +++++++++++++++++++++++------------------
> mm/page_cgroup.c | 2 +-
> mm/sparse.c | 2 +-
> 7 files changed, 52 insertions(+), 35 deletions(-)
>
[...]
--
Michal Hocko
SUSE Labs

2013-04-03 02:43:53

by Robin Holt

[permalink] [raw]
Subject: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2

Reserving a large number of 1GB hugetlbfs pages at boot takes a very
long time due to the pages being memset to 0 during the reservation.
This is unneeded as the pages will be zeroed by clear_huge_page() when
being allocated by the user.

Large system sites would at times like to allocate a very large amount
of memory as 1GB pages. They would put this on the kernel boot line:
default_hugepagesz=1G hugepagesz=1G hugepages=4096
[Dynamic allocation of 1G pages is not an option, as zone pages only go
up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]

Each page is zeroed as it is allocated, and all allocation is done by
cpu 0, as this path is early in boot:
start_kernel
kernel_init
do_pre_smp_initcalls
hugetlb_init
hugetlb_init_hstates
hugetlb_hstate_alloc_pages

Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
on large numa systems). This estimate is approximate (it depends on
core frequency & number of hops to remote memory) but should be within
a factor of 2 on most systems. A benchmark attempting to reserve a TB
for 1GB pages would thus require ~1000 seconds of boot time just for
this allocation. 32TB would take 8 hours.

Signed-off-by: Robin Holt <[email protected]>
To: Cliff Wickman <[email protected]>
To: Michal Hocko <[email protected]>
Cc: lkml <[email protected]>
Cc: Linux mm <[email protected]>
Cc: x86 Maintainers <[email protected]>
---

Changes since -v1
- Reworked to remove the special NO_ZERO flag and push that down further
in the call chain.

Note: I compiled this only with a .config which specified
CONFIG_NO_BOOTMEM (x86_64). I have not tried a config which uses a
bootmem allocator.

include/linux/bootmem.h | 8 +++++++-
mm/bootmem.c | 21 +++++++++++++++++----
mm/hugetlb.c | 2 +-
mm/nobootmem.c | 37 +++++++++++++++++++++++++++----------
mm/sparse.c | 2 +-
5 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index cdc3bab..04563fc 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -92,11 +92,17 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal);
+extern void *__alloc_bootmem_node_nopanic_notzeroed(
+ pg_data_t *pgdat,
+ unsigned long size,
+ unsigned long align,
+ unsigned long goal);
void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit);
+ unsigned long limit,
+ int zeroed);
extern void *__alloc_bootmem_low(unsigned long size,
unsigned long align,
unsigned long goal);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb0..b2e4027 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,

void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
+ unsigned long goal, unsigned long limit,
+ int zeroed)
{
void *ptr;

if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
+ if (zeroed)
+ return kzalloc(size, GFP_NOWAIT);
+ else
+ return kmalloc(size, GFP_NOWAIT);
again:

/* do not panic in alloc_bootmem_bdata() */
@@ -733,13 +737,22 @@ again:
return NULL;
}

+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);

- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
}

void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -748,7 +761,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
{
void *ptr;

- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
if (ptr)
return ptr;

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ca9a7c6..7683f6a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
while (nr_nodes) {
void *addr;

- addr = __alloc_bootmem_node_nopanic(
+ addr = __alloc_bootmem_node_nopanic_notzeroed(
NODE_DATA(hstate_next_node_to_alloc(h,
&node_states[N_MEMORY])),
huge_page_size(h), huge_page_size(h), 0);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 5e07d36..342511b 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -32,8 +32,8 @@ unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;

-static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit)
+static void * __init ___alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit, int zeroed)
{
void *ptr;
u64 addr;
@@ -46,7 +46,8 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
return NULL;

ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
+ if (zeroed)
+ memset(ptr, 0, size);
memblock_reserve(addr, size);
/*
* The min_count is set to 0 so that bootmem allocated blocks
@@ -56,6 +57,12 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
return ptr;
}

+static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ return ___alloc_memory_core_early(nid, size, align, goal, limit, 1);
+}
+
/*
* free_bootmem_late - free bootmem pages directly to page allocator
* @addr: starting address of the range
@@ -291,18 +298,19 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit)
+ unsigned long limit,
+ int zeroed)
{
void *ptr;

again:
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, limit);
+ ptr = ___alloc_memory_core_early(pgdat->node_id, size, align,
+ goal, limit, zeroed);
if (ptr)
return ptr;

- ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
- goal, limit);
+ ptr = ___alloc_memory_core_early(MAX_NUMNODES, size, align,
+ goal, limit, zeroed);
if (ptr)
return ptr;

@@ -314,13 +322,22 @@ again:
return NULL;
}

+void * __init __alloc_bootmem_node_nopanic_notzeroed(pg_data_t *pgdat, unsigned long size,
+ unsigned long align, unsigned long goal)
+{
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kmalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
+}
+
void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
unsigned long align, unsigned long goal)
{
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);

- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
+ return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 1);
}

void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
@@ -329,7 +346,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
{
void *ptr;

- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
+ ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 1);
if (ptr)
return ptr;

diff --git a/mm/sparse.c b/mm/sparse.c
index 7ca6dc8..8a1c5ad 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
- SMP_CACHE_BYTES, goal, limit);
+ SMP_CACHE_BYTES, goal, limit, 1);
if (!p && limit) {
limit = 0;
goto again;
--
1.8.1.2

2013-04-03 14:00:54

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2

On Tue 02-04-13 21:43:44, Robin Holt wrote:
[...]
> diff --git a/mm/bootmem.c b/mm/bootmem.c
> index 2b0bcb0..b2e4027 100644
> --- a/mm/bootmem.c
> +++ b/mm/bootmem.c
> @@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
>
> void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size, unsigned long align,
> - unsigned long goal, unsigned long limit)
> + unsigned long goal, unsigned long limit,
> + int zeroed)
> {
> void *ptr;
>
> if (WARN_ON_ONCE(slab_is_available()))
> - return kzalloc(size, GFP_NOWAIT);
> + if (zeroed)
> + return kzalloc(size, GFP_NOWAIT);
> + else
> + return kmalloc(size, GFP_NOWAIT);
> again:
>
> /* do not panic in alloc_bootmem_bdata() */

You need to update alloc_bootmem_bdata and alloc_bootmem_core as well.
Otherwise this is a no-op for early allocations when slab is not
available which is the case unless something is broken.

[...]
--
Michal Hocko
SUSE Labs

2013-04-03 14:02:51

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2

On Tue 02-04-13 21:43:44, Robin Holt wrote:
[...]
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ca9a7c6..7683f6a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> while (nr_nodes) {
> void *addr;
>
> - addr = __alloc_bootmem_node_nopanic(
> + addr = __alloc_bootmem_node_nopanic_notzeroed(
> NODE_DATA(hstate_next_node_to_alloc(h,
> &node_states[N_MEMORY])),
> huge_page_size(h), huge_page_size(h), 0);

Ohh, and powerpc seems to have its own opinion how to allocate huge
pages. See arch/powerpc/mm/hugetlbpage.c

--
Michal Hocko
SUSE Labs

2013-04-03 17:00:15

by Robin Holt

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2

On Wed, Apr 03, 2013 at 04:02:47PM +0200, Michal Hocko wrote:
> On Tue 02-04-13 21:43:44, Robin Holt wrote:
> [...]
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index ca9a7c6..7683f6a 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> > while (nr_nodes) {
> > void *addr;
> >
> > - addr = __alloc_bootmem_node_nopanic(
> > + addr = __alloc_bootmem_node_nopanic_notzeroed(
> > NODE_DATA(hstate_next_node_to_alloc(h,
> > &node_states[N_MEMORY])),
> > huge_page_size(h), huge_page_size(h), 0);
>
> Ohh, and powerpc seems to have its own opinion how to allocate huge
> pages. See arch/powerpc/mm/hugetlbpage.c

Do I need to address their allocations? Can I leave that part of the
changes as something powerpc can address if they are affected by this?

Robin

2013-04-03 17:21:35

by Robin Holt

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2

On Wed, Apr 03, 2013 at 04:00:49PM +0200, Michal Hocko wrote:
> On Tue 02-04-13 21:43:44, Robin Holt wrote:
> [...]
> > diff --git a/mm/bootmem.c b/mm/bootmem.c
> > index 2b0bcb0..b2e4027 100644
> > --- a/mm/bootmem.c
> > +++ b/mm/bootmem.c
> > @@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
> >
> > void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> > unsigned long size, unsigned long align,
> > - unsigned long goal, unsigned long limit)
> > + unsigned long goal, unsigned long limit,
> > + int zeroed)
> > {
> > void *ptr;
> >
> > if (WARN_ON_ONCE(slab_is_available()))
> > - return kzalloc(size, GFP_NOWAIT);
> > + if (zeroed)
> > + return kzalloc(size, GFP_NOWAIT);
> > + else
> > + return kmalloc(size, GFP_NOWAIT);
> > again:
> >
> > /* do not panic in alloc_bootmem_bdata() */
>
> You need to update alloc_bootmem_bdata and alloc_bootmem_core as well.
> Otherwise this is a no-op for early allocations when slab is not
> available which is the case unless something is broken.

Michal,

Does this do what you would expect? I compiled this for ia64, but I
have not tested it at all.

Robin

---
mm/bootmem.c | 30 +++++++++++++++++++-----------
1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index b2e4027..350e0ab 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -497,7 +497,8 @@ static unsigned long __init align_off(struct bootmem_data *bdata,

static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
+ unsigned long goal, unsigned long limit,
+ int zeroed)
{
unsigned long fallback = 0;
unsigned long min, max, start, sidx, midx, step;
@@ -584,7 +585,8 @@ find_block:

region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
start_off);
- memset(region, 0, size);
+ if (zeroed)
+ memset(region, 0, size);
/*
* The min_count is set to 0 so that bootmem allocated blocks
* are never reported as leaks.
@@ -605,13 +607,18 @@ find_block:
static void * __init alloc_bootmem_core(unsigned long size,
unsigned long align,
unsigned long goal,
- unsigned long limit)
+ unsigned long limit,
+ int zeroed)
{
bootmem_data_t *bdata;
void *region;

- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
+ if (WARN_ON_ONCE(slab_is_available())) {
+ if (zeroed)
+ return kzalloc(size, GFP_NOWAIT);
+ else
+ return kmalloc(size, GFP_NOWAIT);
+ }

list_for_each_entry(bdata, &bdata_list, list) {
if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -619,7 +626,7 @@ static void * __init alloc_bootmem_core(unsigned long size,
if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
break;

- region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
+ region = alloc_bootmem_bdata(bdata, size, align, goal, limit, zeroed);
if (region)
return region;
}
@@ -635,7 +642,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
void *ptr;

restart:
- ptr = alloc_bootmem_core(size, align, goal, limit);
+ ptr = alloc_bootmem_core(size, align, goal, limit, 1);
if (ptr)
return ptr;
if (goal) {
@@ -710,22 +717,23 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
{
void *ptr;

- if (WARN_ON_ONCE(slab_is_available()))
+ if (WARN_ON_ONCE(slab_is_available())) {
if (zeroed)
return kzalloc(size, GFP_NOWAIT);
else
return kmalloc(size, GFP_NOWAIT);
+ }
again:

/* do not panic in alloc_bootmem_bdata() */
if (limit && goal + size > limit)
limit = 0;

- ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
+ ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit, zeroed);
if (ptr)
return ptr;

- ptr = alloc_bootmem_core(size, align, goal, limit);
+ ptr = alloc_bootmem_core(size, align, goal, limit, zeroed);
if (ptr)
return ptr;

@@ -813,7 +821,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,

new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
- new_goal, 0);
+ new_goal, 0, 1);
if (ptr)
return ptr;
}
--
1.8.1.2

2013-04-04 00:17:20

by Simon Jeons

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

On 03/07/2013 05:50 AM, Cliff Wickman wrote:
> From: Cliff Wickman <[email protected]>
>
> Allocating a large number of 1GB hugetlbfs pages at boot takes a
> very long time.
>
> Large system sites would at times like to allocate a very large amount of
> memory as 1GB pages. They would put this on the kernel boot line:
> default_hugepagesz=1G hugepagesz=1G hugepages=4096
> [Dynamic allocation of 1G pages is not an option, as zone pages only go
> up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>
> Each page is zeroed as it is allocated, and all allocation is done by
> cpu 0, as this path is early in boot:

How do you confirm they are done by cpu 0? Does only cpu 0 work during boot?

> start_kernel
> kernel_init
> do_pre_smp_initcalls
> hugetlb_init
> hugetlb_init_hstates
> hugetlb_hstate_alloc_pages
>
> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
> on large numa systems).
> This estimate is approximate (it depends on core frequency & number of hops
> to remote memory) but should be within a factor of 2 on most systems.
> A benchmark attempting to reserve a TB for 1GB pages would thus require
> ~1000 seconds of boot time just for this allocating. 32TB would take 8 hours.
>
> I propose passing a flag to the early allocator to indicate that no zeroing
> of a page should be done. The 'no zeroing' flag would have to be passed
> down this code path:
>
> hugetlb_hstate_alloc_pages
> alloc_bootmem_huge_page
> __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c)
> __alloc_memory_core_early NO_ZERO
> if (!(flags & NO_ZERO))
> memset(ptr, 0, size);
>
> Or this path if CONFIG_NO_BOOTMEM is not set:
>
> hugetlb_hstate_alloc_pages
> alloc_bootmem_huge_page
> __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c)
> alloc_bootmem_core NO_ZERO
> if (!(flags & NO_ZERO))
> memset(region, 0, size);
> __alloc_bootmem_nopanic NO_ZERO
> ___alloc_bootmem_nopanic NO_ZERO
> alloc_bootmem_core NO_ZERO
> if (!(flags & NO_ZERO))
> memset(region, 0, size);
>
> Signed-off-by: Cliff Wickman <[email protected]>
>
> ---
> arch/x86/kernel/setup_percpu.c | 4 ++--
> include/linux/bootmem.h | 23 ++++++++++++++++-------
> mm/bootmem.c | 12 +++++++-----
> mm/hugetlb.c | 3 ++-
> mm/nobootmem.c | 41 +++++++++++++++++++++++------------------
> mm/page_cgroup.c | 2 +-
> mm/sparse.c | 2 +-
> 7 files changed, 52 insertions(+), 35 deletions(-)
>
> Index: linux/include/linux/bootmem.h
> ===================================================================
> --- linux.orig/include/linux/bootmem.h
> +++ linux/include/linux/bootmem.h
> @@ -8,6 +8,11 @@
> #include <asm/dma.h>
>
> /*
> + * allocation flags
> + */
> +#define NO_ZERO 0x00000001
> +
> +/*
> * simple boot-time physical memory area allocator.
> */
>
> @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
> unsigned long goal);
> extern void *__alloc_bootmem_nopanic(unsigned long size,
> unsigned long align,
> - unsigned long goal);
> + unsigned long goal,
> + u32 flags);
> extern void *__alloc_bootmem_node(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
> extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> - unsigned long goal);
> + unsigned long goal,
> + u32 flags);
> void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit);
> + unsigned long limit,
> + u32 flags);
> extern void *__alloc_bootmem_low(unsigned long size,
> unsigned long align,
> unsigned long goal);
> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
> #define alloc_bootmem_align(x, align) \
> __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_nopanic(x) \
> - __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
> #define alloc_bootmem_pages(x) \
> __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_pages_nopanic(x) \
> - __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
> #define alloc_bootmem_node(pgdat, x) \
> __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_node_nopanic(pgdat, x) \
> - __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
> + BOOTMEM_LOW_LIMIT, 0)
> #define alloc_bootmem_pages_node(pgdat, x) \
> __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
> - __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
> + __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>
> #define alloc_bootmem_low(x) \
> __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
> Index: linux/arch/x86/kernel/setup_percpu.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_percpu.c
> +++ linux/arch/x86/kernel/setup_percpu.c
> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
> void *ptr;
>
> if (!node_online(node) || !NODE_DATA(node)) {
> - ptr = __alloc_bootmem_nopanic(size, align, goal);
> + ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
> pr_info("cpu %d has no node %d or node-local memory\n",
> cpu, node);
> pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
> cpu, size, __pa(ptr));
> } else {
> ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
> - size, align, goal);
> + size, align, goal, 0);
> pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
> cpu, size, node, __pa(ptr));
> }
> Index: linux/mm/nobootmem.c
> ===================================================================
> --- linux.orig/mm/nobootmem.c
> +++ linux/mm/nobootmem.c
> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
> unsigned long max_pfn;
>
> static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
> - u64 goal, u64 limit)
> + u64 goal, u64 limit, u32 flags)
> {
> void *ptr;
> u64 addr;
> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
> return NULL;
>
> ptr = phys_to_virt(addr);
> - memset(ptr, 0, size);
> + if (!(flags & NO_ZERO))
> + memset(ptr, 0, size);
> memblock_reserve(addr, size);
> /*
> * The min_count is set to 0 so that bootmem allocated blocks
> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
> static void * __init ___alloc_bootmem_nopanic(unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit)
> + unsigned long limit,
> + u32 flags)
> {
> void *ptr;
>
> @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>
> restart:
>
> - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
> + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
> + limit, 0);
>
> if (ptr)
> return ptr;
> @@ -244,17 +247,17 @@ restart:
> * Returns NULL on failure.
> */
> void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> - unsigned long goal)
> + unsigned long goal, u32 flags)
> {
> unsigned long limit = -1UL;
>
> - return ___alloc_bootmem_nopanic(size, align, goal, limit);
> + return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
> }
>
> static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
> - unsigned long goal, unsigned long limit)
> + unsigned long goal, unsigned long limit, u32 flags)
> {
> - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
> + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>
> if (mem)
> return mem;
> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
> {
> unsigned long limit = -1UL;
>
> - return ___alloc_bootmem(size, align, goal, limit);
> + return ___alloc_bootmem(size, align, goal, limit, 0);
> }
>
> void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit)
> + unsigned long limit,
> + u32 flags)
> {
> void *ptr;
>
> again:
> ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
> - goal, limit);
> + goal, limit, flags);
> if (ptr)
> return ptr;
>
> ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
> - goal, limit);
> + goal, limit, flags);
> if (ptr)
> return ptr;
>
> @@ -315,12 +319,13 @@ again:
> }
>
> void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> - unsigned long align, unsigned long goal)
> + unsigned long align, unsigned long goal, u32 flags)
> {
> if (WARN_ON_ONCE(slab_is_available()))
> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> + 0, flags);
> }
>
> void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
> {
> void *ptr;
>
> - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
> + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
> if (ptr)
> return ptr;
>
> @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
> * The function panics if the request can not be satisfied.
> */
> void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> - unsigned long align, unsigned long goal)
> + unsigned long align, unsigned long goal)
> {
> if (WARN_ON_ONCE(slab_is_available()))
> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
> void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
> unsigned long goal)
> {
> - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
> + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
> }
>
> void * __init __alloc_bootmem_low_nopanic(unsigned long size,
> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
> unsigned long goal)
> {
> return ___alloc_bootmem_nopanic(size, align, goal,
> - ARCH_LOW_ADDRESS_LIMIT);
> + ARCH_LOW_ADDRESS_LIMIT, 0);
> }
>
> /**
> Index: linux/mm/sparse.c
> ===================================================================
> --- linux.orig/mm/sparse.c
> +++ linux/mm/sparse.c
> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
> nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
> again:
> p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
> - SMP_CACHE_BYTES, goal, limit);
> + SMP_CACHE_BYTES, goal, limit, 0);
> if (!p && limit) {
> limit = 0;
> goto again;
> Index: linux/mm/hugetlb.c
> ===================================================================
> --- linux.orig/mm/hugetlb.c
> +++ linux/mm/hugetlb.c
> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
> addr = __alloc_bootmem_node_nopanic(
> NODE_DATA(hstate_next_node_to_alloc(h,
> &node_states[N_MEMORY])),
> - huge_page_size(h), huge_page_size(h), 0);
> + huge_page_size(h), huge_page_size(h),
> + 0, NO_ZERO);
>
> if (addr) {
> /*
> Index: linux/mm/bootmem.c
> ===================================================================
> --- linux.orig/mm/bootmem.c
> +++ linux/mm/bootmem.c
> @@ -660,7 +660,7 @@ restart:
> * Returns NULL on failure.
> */
> void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
> - unsigned long goal)
> + unsigned long goal, u32 flags)
> {
> unsigned long limit = 0;
>
> @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>
> void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> unsigned long size, unsigned long align,
> - unsigned long goal, unsigned long limit)
> + unsigned long goal, unsigned long limit,
> + u32 flags)
> {
> void *ptr;
>
> @@ -734,12 +735,13 @@ again:
> }
>
> void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
> - unsigned long align, unsigned long goal)
> + unsigned long align, unsigned long goal, u32 flags)
> {
> if (WARN_ON_ONCE(slab_is_available()))
> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>
> - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
> + 0, flags);
> }
>
> void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
> {
> void *ptr;
>
> - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
> + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
> if (ptr)
> return ptr;
>
> Index: linux/mm/page_cgroup.c
> ===================================================================
> --- linux.orig/mm/page_cgroup.c
> +++ linux/mm/page_cgroup.c
> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
> table_size = sizeof(struct page_cgroup) * nr_pages;
>
> base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
> - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
> + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
> if (!base)
> return -ENOMEM;
> NODE_DATA(nid)->node_page_cgroup = base;
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to [email protected]. For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href="mailto:[email protected]"> [email protected] </a>

2013-04-04 08:08:51

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2

On Wed 03-04-13 12:00:12, Robin Holt wrote:
> On Wed, Apr 03, 2013 at 04:02:47PM +0200, Michal Hocko wrote:
> > On Tue 02-04-13 21:43:44, Robin Holt wrote:
> > [...]
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > index ca9a7c6..7683f6a 100644
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -1185,7 +1185,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> > > while (nr_nodes) {
> > > void *addr;
> > >
> > > - addr = __alloc_bootmem_node_nopanic(
> > > + addr = __alloc_bootmem_node_nopanic_notzeroed(
> > > NODE_DATA(hstate_next_node_to_alloc(h,
> > > &node_states[N_MEMORY])),
> > > huge_page_size(h), huge_page_size(h), 0);
> >
> > Ohh, and powerpc seems to have its own opinion how to allocate huge
> > pages. See arch/powerpc/mm/hugetlbpage.c
>
> Do I need to address their allocations? Can I leave that part of the
> changes as something powerpc can address if they are affected by this?

I mentioned powerpc basically because I encountered it as the only
alternative implementation of alloc_bootmem_huge_page. I haven't checked
how it does the job and now that I am looking closer it uses memblock
allocator so it would need a separate fix.
I guess you are right saying that this should be handled when the need
arises.

--
Michal Hocko
SUSE Labs

2013-04-04 08:17:08

by Michal Hocko

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: Do not zero hugetlbfs pages at boot. -v2

On Wed 03-04-13 12:21:32, Robin Holt wrote:
> On Wed, Apr 03, 2013 at 04:00:49PM +0200, Michal Hocko wrote:
> > On Tue 02-04-13 21:43:44, Robin Holt wrote:
> > [...]
> > > diff --git a/mm/bootmem.c b/mm/bootmem.c
> > > index 2b0bcb0..b2e4027 100644
> > > --- a/mm/bootmem.c
> > > +++ b/mm/bootmem.c
> > > @@ -705,12 +705,16 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
> > >
> > > void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> > > unsigned long size, unsigned long align,
> > > - unsigned long goal, unsigned long limit)
> > > + unsigned long goal, unsigned long limit,
> > > + int zeroed)
> > > {
> > > void *ptr;
> > >
> > > if (WARN_ON_ONCE(slab_is_available()))
> > > - return kzalloc(size, GFP_NOWAIT);
> > > + if (zeroed)
> > > + return kzalloc(size, GFP_NOWAIT);
> > > + else
> > > + return kmalloc(size, GFP_NOWAIT);
> > > again:
> > >
> > > /* do not panic in alloc_bootmem_bdata() */
> >
> > You need to update alloc_bootmem_bdata and alloc_bootmem_core as well.
> > Otherwise this is a no-op for early allocations when slab is not
> > available which is the case unless something is broken.
>
> Michal,
>
> Does this do what you would expect?

yes, it looks right when I quickly glanced over it. I haven't checked
deeply yet. I would suggest reposting and adding more *bootmem people
into CC (e.g. Johannes Weiner, Yinghai Lu, Tejun Heo and maybe others).

> I compiled this for ia64, but I have not tested it at all.
>
> Robin
>
> ---
> mm/bootmem.c | 30 +++++++++++++++++++-----------
> 1 file changed, 19 insertions(+), 11 deletions(-)
>
> diff --git a/mm/bootmem.c b/mm/bootmem.c
> index b2e4027..350e0ab 100644
> --- a/mm/bootmem.c
> +++ b/mm/bootmem.c
> @@ -497,7 +497,8 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
>
> static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
> unsigned long size, unsigned long align,
> - unsigned long goal, unsigned long limit)
> + unsigned long goal, unsigned long limit,
> + int zeroed)
> {
> unsigned long fallback = 0;
> unsigned long min, max, start, sidx, midx, step;
> @@ -584,7 +585,8 @@ find_block:
>
> region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
> start_off);
> - memset(region, 0, size);
> + if (zeroed)
> + memset(region, 0, size);
> /*
> * The min_count is set to 0 so that bootmem allocated blocks
> * are never reported as leaks.
> @@ -605,13 +607,18 @@ find_block:
> static void * __init alloc_bootmem_core(unsigned long size,
> unsigned long align,
> unsigned long goal,
> - unsigned long limit)
> + unsigned long limit,
> + int zeroed)
> {
> bootmem_data_t *bdata;
> void *region;
>
> - if (WARN_ON_ONCE(slab_is_available()))
> - return kzalloc(size, GFP_NOWAIT);
> + if (WARN_ON_ONCE(slab_is_available())) {
> + if (zeroed)
> + return kzalloc(size, GFP_NOWAIT);
> + else
> + return kmalloc(size, GFP_NOWAIT);
> + }
>
> list_for_each_entry(bdata, &bdata_list, list) {
> if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
> @@ -619,7 +626,7 @@ static void * __init alloc_bootmem_core(unsigned long size,
> if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
> break;
>
> - region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
> + region = alloc_bootmem_bdata(bdata, size, align, goal, limit, zeroed);
> if (region)
> return region;
> }
> @@ -635,7 +642,7 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
> void *ptr;
>
> restart:
> - ptr = alloc_bootmem_core(size, align, goal, limit);
> + ptr = alloc_bootmem_core(size, align, goal, limit, 1);
> if (ptr)
> return ptr;
> if (goal) {
> @@ -710,22 +717,23 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
> {
> void *ptr;
>
> - if (WARN_ON_ONCE(slab_is_available()))
> + if (WARN_ON_ONCE(slab_is_available())) {
> if (zeroed)
> return kzalloc(size, GFP_NOWAIT);
> else
> return kmalloc(size, GFP_NOWAIT);
> + }
> again:
>
> /* do not panic in alloc_bootmem_bdata() */
> if (limit && goal + size > limit)
> limit = 0;
>
> - ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
> + ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit, zeroed);
> if (ptr)
> return ptr;
>
> - ptr = alloc_bootmem_core(size, align, goal, limit);
> + ptr = alloc_bootmem_core(size, align, goal, limit, zeroed);
> if (ptr)
> return ptr;
>
> @@ -813,7 +821,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
>
> new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
> ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
> - new_goal, 0);
> + new_goal, 0, 1);
> if (ptr)
> return ptr;
> }
> --
> 1.8.1.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Michal Hocko
SUSE Labs

2013-04-04 12:16:12

by Cliff Wickman

[permalink] [raw]
Subject: Re: [PATCH] mm, x86: no zeroing of hugetlbfs pages at boot

On Thu, Apr 04, 2013 at 08:17:08AM +0800, Simon Jeons wrote:
> On 03/07/2013 05:50 AM, Cliff Wickman wrote:
>> From: Cliff Wickman <[email protected]>
>>
>> Allocating a large number of 1GB hugetlbfs pages at boot takes a
>> very long time.
>>
>> Large system sites would at times like to allocate a very large amount of
>> memory as 1GB pages. They would put this on the kernel boot line:
>> default_hugepagesz=1G hugepagesz=1G hugepages=4096
>> [Dynamic allocation of 1G pages is not an option, as zone pages only go
>> up to MAX_ORDER, and MAX_ORDER cannot exceed the section size.]
>>
>> Each page is zeroed as it is allocated, and all allocation is done by
>> cpu 0, as this path is early in boot:
>
> How do you confirm they are done by cpu 0? Does just cpu 0 work during boot?

Yes, in kernel_init() you see the call to do_pre_smp_initcalls() just
before the call to smp_init(). It is smp_init() that starts the other
cpus. They don't come out of reset until then.

>> start_kernel
>> kernel_init
>> do_pre_smp_initcalls
>> hugetlb_init
>> hugetlb_init_hstates
>> hugetlb_hstate_alloc_pages
>>
>> Zeroing remote (offnode) memory takes ~1GB/sec (and most memory is offnode
>> on large numa systems).
>> This estimate is approximate (it depends on core frequency & number of hops
>> to remote memory) but should be within a factor of 2 on most systems.
>> A benchmark attempting to reserve a TB for 1GB pages would thus require
>> ~1000 seconds of boot time just for this allocation. 32TB would take 8 hours.
>>
>> I propose passing a flag to the early allocator to indicate that no zeroing
>> of a page should be done. The 'no zeroing' flag would have to be passed
>> down this code path:
>>
>> hugetlb_hstate_alloc_pages
>> alloc_bootmem_huge_page
>> __alloc_bootmem_node_nopanic NO_ZERO (nobootmem.c)
>> __alloc_memory_core_early NO_ZERO
>> if (!(flags & NO_ZERO))
>> memset(ptr, 0, size);
>>
>> Or this path if CONFIG_NO_BOOTMEM is not set:
>>
>> hugetlb_hstate_alloc_pages
>> alloc_bootmem_huge_page
>> __alloc_bootmem_node_nopanic NO_ZERO (bootmem.c)
>> alloc_bootmem_core NO_ZERO
>> if (!(flags & NO_ZERO))
>> memset(region, 0, size);
>> __alloc_bootmem_nopanic NO_ZERO
>> ___alloc_bootmem_nopanic NO_ZERO
>> alloc_bootmem_core NO_ZERO
>> if (!(flags & NO_ZERO))
>> memset(region, 0, size);
>>
>> Signed-off-by: Cliff Wickman <[email protected]>
>>
>> ---
>> arch/x86/kernel/setup_percpu.c | 4 ++--
>> include/linux/bootmem.h | 23 ++++++++++++++++-------
>> mm/bootmem.c | 12 +++++++-----
>> mm/hugetlb.c | 3 ++-
>> mm/nobootmem.c | 41 +++++++++++++++++++++++------------------
>> mm/page_cgroup.c | 2 +-
>> mm/sparse.c | 2 +-
>> 7 files changed, 52 insertions(+), 35 deletions(-)
>>
>> Index: linux/include/linux/bootmem.h
>> ===================================================================
>> --- linux.orig/include/linux/bootmem.h
>> +++ linux/include/linux/bootmem.h
>> @@ -8,6 +8,11 @@
>> #include <asm/dma.h>
>> /*
>> + * allocation flags
>> + */
>> +#define NO_ZERO 0x00000001
>> +
>> +/*
>> * simple boot-time physical memory area allocator.
>> */
>> @@ -79,7 +84,8 @@ extern void *__alloc_bootmem(unsigned lo
>> unsigned long goal);
>> extern void *__alloc_bootmem_nopanic(unsigned long size,
>> unsigned long align,
>> - unsigned long goal);
>> + unsigned long goal,
>> + u32 flags);
>> extern void *__alloc_bootmem_node(pg_data_t *pgdat,
>> unsigned long size,
>> unsigned long align,
>> @@ -91,12 +97,14 @@ void *__alloc_bootmem_node_high(pg_data_
>> extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>> unsigned long size,
>> unsigned long align,
>> - unsigned long goal);
>> + unsigned long goal,
>> + u32 flags);
>> void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>> unsigned long size,
>> unsigned long align,
>> unsigned long goal,
>> - unsigned long limit);
>> + unsigned long limit,
>> + u32 flags);
>> extern void *__alloc_bootmem_low(unsigned long size,
>> unsigned long align,
>> unsigned long goal);
>> @@ -120,19 +128,20 @@ extern void *__alloc_bootmem_low_node(pg
>> #define alloc_bootmem_align(x, align) \
>> __alloc_bootmem(x, align, BOOTMEM_LOW_LIMIT)
>> #define alloc_bootmem_nopanic(x) \
>> - __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>> + __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT, 0)
>> #define alloc_bootmem_pages(x) \
>> __alloc_bootmem(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>> #define alloc_bootmem_pages_nopanic(x) \
>> - __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>> + __alloc_bootmem_nopanic(x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>> #define alloc_bootmem_node(pgdat, x) \
>> __alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>> #define alloc_bootmem_node_nopanic(pgdat, x) \
>> - __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, BOOTMEM_LOW_LIMIT)
>> + __alloc_bootmem_node_nopanic(pgdat, x, SMP_CACHE_BYTES, \
>> + BOOTMEM_LOW_LIMIT, 0)
>> #define alloc_bootmem_pages_node(pgdat, x) \
>> __alloc_bootmem_node(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>> #define alloc_bootmem_pages_node_nopanic(pgdat, x) \
>> - __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT)
>> + __alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, BOOTMEM_LOW_LIMIT, 0)
>> #define alloc_bootmem_low(x) \
>> __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
>> Index: linux/arch/x86/kernel/setup_percpu.c
>> ===================================================================
>> --- linux.orig/arch/x86/kernel/setup_percpu.c
>> +++ linux/arch/x86/kernel/setup_percpu.c
>> @@ -104,14 +104,14 @@ static void * __init pcpu_alloc_bootmem(
>> void *ptr;
>> if (!node_online(node) || !NODE_DATA(node)) {
>> - ptr = __alloc_bootmem_nopanic(size, align, goal);
>> + ptr = __alloc_bootmem_nopanic(size, align, goal, 0);
>> pr_info("cpu %d has no node %d or node-local memory\n",
>> cpu, node);
>> pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
>> cpu, size, __pa(ptr));
>> } else {
>> ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
>> - size, align, goal);
>> + size, align, goal, 0);
>> pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
>> cpu, size, node, __pa(ptr));
>> }
>> Index: linux/mm/nobootmem.c
>> ===================================================================
>> --- linux.orig/mm/nobootmem.c
>> +++ linux/mm/nobootmem.c
>> @@ -33,7 +33,7 @@ unsigned long min_low_pfn;
>> unsigned long max_pfn;
>> static void * __init __alloc_memory_core_early(int nid, u64 size,
>> u64 align,
>> - u64 goal, u64 limit)
>> + u64 goal, u64 limit, u32 flags)
>> {
>> void *ptr;
>> u64 addr;
>> @@ -46,7 +46,8 @@ static void * __init __alloc_memory_core
>> return NULL;
>> ptr = phys_to_virt(addr);
>> - memset(ptr, 0, size);
>> + if (!(flags & NO_ZERO))
>> + memset(ptr, 0, size);
>> memblock_reserve(addr, size);
>> /*
>> * The min_count is set to 0 so that bootmem allocated blocks
>> @@ -208,7 +209,8 @@ void __init free_bootmem(unsigned long a
>> static void * __init ___alloc_bootmem_nopanic(unsigned long size,
>> unsigned long align,
>> unsigned long goal,
>> - unsigned long limit)
>> + unsigned long limit,
>> + u32 flags)
>> {
>> void *ptr;
>> @@ -217,7 +219,8 @@ static void * __init ___alloc_bootmem_no
>> restart:
>> - ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
>> limit);
>> + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal,
>> + limit, 0);
>> if (ptr)
>> return ptr;
>> @@ -244,17 +247,17 @@ restart:
>> * Returns NULL on failure.
>> */
>> void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
>> - unsigned long goal)
>> + unsigned long goal, u32 flags)
>> {
>> unsigned long limit = -1UL;
>> - return ___alloc_bootmem_nopanic(size, align, goal, limit);
>> + return ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>> }
>> static void * __init ___alloc_bootmem(unsigned long size, unsigned
>> long align,
>> - unsigned long goal, unsigned long limit)
>> + unsigned long goal, unsigned long limit, u32 flags)
>> {
>> - void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
>> + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit, flags);
>> if (mem)
>> return mem;
>> @@ -284,25 +287,26 @@ void * __init __alloc_bootmem(unsigned l
>> {
>> unsigned long limit = -1UL;
>> - return ___alloc_bootmem(size, align, goal, limit);
>> + return ___alloc_bootmem(size, align, goal, limit, 0);
>> }
>> void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>> unsigned long size,
>> unsigned long align,
>> unsigned long goal,
>> - unsigned long limit)
>> + unsigned long limit,
>> + u32 flags)
>> {
>> void *ptr;
>> again:
>> ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
>> - goal, limit);
>> + goal, limit, flags);
>> if (ptr)
>> return ptr;
>> ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
>> - goal, limit);
>> + goal, limit, flags);
>> if (ptr)
>> return ptr;
>> @@ -315,12 +319,13 @@ again:
>> }
>> void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>> unsigned long size,
>> - unsigned long align, unsigned long goal)
>> + unsigned long align, unsigned long goal, u32 flags)
>> {
>> if (WARN_ON_ONCE(slab_is_available()))
>> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>> - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
>> + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
>> + 0, flags);
>> }
>> void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long
>> size,
>> @@ -329,7 +334,7 @@ void * __init ___alloc_bootmem_node(pg_d
>> {
>> void *ptr;
>> - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
>> limit);
>> + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit, 0);
>> if (ptr)
>> return ptr;
>> @@ -354,7 +359,7 @@ void * __init ___alloc_bootmem_node(pg_d
>> * The function panics if the request can not be satisfied.
>> */
>> void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
>> - unsigned long align, unsigned long goal)
>> + unsigned long align, unsigned long goal)
>> {
>> if (WARN_ON_ONCE(slab_is_available()))
>> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>> @@ -388,7 +393,7 @@ void * __init __alloc_bootmem_node_high(
>> void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
>> unsigned long goal)
>> {
>> - return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
>> + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT, 0);
>> }
>> void * __init __alloc_bootmem_low_nopanic(unsigned long size,
>> @@ -396,7 +401,7 @@ void * __init __alloc_bootmem_low_nopani
>> unsigned long goal)
>> {
>> return ___alloc_bootmem_nopanic(size, align, goal,
>> - ARCH_LOW_ADDRESS_LIMIT);
>> + ARCH_LOW_ADDRESS_LIMIT, 0);
>> }
>> /**
>> Index: linux/mm/sparse.c
>> ===================================================================
>> --- linux.orig/mm/sparse.c
>> +++ linux/mm/sparse.c
>> @@ -281,7 +281,7 @@ sparse_early_usemaps_alloc_pgdat_section
>> nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
>> again:
>> p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
>> - SMP_CACHE_BYTES, goal, limit);
>> + SMP_CACHE_BYTES, goal, limit, 0);
>> if (!p && limit) {
>> limit = 0;
>> goto again;
>> Index: linux/mm/hugetlb.c
>> ===================================================================
>> --- linux.orig/mm/hugetlb.c
>> +++ linux/mm/hugetlb.c
>> @@ -1188,7 +1188,8 @@ int __weak alloc_bootmem_huge_page(struc
>> addr = __alloc_bootmem_node_nopanic(
>> NODE_DATA(hstate_next_node_to_alloc(h,
>> &node_states[N_MEMORY])),
>> - huge_page_size(h), huge_page_size(h), 0);
>> + huge_page_size(h), huge_page_size(h),
>> + 0, NO_ZERO);
>> if (addr) {
>> /*
>> Index: linux/mm/bootmem.c
>> ===================================================================
>> --- linux.orig/mm/bootmem.c
>> +++ linux/mm/bootmem.c
>> @@ -660,7 +660,7 @@ restart:
>> * Returns NULL on failure.
>> */
>> void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
>> - unsigned long goal)
>> + unsigned long goal, u32 flags)
>> {
>> unsigned long limit = 0;
>> @@ -705,7 +705,8 @@ void * __init __alloc_bootmem(unsigned l
>> void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>> unsigned long size, unsigned long align,
>> - unsigned long goal, unsigned long limit)
>> + unsigned long goal, unsigned long limit,
>> + u32 flags)
>> {
>> void *ptr;
>> @@ -734,12 +735,13 @@ again:
>> }
>> void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat,
>> unsigned long size,
>> - unsigned long align, unsigned long goal)
>> + unsigned long align, unsigned long goal, u32 flags)
>> {
>> if (WARN_ON_ONCE(slab_is_available()))
>> return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
>> - return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
>> + return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal,
>> + 0, flags);
>> }
>> void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long
>> size,
>> @@ -748,7 +750,7 @@ void * __init ___alloc_bootmem_node(pg_d
>> {
>> void *ptr;
>> - ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
>> + ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0, 0);
>> if (ptr)
>> return ptr;
>> Index: linux/mm/page_cgroup.c
>> ===================================================================
>> --- linux.orig/mm/page_cgroup.c
>> +++ linux/mm/page_cgroup.c
>> @@ -55,7 +55,7 @@ static int __init alloc_node_page_cgroup
>> table_size = sizeof(struct page_cgroup) * nr_pages;
>> base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
>> - table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
>> + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), 0);
>> if (!base)
>> return -ENOMEM;
>> NODE_DATA(nid)->node_page_cgroup = base;
>>
>> --
>> To unsubscribe, send a message with 'unsubscribe linux-mm' in
>> the body to [email protected]. For more info on Linux MM,
>> see: http://www.linux-mm.org/ .
>> Don't email: <a href="mailto:[email protected]"> [email protected] </a>

--
Cliff Wickman
SGI
[email protected]
(651) 683-3824