2023-07-27 21:10:55

by Usama Arif

Subject: [v1 0/6] mm/memblock: Skip prep and initialization of struct pages freed later by HVO

If the region is for gigantic hugepages and HVO is enabled, then the
struct pages which will be freed later by HVO don't need to be prepared and
initialized. This can save significant time when a large number of hugepages
are allocated at boot time.

For a 1G hugepage, this series avoids the initialization and preparation of
262144 - 64 = 262080 struct pages per hugepage.
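
For reference, the arithmetic behind those numbers (assuming x86 with 4K
base pages and a 64-byte struct page) is:

  struct pages per 1G hugepage : 1G / 4K = 262144
  struct pages kept by HVO     : HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)
                               = 4096 / 64 = 64
  struct pages skipped         : 262144 - 64 = 262080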

When tested on a 512G system (which can hold at most 500 1G hugepages), the
kexec boot time to running init, with HVO and DEFERRED_STRUCT_PAGE_INIT
enabled, is 3.9 seconds without this patch series and 1.2 seconds with it.
This is an approximately 70% reduction in boot time and will
significantly reduce server downtime when using a large number of
gigantic pages.

Thanks,
Usama

[RFC->v1]:
- (Mike Rapoport) Instead of passing hugepage_size to
memblock_alloc_try_nid_raw to skip struct page initialization, use the
MEMBLOCK_RSRV_NOINIT flag

Usama Arif (6):
mm: hugetlb: Skip prep of tail pages when HVO is enabled
mm: hugetlb_vmemmap: Use nid of the head page to reallocate it
memblock: add parameter to memblock_setclr_flag for selecting
memblock_type
memblock: introduce MEMBLOCK_RSRV_NOINIT flag
mm: move allocation of gigantic hstates to the start of mm_core_init
mm: hugetlb: Skip initialization of struct pages freed later by HVO

include/linux/hugetlb.h | 1 +
include/linux/memblock.h | 7 ++++
mm/hugetlb.c | 71 ++++++++++++++++++++++++++++++----------
mm/hugetlb_vmemmap.c | 6 ++--
mm/hugetlb_vmemmap.h | 10 ++++++
mm/memblock.c | 51 +++++++++++++++++++++--------
mm/mm_init.c | 4 +++
7 files changed, 117 insertions(+), 33 deletions(-)

--
2.25.1



2023-07-27 21:17:18

by Usama Arif

Subject: [v1 2/6] mm: hugetlb_vmemmap: Use nid of the head page to reallocate it

If tail page prep and initialization are skipped, then the "start"
page will not contain the correct nid. Use the nid from the first
vmemmap page instead.

Signed-off-by: Usama Arif <[email protected]>
---
mm/hugetlb_vmemmap.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index b721e87de2b3..bdf750a4786b 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -324,7 +324,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end,
.reuse_addr = reuse,
.vmemmap_pages = &vmemmap_pages,
};
- int nid = page_to_nid((struct page *)start);
+ int nid = page_to_nid((struct page *)reuse);
gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY |
__GFP_NOWARN;

--
2.25.1


2023-07-27 21:23:47

by Usama Arif

Subject: [v1 1/6] mm: hugetlb: Skip prep of tail pages when HVO is enabled

When the vmemmap is optimizable, hugetlb_vmemmap_optimize will free
all the duplicated tail pages while preparing the new hugepage.
Hence, there is no need to prepare them in the first place.

For 1G x86 hugepages, it avoids preparing
262144 - 64 = 262080 struct pages per hugepage.

Signed-off-by: Usama Arif <[email protected]>
---
mm/hugetlb.c | 32 +++++++++++++++++++++++---------
mm/hugetlb_vmemmap.c | 2 +-
mm/hugetlb_vmemmap.h | 7 +++++++
3 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 64a3239b6407..58cf5978bee1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1943,13 +1943,24 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni
}

static bool __prep_compound_gigantic_folio(struct folio *folio,
- unsigned int order, bool demote)
+ unsigned int order, bool demote,
+ bool hvo)
{
int i, j;
int nr_pages = 1 << order;
struct page *p;

__folio_clear_reserved(folio);
+
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ /*
+ * No need to prep pages that will be freed later by hugetlb_vmemmap_optimize
+ * in prep_new_huge_page. Hence, reduce nr_pages to the pages that will be kept.
+ */
+ if (hvo)
+ nr_pages = HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page);
+#endif
+
for (i = 0; i < nr_pages; i++) {
p = folio_page(folio, i);

@@ -2020,15 +2031,15 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
}

static bool prep_compound_gigantic_folio(struct folio *folio,
- unsigned int order)
+ unsigned int order, bool hvo)
{
- return __prep_compound_gigantic_folio(folio, order, false);
+ return __prep_compound_gigantic_folio(folio, order, false, hvo);
}

static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
- unsigned int order)
+ unsigned int order, bool hvo)
{
- return __prep_compound_gigantic_folio(folio, order, true);
+ return __prep_compound_gigantic_folio(folio, order, true, hvo);
}

/*
@@ -2185,7 +2196,8 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
if (!folio)
return NULL;
if (hstate_is_gigantic(h)) {
- if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
+ if (!prep_compound_gigantic_folio(folio, huge_page_order(h),
+ vmemmap_should_optimize(h, &folio->page))) {
/*
* Rare failure to convert pages to compound page.
* Free pages and try again - ONCE!
@@ -3201,7 +3213,8 @@ static void __init gather_bootmem_prealloc(void)

VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(folio_ref_count(folio) != 1);
- if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
+ if (prep_compound_gigantic_folio(folio, huge_page_order(h),
+ vmemmap_should_optimize(h, page))) {
WARN_ON(folio_test_reserved(folio));
prep_new_hugetlb_folio(h, folio, folio_nid(folio));
free_huge_page(page); /* add to the hugepage allocator */
@@ -3624,8 +3637,9 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
subpage = folio_page(folio, i);
inner_folio = page_folio(subpage);
if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_folio_for_demote(inner_folio,
- target_hstate->order);
+ prep_compound_gigantic_folio_for_demote(folio,
+ target_hstate->order,
+ vmemmap_should_optimize(target_hstate, subpage));
else
prep_compound_page(subpage, target_hstate->order);
folio_change_private(inner_folio, NULL);
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index c2007ef5e9b0..b721e87de2b3 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -486,7 +486,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
}

/* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
-static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
+bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
{
if (!READ_ONCE(vmemmap_optimize_enabled))
return false;
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 25bd0e002431..07555d2dc0cb 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -13,6 +13,7 @@
#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
+bool vmemmap_should_optimize(const struct hstate *h, const struct page *head);

/*
* Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
@@ -51,6 +52,12 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate
{
return 0;
}
+
+bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
+{
+ return false;
+}
+
#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */

static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
--
2.25.1


2023-07-27 21:24:59

by Usama Arif

Subject: [v1 5/6] mm: move allocation of gigantic hstates to the start of mm_core_init

Whether the initialization of the tail struct pages of a hugepage
happens or not will become dependent on the command-line
parameter hugetlb_free_vmemmap in the future. Hence,
hugetlb_hstate_alloc_pages needs to run after command-line parameters
are parsed, and the start of mm_core_init is a good point.
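
For context, the relevant ordering in start_kernel() is roughly (a
simplified sketch, not the full call sequence):

  start_kernel()
    setup_arch()      /* memblock becomes usable */
    parse_args()      /* hugepages= and hugetlb_free_vmemmap= parsed */
    mm_core_init()
      hugetlb_hstate_alloc_gigantic_pages()  /* moved here: parameters
                                                are known, and memblock
                                                still owns the memory */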

Signed-off-by: Usama Arif <[email protected]>
---
include/linux/hugetlb.h | 1 +
mm/hugetlb.c | 18 ++++++++++--------
mm/mm_init.c | 4 ++++
3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ca3c8e10f24a..2b20553deef3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -1271,4 +1271,5 @@ hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
return huge_pte_offset(vma->vm_mm, addr, sz);
}

+void __init hugetlb_hstate_alloc_gigantic_pages(void);
#endif /* _LINUX_HUGETLB_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 58cf5978bee1..c1fcf2af591a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4418,14 +4418,6 @@ static int __init hugepages_setup(char *s)
}
}

- /*
- * Global state is always initialized later in hugetlb_init.
- * But we need to allocate gigantic hstates here early to still
- * use the bootmem allocator.
- */
- if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
- hugetlb_hstate_alloc_pages(parsed_hstate);
-
last_mhp = mhp;

return 1;
@@ -4437,6 +4429,16 @@ static int __init hugepages_setup(char *s)
}
__setup("hugepages=", hugepages_setup);

+void __init hugetlb_hstate_alloc_gigantic_pages(void)
+{
+ int i;
+
+ for (i = 0; i < HUGE_MAX_HSTATE; i++) {
+ if (hstate_is_gigantic(&hstates[i]))
+ hugetlb_hstate_alloc_pages(&hstates[i]);
+ }
+}
+
/*
* hugepagesz command line processing
* A specific huge page size can only be specified once with hugepagesz.
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a1963c3322af..5585c66c3c42 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -26,6 +26,7 @@
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/cma.h>
+#include <linux/hugetlb.h>
#include "internal.h"
#include "slab.h"
#include "shuffle.h"
@@ -2768,6 +2769,9 @@ static void __init mem_init_print_info(void)
*/
void __init mm_core_init(void)
{
+#ifdef CONFIG_HUGETLBFS
+ hugetlb_hstate_alloc_gigantic_pages();
+#endif
/* Initializations relying on SMP setup */
build_all_zonelists(NULL);
page_alloc_init_cpuhp();
--
2.25.1


2023-07-27 21:27:20

by Usama Arif

Subject: [v1 6/6] mm: hugetlb: Skip initialization of struct pages freed later by HVO

This is done by marking the region for which initialization should be
skipped with the MEMBLOCK_RSRV_NOINIT flag.
If the region is for hugepages and HVO is enabled, then the
struct pages which will be freed later don't need to be initialized.
This can save significant time when a large number of hugepages are
allocated at boot time. The HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)
struct pages at the start of the hugepage still need to be initialized.
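
In other words, for each gigantic page allocation starting at m (a sketch
assuming a 1G hugepage, 4K base pages and a 64-byte struct page):

  [m, m + 256K)       first 64 subpages; their struct pages are the ones
                      HVO keeps, so this range is still initialized
  [m + 256K, m + 1G)  remaining 262080 subpages; their struct pages are
                      freed by HVO anyway, so this range is marked
                      MEMBLOCK_RSRV_NOINIT and never initialized

where 256K = HUGETLB_VMEMMAP_RESERVE_SIZE * sizeof(struct page).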

Signed-off-by: Usama Arif <[email protected]>
---
mm/hugetlb.c | 21 +++++++++++++++++++++
mm/hugetlb_vmemmap.c | 2 +-
mm/hugetlb_vmemmap.h | 3 +++
3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c1fcf2af591a..bb2b12f41026 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3166,6 +3166,9 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
{
struct huge_bootmem_page *m = NULL; /* initialize for clang */
int nr_nodes, node;
+ phys_addr_t hugetlb_vmemmap_reserve_size =
+ HUGETLB_VMEMMAP_RESERVE_SIZE * sizeof(struct page);
+ phys_addr_t noinit_base;

/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
@@ -3173,6 +3176,15 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
if (!m)
return 0;
+
+ if (vmemmap_optimize_enabled && hugetlb_vmemmap_optimizable(h)) {
+ noinit_base = virt_to_phys(
+ (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
+ memblock_rsrv_mark_noinit(
+ noinit_base,
+ huge_page_size(h) - hugetlb_vmemmap_reserve_size);
+ }
+
goto found;
}
/* allocate from next node when distributing huge pages */
@@ -3187,6 +3199,15 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
*/
if (!m)
return 0;
+
+ if (vmemmap_optimize_enabled && hugetlb_vmemmap_optimizable(h)) {
+ noinit_base = virt_to_phys(
+ (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
+ memblock_rsrv_mark_noinit(
+ noinit_base,
+ huge_page_size(h) - hugetlb_vmemmap_reserve_size);
+ }
+
goto found;
}

diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index bdf750a4786b..b5b7834e0f42 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -443,7 +443,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

-static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
+bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);

/**
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 07555d2dc0cb..cb5171abe683 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -64,4 +64,7 @@ static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
{
return hugetlb_vmemmap_optimizable_size(h) != 0;
}
+
+extern bool vmemmap_optimize_enabled;
+
#endif /* _LINUX_HUGETLB_VMEMMAP_H */
--
2.25.1


2023-07-27 21:34:19

by Usama Arif

Subject: [v1 4/6] memblock: introduce MEMBLOCK_RSRV_NOINIT flag

For reserved memory regions marked with this flag,
reserve_bootmem_region is not called during memmap_init_reserved_pages.
This can be used to avoid struct page initialization for
regions which won't need them, e.g. hugepages with
HVO enabled.
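
A minimal usage sketch (hypothetical caller; the hugetlb change in patch 6
of this series is the real user):

	void *buf = memblock_alloc_try_nid_raw(size, align, 0,
					       MEMBLOCK_ALLOC_ACCESSIBLE, nid);

	if (buf)
		/* struct pages describing this range are never initialized */
		memblock_rsrv_mark_noinit(virt_to_phys(buf), size);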

Signed-off-by: Usama Arif <[email protected]>
---
include/linux/memblock.h | 7 +++++++
mm/memblock.c | 32 ++++++++++++++++++++++++++------
2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f71ff9f0ec81..7f9d06c08592 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -47,6 +47,7 @@ enum memblock_flags {
MEMBLOCK_MIRROR = 0x2, /* mirrored region */
MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */
MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */
+ MEMBLOCK_RSRV_NOINIT = 0x10, /* don't call reserve_bootmem_region for this region */
};

/**
@@ -125,6 +126,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
+int memblock_rsrv_mark_noinit(phys_addr_t base, phys_addr_t size);

void memblock_free_all(void);
void memblock_free(void *ptr, size_t size);
@@ -259,6 +261,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
return m->flags & MEMBLOCK_NOMAP;
}

+static inline bool memblock_is_noinit(struct memblock_region *m)
+{
+ return m->flags & MEMBLOCK_RSRV_NOINIT;
+}
+
static inline bool memblock_is_driver_managed(struct memblock_region *m)
{
return m->flags & MEMBLOCK_DRIVER_MANAGED;
diff --git a/mm/memblock.c b/mm/memblock.c
index 4fd431d16ef2..3a15708af3b6 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -997,6 +997,22 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP, 0);
}

+/**
+ * memblock_rsrv_mark_noinit - Mark a reserved memory region with flag MEMBLOCK_RSRV_NOINIT.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * For memory regions marked with %MEMBLOCK_RSRV_NOINIT, reserve_bootmem_region
+ * is not called during memmap_init_reserved_pages, hence struct pages are not
+ * initialized for this region.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_rsrv_mark_noinit(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_setclr_flag(base, size, 1, MEMBLOCK_RSRV_NOINIT, 1);
+}
+
static bool should_skip_region(struct memblock_type *type,
struct memblock_region *m,
int nid, int flags)
@@ -2113,13 +2129,17 @@ static void __init memmap_init_reserved_pages(void)
memblock_set_node(start, end, &memblock.reserved, nid);
}

- /* initialize struct pages for the reserved regions */
+ /*
+ * initialize struct pages for reserved regions that don't have
+ * the MEMBLOCK_RSRV_NOINIT flag set
+ */
for_each_reserved_mem_region(region) {
- nid = memblock_get_region_node(region);
- start = region->base;
- end = start + region->size;
-
- reserve_bootmem_region(start, end, nid);
+ if (!memblock_is_noinit(region)) {
+ nid = memblock_get_region_node(region);
+ start = region->base;
+ end = start + region->size;
+ reserve_bootmem_region(start, end, nid);
+ }
}
}

--
2.25.1


2023-07-28 05:02:07

by Mika Penttilä

Subject: Re: [v1 4/6] memblock: introduce MEMBLOCK_RSRV_NOINIT flag

Hi,

On 7/27/23 23:46, Usama Arif wrote:

> For reserved memory regions marked with this flag,
> reserve_bootmem_region is not called during memmap_init_reserved_pages.
> This can be used to avoid struct page initialization for
> regions which won't need them, for e.g. hugepages with
> HVO enabled.
>
> Signed-off-by: Usama Arif <[email protected]>
> ---
> include/linux/memblock.h | 7 +++++++
> mm/memblock.c | 32 ++++++++++++++++++++++++++------
> 2 files changed, 33 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index f71ff9f0ec81..7f9d06c08592 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -47,6 +47,7 @@ enum memblock_flags {
> MEMBLOCK_MIRROR = 0x2, /* mirrored region */
> MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */
> MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */
> + MEMBLOCK_RSRV_NOINIT = 0x10, /* don't call reserve_bootmem_region for this region */
> };
>
> /**
> @@ -125,6 +126,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
> int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
> int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
> int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
> +int memblock_rsrv_mark_noinit(phys_addr_t base, phys_addr_t size);
>
> void memblock_free_all(void);
> void memblock_free(void *ptr, size_t size);
> @@ -259,6 +261,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
> return m->flags & MEMBLOCK_NOMAP;
> }
>
> +static inline bool memblock_is_noinit(struct memblock_region *m)
> +{
> + return m->flags & MEMBLOCK_RSRV_NOINIT;
> +}
> +
> static inline bool memblock_is_driver_managed(struct memblock_region *m)
> {
> return m->flags & MEMBLOCK_DRIVER_MANAGED;
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 4fd431d16ef2..3a15708af3b6 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -997,6 +997,22 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
> return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP, 0);
> }
>
> +/**
> + * memblock_rsrv_mark_noinit - Mark a reserved memory region with flag MEMBLOCK_RSRV_NOINIT.
> + * @base: the base phys addr of the region
> + * @size: the size of the region
> + *
> + * For memory regions marked with %MEMBLOCK_RSRV_NOINIT, reserve_bootmem_region
> + * is not called during memmap_init_reserved_pages, hence struct pages are not
> + * initialized for this region.
> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +int __init_memblock memblock_rsrv_mark_noinit(phys_addr_t base, phys_addr_t size)
> +{
> + return memblock_setclr_flag(base, size, 1, MEMBLOCK_RSRV_NOINIT, 1);
> +}
> +
> static bool should_skip_region(struct memblock_type *type,
> struct memblock_region *m,
> int nid, int flags)
> @@ -2113,13 +2129,17 @@ static void __init memmap_init_reserved_pages(void)
> memblock_set_node(start, end, &memblock.reserved, nid);
> }
>
> - /* initialize struct pages for the reserved regions */
> + /*
> + * initialize struct pages for reserved regions that don't have
> + * the MEMBLOCK_RSRV_NOINIT flag set
> + */
> for_each_reserved_mem_region(region) {
> - nid = memblock_get_region_node(region);
> - start = region->base;
> - end = start + region->size;
> -
> - reserve_bootmem_region(start, end, nid);
> + if (!memblock_is_noinit(region)) {
> + nid = memblock_get_region_node(region);
> + start = region->base;
> + end = start + region->size;
> + reserve_bootmem_region(start, end, nid);
> + }
> }
> }
>

There's code like:

static inline void free_vmemmap_page(struct page *page)
{
if (PageReserved(page))
free_bootmem_page(page);
else
__free_page(page);
}

which depends on the PageReserved flag being set in the vmemmap pages, so I think you can't skip that part?

--Mika



2023-07-28 08:45:38

by kernel test robot

Subject: Re: [v1 1/6] mm: hugetlb: Skip prep of tail pages when HVO is enabled

Hi Usama,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url: https://github.com/intel-lab-lkp/linux/commits/Usama-Arif/mm-hugetlb-Skip-prep-of-tail-pages-when-HVO-is-enabled/20230728-044839
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20230727204624.1942372-2-usama.arif%40bytedance.com
patch subject: [v1 1/6] mm: hugetlb: Skip prep of tail pages when HVO is enabled
config: arm64-randconfig-r016-20230727 (https://download.01.org/0day-ci/archive/20230728/[email protected]/config)
compiler: aarch64-linux-gcc (GCC) 12.3.0
reproduce: (https://download.01.org/0day-ci/archive/20230728/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All warnings (new ones prefixed by >>):

In file included from mm/hugetlb.c:49:
>> mm/hugetlb_vmemmap.h:56:6: warning: no previous prototype for 'vmemmap_should_optimize' [-Wmissing-prototypes]
56 | bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
| ^~~~~~~~~~~~~~~~~~~~~~~


vim +/vmemmap_should_optimize +56 mm/hugetlb_vmemmap.h

55
> 56 bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
57 {
58 return false;
59 }
60

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-07-28 12:41:52

by kernel test robot

Subject: Re: [v1 1/6] mm: hugetlb: Skip prep of tail pages when HVO is enabled

Hi Usama,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url: https://github.com/intel-lab-lkp/linux/commits/Usama-Arif/mm-hugetlb-Skip-prep-of-tail-pages-when-HVO-is-enabled/20230728-044839
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20230727204624.1942372-2-usama.arif%40bytedance.com
patch subject: [v1 1/6] mm: hugetlb: Skip prep of tail pages when HVO is enabled
config: arm64-randconfig-r032-20230727 (https://download.01.org/0day-ci/archive/20230728/[email protected]/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce: (https://download.01.org/0day-ci/archive/20230728/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All warnings (new ones prefixed by >>):

In file included from mm/hugetlb.c:49:
>> mm/hugetlb_vmemmap.h:56:6: warning: no previous prototype for function 'vmemmap_should_optimize' [-Wmissing-prototypes]
56 | bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
| ^
mm/hugetlb_vmemmap.h:56:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
56 | bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
| ^
| static
1 warning generated.


vim +/vmemmap_should_optimize +56 mm/hugetlb_vmemmap.h

55
> 56 bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
57 {
58 return false;
59 }
60

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-07-28 15:55:49

by Usama Arif

Subject: Re: [External] Re: [v1 4/6] memblock: introduce MEMBLOCK_RSRV_NOINIT flag



On 28/07/2023 05:30, Mika Penttilä wrote:
> Hi,
>
> On 7/27/23 23:46, Usama Arif wrote:
>
>> For reserved memory regions marked with this flag,
>> reserve_bootmem_region is not called during memmap_init_reserved_pages.
>> This can be used to avoid struct page initialization for
>> regions which won't need them, for e.g. hugepages with
>> HVO enabled.
>>
>> Signed-off-by: Usama Arif <[email protected]>
>> ---
>>   include/linux/memblock.h |  7 +++++++
>>   mm/memblock.c            | 32 ++++++++++++++++++++++++++------
>>   2 files changed, 33 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
>> index f71ff9f0ec81..7f9d06c08592 100644
>> --- a/include/linux/memblock.h
>> +++ b/include/linux/memblock.h
>> @@ -47,6 +47,7 @@ enum memblock_flags {
>>       MEMBLOCK_MIRROR        = 0x2,    /* mirrored region */
>>       MEMBLOCK_NOMAP        = 0x4,    /* don't add to kernel direct
>> mapping */
>>       MEMBLOCK_DRIVER_MANAGED = 0x8,    /* always detected via a
>> driver */
>> +    MEMBLOCK_RSRV_NOINIT    = 0x10,    /* don't call
>> reserve_bootmem_region for this region */
>>   };
>>   /**
>> @@ -125,6 +126,7 @@ int memblock_clear_hotplug(phys_addr_t base,
>> phys_addr_t size);
>>   int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
>>   int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
>>   int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
>> +int memblock_rsrv_mark_noinit(phys_addr_t base, phys_addr_t size);
>>   void memblock_free_all(void);
>>   void memblock_free(void *ptr, size_t size);
>> @@ -259,6 +261,11 @@ static inline bool memblock_is_nomap(struct
>> memblock_region *m)
>>       return m->flags & MEMBLOCK_NOMAP;
>>   }
>> +static inline bool memblock_is_noinit(struct memblock_region *m)
>> +{
>> +    return m->flags & MEMBLOCK_RSRV_NOINIT;
>> +}
>> +
>>   static inline bool memblock_is_driver_managed(struct memblock_region
>> *m)
>>   {
>>       return m->flags & MEMBLOCK_DRIVER_MANAGED;
>> diff --git a/mm/memblock.c b/mm/memblock.c
>> index 4fd431d16ef2..3a15708af3b6 100644
>> --- a/mm/memblock.c
>> +++ b/mm/memblock.c
>> @@ -997,6 +997,22 @@ int __init_memblock
>> memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
>>       return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP, 0);
>>   }
>> +/**
>> + * memblock_rsrv_mark_noinit - Mark a reserved memory region with
>> flag MEMBLOCK_RSRV_NOINIT.
>> + * @base: the base phys addr of the region
>> + * @size: the size of the region
>> + *
>> + * For memory regions marked with %MEMBLOCK_RSRV_NOINIT,
>> reserve_bootmem_region
>> + * is not called during memmap_init_reserved_pages, hence struct
>> pages are not
>> + * initialized for this region.
>> + *
>> + * Return: 0 on success, -errno on failure.
>> + */
>> +int __init_memblock memblock_rsrv_mark_noinit(phys_addr_t base,
>> phys_addr_t size)
>> +{
>> +    return memblock_setclr_flag(base, size, 1, MEMBLOCK_RSRV_NOINIT, 1);
>> +}
>> +
>>   static bool should_skip_region(struct memblock_type *type,
>>                      struct memblock_region *m,
>>                      int nid, int flags)
>> @@ -2113,13 +2129,17 @@ static void __init
>> memmap_init_reserved_pages(void)
>>           memblock_set_node(start, end, &memblock.reserved, nid);
>>       }
>> -    /* initialize struct pages for the reserved regions */
>> +    /*
>> +     * initialize struct pages for reserved regions that don't have
>> +     * the MEMBLOCK_RSRV_NOINIT flag set
>> +     */
>>       for_each_reserved_mem_region(region) {
>> -        nid = memblock_get_region_node(region);
>> -        start = region->base;
>> -        end = start + region->size;
>> -
>> -        reserve_bootmem_region(start, end, nid);
>> +        if (!memblock_is_noinit(region)) {
>> +            nid = memblock_get_region_node(region);
>> +            start = region->base;
>> +            end = start + region->size;
>> +            reserve_bootmem_region(start, end, nid);
>> +        }
>>       }
>>   }
>
> There's code like:
>
> static inline void free_vmemmap_page(struct page *page)
> {
>         if (PageReserved(page))
>                 free_bootmem_page(page);
>         else
>                 __free_page(page);
> }
>
> which depends on the PageReserved flag being set in the vmemmap pages, so
> I think you can't skip that part?
>

free_vmemmap_page_list (free_vmemmap_page) is called on struct pages
(referred to as [1]) that describe the memory *which contains* the
struct pages (referred to as [2]) for the hugepage. The above
if (!memblock_is_noinit(region)) check only skips reserve_bootmem_region
for the struct pages [2] of the hugepage. The struct pages [1] are not
changed by my patch.
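
Schematically (assuming a 1G hugepage, 4K base pages, 64-byte struct page):

  struct pages [2]: the 262144 struct pages describing the hugepage's 4K
                    subpages, stored in 4096 vmemmap pages; HVO frees all
                    but 64 of them, and this series skips initializing
                    the ones that get freed.
  struct pages [1]: the 4096 struct pages describing those vmemmap pages
                    themselves; free_vmemmap_page() runs on these, and
                    they are still reserved and initialized as before.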

As an experiment, if I run the diff at the bottom with and without these
patches, I get the same log "HugeTLB: reserved pages 4096, normal pages
0", which means those struct pages are treated the same with and without
these patches. (It's 4096 because 262144 struct pages [2] per hugepage
* 64 bytes per struct page / PAGE_SIZE = 4096 struct pages [1].)

Also, I should have mentioned in the cover letter that I used cat
/proc/meminfo to make sure it was working as expected, reserving 500 1G
hugepages with and without these patches.

When hugetlb_free_vmemmap=on:
MemTotal: 536207112 kB (511.4G)

When hugetlb_free_vmemmap=off:
MemTotal: 528015112 kB (503G)


The expectation is that for 500 1G hugepages, HVO gives a saving of
16380K * 500 = ~8G (16384K of vmemmap per hugepage minus the 4K vmemmap
page that is kept), which is what we see with and without these patches
(511.4G - 503G). These patches didn't affect those numbers.



diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index b5b7834e0f42..bc0ec90552b7 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -208,6 +208,8 @@ static int vmemmap_remap_range(unsigned long start,
unsigned long end,
return 0;
}

+static int i = 0, j = 0;
+
/*
* Free a vmemmap page. A vmemmap page can be allocated from the memblock
* allocator or buddy allocator. If the PG_reserved flag is set, it means
@@ -216,10 +218,14 @@ static int vmemmap_remap_range(unsigned long
start, unsigned long end,
*/
static inline void free_vmemmap_page(struct page *page)
{
- if (PageReserved(page))
+ if (PageReserved(page)) {
+ i++;
free_bootmem_page(page);
- else
+ }
+ else {
+ j++;
__free_page(page);
+ }
}

/* Free a list of the vmemmap pages */
@@ -380,6 +386,7 @@ static int vmemmap_remap_free(unsigned long start,
unsigned long end,

free_vmemmap_page_list(&vmemmap_pages);

+ pr_err("reserved pages %u, normal pages %u", i, j);
return ret;
}





> --Mika
>
>

2023-07-28 16:22:31

by Mika Penttilä

Subject: Re: [External] Re: [v1 4/6] memblock: introduce MEMBLOCK_RSRV_NOINIT flag


On 7/28/23 16:47, Usama Arif wrote:
>
>
> On 28/07/2023 05:30, Mika Penttilä wrote:
>> Hi,
>>
>> On 7/27/23 23:46, Usama Arif wrote:
>>
>>> For reserved memory regions marked with this flag,
>>> reserve_bootmem_region is not called during memmap_init_reserved_pages.
>>> This can be used to avoid struct page initialization for
>>> regions which won't need them, for e.g. hugepages with
>>> HVO enabled.
>>>
>>> Signed-off-by: Usama Arif <[email protected]>
>>> ---
>>>   include/linux/memblock.h |  7 +++++++
>>>   mm/memblock.c            | 32 ++++++++++++++++++++++++++------
>>>   2 files changed, 33 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
>>> index f71ff9f0ec81..7f9d06c08592 100644
>>> --- a/include/linux/memblock.h
>>> +++ b/include/linux/memblock.h
>>> @@ -47,6 +47,7 @@ enum memblock_flags {
>>>       MEMBLOCK_MIRROR        = 0x2,    /* mirrored region */
>>>       MEMBLOCK_NOMAP        = 0x4,    /* don't add to kernel direct
>>> mapping */
>>>       MEMBLOCK_DRIVER_MANAGED = 0x8,    /* always detected via a
>>> driver */
>>> +    MEMBLOCK_RSRV_NOINIT    = 0x10,    /* don't call
>>> reserve_bootmem_region for this region */
>>>   };
>>>   /**
>>> @@ -125,6 +126,7 @@ int memblock_clear_hotplug(phys_addr_t base,
>>> phys_addr_t size);
>>>   int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
>>>   int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
>>>   int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
>>> +int memblock_rsrv_mark_noinit(phys_addr_t base, phys_addr_t size);
>>>   void memblock_free_all(void);
>>>   void memblock_free(void *ptr, size_t size);
>>> @@ -259,6 +261,11 @@ static inline bool memblock_is_nomap(struct
>>> memblock_region *m)
>>>       return m->flags & MEMBLOCK_NOMAP;
>>>   }
>>> +static inline bool memblock_is_noinit(struct memblock_region *m)
>>> +{
>>> +    return m->flags & MEMBLOCK_RSRV_NOINIT;
>>> +}
>>> +
>>>   static inline bool memblock_is_driver_managed(struct
>>> memblock_region *m)
>>>   {
>>>       return m->flags & MEMBLOCK_DRIVER_MANAGED;
>>> diff --git a/mm/memblock.c b/mm/memblock.c
>>> index 4fd431d16ef2..3a15708af3b6 100644
>>> --- a/mm/memblock.c
>>> +++ b/mm/memblock.c
>>> @@ -997,6 +997,22 @@ int __init_memblock
>>> memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
>>>       return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP, 0);
>>>   }
>>> +/**
>>> + * memblock_rsrv_mark_noinit - Mark a reserved memory region with
>>> flag MEMBLOCK_RSRV_NOINIT.
>>> + * @base: the base phys addr of the region
>>> + * @size: the size of the region
>>> + *
>>> + * For memory regions marked with %MEMBLOCK_RSRV_NOINIT,
>>> reserve_bootmem_region
>>> + * is not called during memmap_init_reserved_pages, hence struct
>>> pages are not
>>> + * initialized for this region.
>>> + *
>>> + * Return: 0 on success, -errno on failure.
>>> + */
>>> +int __init_memblock memblock_rsrv_mark_noinit(phys_addr_t base,
>>> phys_addr_t size)
>>> +{
>>> +    return memblock_setclr_flag(base, size, 1,
>>> MEMBLOCK_RSRV_NOINIT, 1);
>>> +}
>>> +
>>>   static bool should_skip_region(struct memblock_type *type,
>>>                      struct memblock_region *m,
>>>                      int nid, int flags)
>>> @@ -2113,13 +2129,17 @@ static void __init
>>> memmap_init_reserved_pages(void)
>>>           memblock_set_node(start, end, &memblock.reserved, nid);
>>>       }
>>> -    /* initialize struct pages for the reserved regions */
>>> +    /*
>>> +     * initialize struct pages for reserved regions that don't have
>>> +     * the MEMBLOCK_RSRV_NOINIT flag set
>>> +     */
>>>       for_each_reserved_mem_region(region) {
>>> -        nid = memblock_get_region_node(region);
>>> -        start = region->base;
>>> -        end = start + region->size;
>>> -
>>> -        reserve_bootmem_region(start, end, nid);
>>> +        if (!memblock_is_noinit(region)) {
>>> +            nid = memblock_get_region_node(region);
>>> +            start = region->base;
>>> +            end = start + region->size;
>>> +            reserve_bootmem_region(start, end, nid);
>>> +        }
>>>       }
>>>   }
>>
>> There's code like:
>>
>> static inline void free_vmemmap_page(struct page *page)
>> {
>>          if (PageReserved(page))
>>                  free_bootmem_page(page);
>>          else
>>                  __free_page(page);
>> }
>>
>> which depends on the PageReserved flag being set in the vmemmap pages,
>> so I think you can't skip that part?
>>
>
> free_vmemmap_page_list (free_vmemmap_page) is called on struct pages
> (refer to as [1]) that point to memory *which contains* the struct
> pages (refer to as [2]) for the hugepage. The above if
> (!memblock_is_noinit(region)) to not reserve_bootmem_region is called
> for the struct pages [2] for the hugepage. struct pages [1] are not
> changed with my patch.
>
> As an experiment if I run the diff at the bottom with and without
> these patches I get the same log "HugeTLB: reserved pages 4096, normal
> pages 0", which means those struct pages are treated the same without
> and without these patches. (Its 4096 as 262144 struct pages [2] per
> hugepage * 64 bytes per struct page / PAGE_SIZE = 4096 struct pages [1] )
>
> Also should have mentioned in cover letter, I used cat /proc/meminfo
> to make sure it was working as expected. Reserving 500 1G hugepages
> with and without these patches when hugetlb_free_vmemmap=on
> MemTotal:       536207112 kB (511.4G)
>
> when hugetlb_free_vmemmap=off
> MemTotal:       528015112 kB (503G)
>
>
> The expectation is that for 500 1G hugepages, when using HVO we have a
> saving of 16380K*500=~8GB which is what we see with and without those
> patches (511.4G - 503G). These patches didnt affect these numbers.
>
>
You are right, thanks for the explanation!


--Mika




>
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index b5b7834e0f42..bc0ec90552b7 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -208,6 +208,8 @@ static int vmemmap_remap_range(unsigned long
> start, unsigned long end,
>         return 0;
>  }
>
> +static int i = 0, j = 0;
> +
>  /*
>   * Free a vmemmap page. A vmemmap page can be allocated from the
> memblock
>   * allocator or buddy allocator. If the PG_reserved flag is set, it
> means
> @@ -216,10 +218,14 @@ static int vmemmap_remap_range(unsigned long
> start, unsigned long end,
>   */
>  static inline void free_vmemmap_page(struct page *page)
>  {
> -       if (PageReserved(page))
> +       if (PageReserved(page)) {
> +               i++;
>                 free_bootmem_page(page);
> -       else
> +       }
> +       else {
> +               j++;
>                 __free_page(page);
> +       }
>  }
>
>  /* Free a list of the vmemmap pages */
> @@ -380,6 +386,7 @@ static int vmemmap_remap_free(unsigned long start,
> unsigned long end,
>
>         free_vmemmap_page_list(&vmemmap_pages);
>
> +       pr_err("reserved pages %u, normal pages %u", i, j);
>         return ret;
>  }
>
>
>
>
>
>> --Mika
>>
>>
>


2023-07-28 17:49:02

by kernel test robot

Subject: Re: [v1 6/6] mm: hugetlb: Skip initialization of struct pages freed later by HVO

Hi Usama,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]

url: https://github.com/intel-lab-lkp/linux/commits/Usama-Arif/mm-hugetlb-Skip-prep-of-tail-pages-when-HVO-is-enabled/20230728-044839
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20230727204624.1942372-7-usama.arif%40bytedance.com
patch subject: [v1 6/6] mm: hugetlb: Skip initialization of struct pages freed later by HVO
config: arm64-randconfig-r032-20230727 (https://download.01.org/0day-ci/archive/20230729/[email protected]/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce: (https://download.01.org/0day-ci/archive/20230729/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

In file included from mm/hugetlb.c:49:
mm/hugetlb_vmemmap.h:56:6: warning: no previous prototype for function 'vmemmap_should_optimize' [-Wmissing-prototypes]
56 | bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
| ^
mm/hugetlb_vmemmap.h:56:1: note: declare 'static' if the function is not intended to be used outside of this translation unit
56 | bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
| ^
| static
>> mm/hugetlb.c:3198:3: error: use of undeclared identifier 'HUGETLB_VMEMMAP_RESERVE_SIZE'
3198 | HUGETLB_VMEMMAP_RESERVE_SIZE * sizeof(struct page);
| ^
1 warning and 1 error generated.


vim +/HUGETLB_VMEMMAP_RESERVE_SIZE +3198 mm/hugetlb.c

3190
3191 int alloc_bootmem_huge_page(struct hstate *h, int nid)
3192 __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
3193 int __alloc_bootmem_huge_page(struct hstate *h, int nid)
3194 {
3195 struct huge_bootmem_page *m = NULL; /* initialize for clang */
3196 int nr_nodes, node;
3197 phys_addr_t hugetlb_vmemmap_reserve_size =
> 3198 HUGETLB_VMEMMAP_RESERVE_SIZE * sizeof(struct page);
3199 phys_addr_t noinit_base;
3200
3201 /* do node specific alloc */
3202 if (nid != NUMA_NO_NODE) {
3203 m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
3204 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
3205 if (!m)
3206 return 0;
3207
3208 if (vmemmap_optimize_enabled && hugetlb_vmemmap_optimizable(h)) {
3209 noinit_base = virt_to_phys(
3210 (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
3211 memblock_rsrv_mark_noinit(
3212 noinit_base,
3213 huge_page_size(h) - hugetlb_vmemmap_reserve_size);
3214 }
3215
3216 goto found;
3217 }
3218 /* allocate from next node when distributing huge pages */
3219 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
3220 m = memblock_alloc_try_nid_raw(
3221 huge_page_size(h), huge_page_size(h),
3222 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
3223 /*
3224 * Use the beginning of the huge page to store the
3225 * huge_bootmem_page struct (until gather_bootmem
3226 * puts them into the mem_map).
3227 */
3228 if (!m)
3229 return 0;
3230
3231 if (vmemmap_optimize_enabled && hugetlb_vmemmap_optimizable(h)) {
3232 noinit_base = virt_to_phys(
3233 (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
3234 memblock_rsrv_mark_noinit(
3235 noinit_base,
3236 huge_page_size(h) - hugetlb_vmemmap_reserve_size);
3237 }
3238
3239 goto found;
3240 }
3241
3242 found:
3243 /* Put them into a private list first because mem_map is not up yet */
3244 INIT_LIST_HEAD(&m->list);
3245 list_add(&m->list, &huge_boot_pages);
3246 m->hstate = h;
3247 return 1;
3248 }
3249

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-07-28 18:15:13

by kernel test robot

Subject: Re: [v1 6/6] mm: hugetlb: Skip initialization of struct pages freed later by HVO

Hi Usama,

kernel test robot noticed the following build warnings:

[auto build test WARNING on akpm-mm/mm-everything]

url: https://github.com/intel-lab-lkp/linux/commits/Usama-Arif/mm-hugetlb-Skip-prep-of-tail-pages-when-HVO-is-enabled/20230728-044839
base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/r/20230727204624.1942372-7-usama.arif%40bytedance.com
patch subject: [v1 6/6] mm: hugetlb: Skip initialization of struct pages freed later by HVO
config: i386-debian-10.3 (https://download.01.org/0day-ci/archive/20230729/[email protected]/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce: (https://download.01.org/0day-ci/archive/20230729/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All warnings (new ones prefixed by >>):

In file included from mm/hugetlb.c:49:
mm/hugetlb_vmemmap.h:56:6: warning: no previous prototype for 'vmemmap_should_optimize' [-Wmissing-prototypes]
56 | bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
| ^~~~~~~~~~~~~~~~~~~~~~~
mm/hugetlb.c: In function '__alloc_bootmem_huge_page':
mm/hugetlb.c:3198:17: error: 'HUGETLB_VMEMMAP_RESERVE_SIZE' undeclared (first use in this function)
3198 | HUGETLB_VMEMMAP_RESERVE_SIZE * sizeof(struct page);
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
mm/hugetlb.c:3198:17: note: each undeclared identifier is reported only once for each function it appears in
>> mm/hugetlb.c:3210:42: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
3210 | (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
| ^
>> mm/hugetlb.c:3210:33: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
3210 | (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
| ^
mm/hugetlb.c:3233:42: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]
3233 | (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
| ^
mm/hugetlb.c:3233:33: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
3233 | (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
| ^


vim +3210 mm/hugetlb.c

3190
3191 int alloc_bootmem_huge_page(struct hstate *h, int nid)
3192 __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
3193 int __alloc_bootmem_huge_page(struct hstate *h, int nid)
3194 {
3195 struct huge_bootmem_page *m = NULL; /* initialize for clang */
3196 int nr_nodes, node;
3197 phys_addr_t hugetlb_vmemmap_reserve_size =
3198 HUGETLB_VMEMMAP_RESERVE_SIZE * sizeof(struct page);
3199 phys_addr_t noinit_base;
3200
3201 /* do node specific alloc */
3202 if (nid != NUMA_NO_NODE) {
3203 m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
3204 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
3205 if (!m)
3206 return 0;
3207
3208 if (vmemmap_optimize_enabled && hugetlb_vmemmap_optimizable(h)) {
3209 noinit_base = virt_to_phys(
> 3210 (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
3211 memblock_rsrv_mark_noinit(
3212 noinit_base,
3213 huge_page_size(h) - hugetlb_vmemmap_reserve_size);
3214 }
3215
3216 goto found;
3217 }
3218 /* allocate from next node when distributing huge pages */
3219 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
3220 m = memblock_alloc_try_nid_raw(
3221 huge_page_size(h), huge_page_size(h),
3222 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
3223 /*
3224 * Use the beginning of the huge page to store the
3225 * huge_bootmem_page struct (until gather_bootmem
3226 * puts them into the mem_map).
3227 */
3228 if (!m)
3229 return 0;
3230
3231 if (vmemmap_optimize_enabled && hugetlb_vmemmap_optimizable(h)) {
3232 noinit_base = virt_to_phys(
3233 (void *)((phys_addr_t) m + hugetlb_vmemmap_reserve_size));
3234 memblock_rsrv_mark_noinit(
3235 noinit_base,
3236 huge_page_size(h) - hugetlb_vmemmap_reserve_size);
3237 }
3238
3239 goto found;
3240 }
3241
3242 found:
3243 /* Put them into a private list first because mem_map is not up yet */
3244 INIT_LIST_HEAD(&m->list);
3245 list_add(&m->list, &huge_boot_pages);
3246 m->hstate = h;
3247 return 1;
3248 }
3249

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-07-29 07:43:17

by Mike Rapoport

Subject: Re: [v1 1/6] mm: hugetlb: Skip prep of tail pages when HVO is enabled

On Thu, Jul 27, 2023 at 09:46:19PM +0100, Usama Arif wrote:
> When vmemmap is optimizable, it will free all the
> duplicated tail pages in hugetlb_vmemmap_optimize while
> preparing the new hugepage. Hence, there is no need to
> prepare them.
>
> For 1G x86 hugepages, it avoids preparing
> 262144 - 64 = 262080 struct pages per hugepage.
>
> Signed-off-by: Usama Arif <[email protected]>
> ---
> mm/hugetlb.c | 32 +++++++++++++++++++++++---------
> mm/hugetlb_vmemmap.c | 2 +-
> mm/hugetlb_vmemmap.h | 7 +++++++
> 3 files changed, 31 insertions(+), 10 deletions(-)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 64a3239b6407..58cf5978bee1 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1943,13 +1943,24 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni
> }
>
> static bool __prep_compound_gigantic_folio(struct folio *folio,
> - unsigned int order, bool demote)
> + unsigned int order, bool demote,
> + bool hvo)

I think it would be cleaner to pass struct hstate * instead of order here
so that order and hvo can be computed locally.
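
Something like (untested sketch):

static bool __prep_compound_gigantic_folio(struct folio *folio,
					   struct hstate *h, bool demote)
{
	unsigned int order = huge_page_order(h);
	bool hvo = vmemmap_should_optimize(h, &folio->page);

	/* ... body as before, using order and hvo ... */
}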

> {
> int i, j;
> int nr_pages = 1 << order;
> struct page *p;
>
> __folio_clear_reserved(folio);
> +
> +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> + /*
> + * No need to prep pages that will be freed later by hugetlb_vmemmap_optimize
> + * in prep_new_huge_page. Hence, reduce nr_pages to the pages that will be kept.
> + */
> + if (hvo)

if (IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP) && hvo)

is better than ifdef IMO.

> + nr_pages = HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page);
> +#endif
> +
> for (i = 0; i < nr_pages; i++) {
> p = folio_page(folio, i);
>
> @@ -2020,15 +2031,15 @@ static bool __prep_compound_gigantic_folio(struct folio *folio,
> }
>
> static bool prep_compound_gigantic_folio(struct folio *folio,
> - unsigned int order)
> + unsigned int order, bool hvo)
> {
> - return __prep_compound_gigantic_folio(folio, order, false);
> + return __prep_compound_gigantic_folio(folio, order, false, hvo);
> }
>
> static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
> - unsigned int order)
> + unsigned int order, bool hvo)
> {
> - return __prep_compound_gigantic_folio(folio, order, true);
> + return __prep_compound_gigantic_folio(folio, order, true, hvo);
> }
>
> /*
> @@ -2185,7 +2196,8 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
> if (!folio)
> return NULL;
> if (hstate_is_gigantic(h)) {
> - if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
> + if (!prep_compound_gigantic_folio(folio, huge_page_order(h),
> + vmemmap_should_optimize(h, &folio->page))) {
> /*
> * Rare failure to convert pages to compound page.
> * Free pages and try again - ONCE!
> @@ -3201,7 +3213,8 @@ static void __init gather_bootmem_prealloc(void)
>
> VM_BUG_ON(!hstate_is_gigantic(h));
> WARN_ON(folio_ref_count(folio) != 1);
> - if (prep_compound_gigantic_folio(folio, huge_page_order(h))) {
> + if (prep_compound_gigantic_folio(folio, huge_page_order(h),
> + vmemmap_should_optimize(h, page))) {
> WARN_ON(folio_test_reserved(folio));
> prep_new_hugetlb_folio(h, folio, folio_nid(folio));
> free_huge_page(page); /* add to the hugepage allocator */
> @@ -3624,8 +3637,9 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
> subpage = folio_page(folio, i);
> inner_folio = page_folio(subpage);
> if (hstate_is_gigantic(target_hstate))
> - prep_compound_gigantic_folio_for_demote(inner_folio,
> - target_hstate->order);
> + prep_compound_gigantic_folio_for_demote(folio,
> + target_hstate->order,
> + vmemmap_should_optimize(target_hstate, subpage));
> else
> prep_compound_page(subpage, target_hstate->order);
> folio_change_private(inner_folio, NULL);
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index c2007ef5e9b0..b721e87de2b3 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -486,7 +486,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head)
> }
>
> /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
> -static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
> +bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
> {
> if (!READ_ONCE(vmemmap_optimize_enabled))
> return false;
> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> index 25bd0e002431..07555d2dc0cb 100644
> --- a/mm/hugetlb_vmemmap.h
> +++ b/mm/hugetlb_vmemmap.h
> @@ -13,6 +13,7 @@
> #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head);
> void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head);
> +bool vmemmap_should_optimize(const struct hstate *h, const struct page *head);
>
> /*
> * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
> @@ -51,6 +52,12 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate
> {
> return 0;
> }
> +
> +bool vmemmap_should_optimize(const struct hstate *h, const struct page *head)
> +{
> + return false;
> +}
> +
> #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
>
> static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
> --
> 2.25.1
>

--
Sincerely yours,
Mike.

2023-07-29 07:55:45

by Mike Rapoport

Subject: Re: [v1 4/6] memblock: introduce MEMBLOCK_RSRV_NOINIT flag

On Thu, Jul 27, 2023 at 09:46:22PM +0100, Usama Arif wrote:
> For reserved memory regions marked with this flag,
> reserve_bootmem_region is not called during memmap_init_reserved_pages.
> This can be used to avoid struct page initialization for
> regions which won't need them, for e.g. hugepages with
> HVO enabled.
>
> Signed-off-by: Usama Arif <[email protected]>
> ---
> include/linux/memblock.h | 7 +++++++
> mm/memblock.c | 32 ++++++++++++++++++++++++++------
> 2 files changed, 33 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index f71ff9f0ec81..7f9d06c08592 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -47,6 +47,7 @@ enum memblock_flags {
> MEMBLOCK_MIRROR = 0x2, /* mirrored region */
> MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */
> MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */
> + MEMBLOCK_RSRV_NOINIT = 0x10, /* don't call reserve_bootmem_region for this region */

The comment should reflect what it does, not how.
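
E.g. something like:

	MEMBLOCK_RSRV_NOINIT	= 0x10,	/* struct pages are not initialized for this region */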

> };
>
> /**
> @@ -125,6 +126,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
> int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
> int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
> int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
> +int memblock_rsrv_mark_noinit(phys_addr_t base, phys_addr_t size);

Please spell out reserved here.

> void memblock_free_all(void);
> void memblock_free(void *ptr, size_t size);
> @@ -259,6 +261,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m)
> return m->flags & MEMBLOCK_NOMAP;
> }
>
> +static inline bool memblock_is_noinit(struct memblock_region *m)
> +{
> + return m->flags & MEMBLOCK_RSRV_NOINIT;
> +}
> +
> static inline bool memblock_is_driver_managed(struct memblock_region *m)
> {
> return m->flags & MEMBLOCK_DRIVER_MANAGED;
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 4fd431d16ef2..3a15708af3b6 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -997,6 +997,22 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
> return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP, 0);
> }
>
> +/**
> + * memblock_rsrv_mark_noinit - Mark a reserved memory region with flag MEMBLOCK_RSRV_NOINIT.
> + * @base: the base phys addr of the region
> + * @size: the size of the region
> + *
> + * For memory regions marked with %MEMBLOCK_RSRV_NOINIT, reserve_bootmem_region
> + * is not called during memmap_init_reserved_pages, hence struct pages are not
> + * initialized for this region.

Here as well, the "how" is much less important. You should emphasize
that struct pages for MEMBLOCK_RSRV_NOINIT regions are not
initialized.
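
E.g. (also with "reserved" spelled out in the name, as suggested above):

/**
 * memblock_reserved_mark_noinit - Mark a reserved memory region with
 * the MEMBLOCK_RSRV_NOINIT flag.
 * @base: the base phys addr of the region
 * @size: the size of the region
 *
 * struct pages are not initialized for reserved regions marked with
 * %MEMBLOCK_RSRV_NOINIT.
 *
 * Return: 0 on success, -errno on failure.
 */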

> + *
> + * Return: 0 on success, -errno on failure.
> + */
> +int __init_memblock memblock_rsrv_mark_noinit(phys_addr_t base, phys_addr_t size)
> +{
> + return memblock_setclr_flag(base, size, 1, MEMBLOCK_RSRV_NOINIT, 1);
> +}
> +
> static bool should_skip_region(struct memblock_type *type,
> struct memblock_region *m,
> int nid, int flags)
> @@ -2113,13 +2129,17 @@ static void __init memmap_init_reserved_pages(void)
> memblock_set_node(start, end, &memblock.reserved, nid);
> }
>
> - /* initialize struct pages for the reserved regions */
> + /*
> + * initialize struct pages for reserved regions that don't have
> + * the MEMBLOCK_RSRV_NOINIT flag set
> + */
> for_each_reserved_mem_region(region) {
> - nid = memblock_get_region_node(region);
> - start = region->base;
> - end = start + region->size;
> -
> - reserve_bootmem_region(start, end, nid);
> + if (!memblock_is_noinit(region)) {
> + nid = memblock_get_region_node(region);
> + start = region->base;
> + end = start + region->size;

Please keep the empty line here.

> + reserve_bootmem_region(start, end, nid);
> + }
> }
> }
>
> --
> 2.25.1
>

--
Sincerely yours,
Mike.

2023-07-29 07:57:15

by Mike Rapoport

Subject: Re: [v1 5/6] mm: move allocation of gigantic hstates to the start of mm_core_init

On Thu, Jul 27, 2023 at 09:46:23PM +0100, Usama Arif wrote:
> Whether the initialization of tail struct pages of a hugepage
> happens or not will become dependent on the commandline
> parameter hugetlb_free_vmemmap in the future. Hence,
> hugetlb_hstate_alloc_pages needs to be after command line parameters
> are parsed and the start of mm_core_init is a good point.
>
> Signed-off-by: Usama Arif <[email protected]>
> ---
> include/linux/hugetlb.h | 1 +
> mm/hugetlb.c | 18 ++++++++++--------
> mm/mm_init.c | 4 ++++
> 3 files changed, 15 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index ca3c8e10f24a..2b20553deef3 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -1271,4 +1271,5 @@ hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
> return huge_pte_offset(vma->vm_mm, addr, sz);
> }
>
> +void __init hugetlb_hstate_alloc_gigantic_pages(void);

This should be in mm/internal.h, with a static inline stub for !CONFIG_HUGETLBFS.
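
E.g.:

/* in mm/internal.h */
#ifdef CONFIG_HUGETLBFS
void __init hugetlb_hstate_alloc_gigantic_pages(void);
#else
static inline void hugetlb_hstate_alloc_gigantic_pages(void)
{
}
#endif

which would also let mm_core_init() call it without the #ifdef.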

> #endif /* _LINUX_HUGETLB_H */
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 58cf5978bee1..c1fcf2af591a 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -4418,14 +4418,6 @@ static int __init hugepages_setup(char *s)
> }
> }
>
> - /*
> - * Global state is always initialized later in hugetlb_init.
> - * But we need to allocate gigantic hstates here early to still
> - * use the bootmem allocator.
> - */
> - if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
> - hugetlb_hstate_alloc_pages(parsed_hstate);
> -
> last_mhp = mhp;
>
> return 1;
> @@ -4437,6 +4429,16 @@ static int __init hugepages_setup(char *s)
> }
> __setup("hugepages=", hugepages_setup);
>
> +void __init hugetlb_hstate_alloc_gigantic_pages(void)
> +{
> + int i;
> +
> + for (i = 0; i < HUGE_MAX_HSTATE; i++) {
> + if (hstate_is_gigantic(&hstates[i]))
> + hugetlb_hstate_alloc_pages(&hstates[i]);
> + }
> +}
> +
> /*
> * hugepagesz command line processing
> * A specific huge page size can only be specified once with hugepagesz.
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index a1963c3322af..5585c66c3c42 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -26,6 +26,7 @@
> #include <linux/pgtable.h>
> #include <linux/swap.h>
> #include <linux/cma.h>
> +#include <linux/hugetlb.h>
> #include "internal.h"
> #include "slab.h"
> #include "shuffle.h"
> @@ -2768,6 +2769,9 @@ static void __init mem_init_print_info(void)
> */
> void __init mm_core_init(void)
> {
> +#ifdef CONFIG_HUGETLBFS
> + hugetlb_hstate_alloc_gigantic_pages();
> +#endif

Please add a comment why it should be called here.
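
E.g. something along the lines of:

	/*
	 * Gigantic hstates must be allocated from memblock before the
	 * buddy allocator takes over, but after the command line is
	 * parsed so that hugetlb_free_vmemmap is known.
	 */
	hugetlb_hstate_alloc_gigantic_pages();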

> /* Initializations relying on SMP setup */
> build_all_zonelists(NULL);
> page_alloc_init_cpuhp();
> --
> 2.25.1
>

--
Sincerely yours,
Mike.