Hi all, hugetlb init parallelization has now been updated to v2.
To David Hildenbrand: the padata multithread utilities have been used to
reduce code complexity.
To David Rientjes: The patch for measuring time will be separately included
in the reply. Please test during your free time, thanks.
# Introduction
Hugetlb initialization during boot takes up a considerable amount of time.
For instance, on a 2TB system, initializing 1,800 1GB huge pages takes 1-2
seconds out of 10 seconds. Initializing 11,776 1GB pages on a 12TB Intel
host takes 65.2 seconds [1], which is 17.4% of the total 373.78 seconds boot
time. This is a noteworthy figure.
Inspired by [2] and [3], hugetlb initialization can also be accelerated
through parallelization. The kernel already has infrastructure like
padata_do_multithreaded; this patch uses it to achieve effective results
with minimal modifications.
[1] https://lore.kernel.org/all/[email protected]/
[2] https://lore.kernel.org/all/[email protected]/
[3] https://lore.kernel.org/all/[email protected]/
# Test result
test no patch(ms) patched(ms) saved
------------------- -------------- ------------- --------
256c2t(4 node) 2M 2624 956 63.57%
256c2t(4 node) 1G 2679 1582 40.95%
128c1t(2 node) 2M 1788 684 61.74%
128c1t(2 node) 1G 3160 1618 48.80%
# Change log
Changes in v2:
- Reduce complexity with `padata_do_multithreaded`
- Support 1G hugetlb
v1:
- https://lore.kernel.org/all/[email protected]/
- parallelize 2M hugetlb initialization with workqueue
Gang Li (5):
hugetlb: code clean for hugetlb_hstate_alloc_pages
hugetlb: split hugetlb_hstate_alloc_pages
padata: dispatch works on different nodes
hugetlb: parallelize 2M hugetlb allocation and initialization
hugetlb: parallelize 1G hugetlb initialization
include/linux/hugetlb.h | 2 +-
include/linux/padata.h | 2 +
kernel/padata.c | 8 +-
mm/hugetlb.c | 201 +++++++++++++++++++++++++++-------------
mm/mm_init.c | 1 +
5 files changed, 148 insertions(+), 66 deletions(-)
--
2.30.2
When a group of tasks that access different nodes are scheduled on the
same node, they may encounter bandwidth bottlenecks and access latency.
Thus, the numa_aware flag is introduced here, allowing tasks to be
distributed across different nodes to fully utilize the advantage of
multi-node systems.
Signed-off-by: Gang Li <[email protected]>
---
include/linux/padata.h | 2 ++
kernel/padata.c | 8 ++++++--
mm/mm_init.c | 1 +
3 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 495b16b6b4d72..f6c58c30ed96a 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -137,6 +137,7 @@ struct padata_shell {
* appropriate for one worker thread to do at once.
* @max_threads: Max threads to use for the job, actual number may be less
* depending on task size and minimum chunk size.
+ * @numa_aware: Dispatch jobs to different nodes.
*/
struct padata_mt_job {
void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
@@ -146,6 +147,7 @@ struct padata_mt_job {
unsigned long align;
unsigned long min_chunk;
int max_threads;
+ bool numa_aware;
};
/**
diff --git a/kernel/padata.c b/kernel/padata.c
index 179fb1518070c..80f82c563e46a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -485,7 +485,7 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
struct padata_work my_work, *pw;
struct padata_mt_job_state ps;
LIST_HEAD(works);
- int nworks;
+ int nworks, nid;
if (job->size == 0)
return;
@@ -517,7 +517,11 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
ps.chunk_size = roundup(ps.chunk_size, job->align);
list_for_each_entry(pw, &works, pw_list)
- queue_work(system_unbound_wq, &pw->pw_work);
+ if (job->numa_aware)
+ queue_work_node((++nid % num_node_state(N_MEMORY)),
+ system_unbound_wq, &pw->pw_work);
+ else
+ queue_work(system_unbound_wq, &pw->pw_work);
/* Use the current thread, which saves starting a workqueue worker. */
padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 077bfe393b5e2..1226f0c81fcb3 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2234,6 +2234,7 @@ static int __init deferred_init_memmap(void *data)
.align = PAGES_PER_SECTION,
.min_chunk = PAGES_PER_SECTION,
.max_threads = max_threads,
+ .numa_aware = false,
};
padata_do_multithreaded(&job);
--
2.30.2
1G and 2M huge pages have different allocation and initialization logic,
which leads to subtle differences in parallelization. Therefore, it is
appropriate to split hugetlb_hstate_alloc_pages into gigantic and
non-gigantic.
This patch has no functional changes.
Signed-off-by: Gang Li <[email protected]>
---
mm/hugetlb.c | 86 +++++++++++++++++++++++++++-------------------------
1 file changed, 45 insertions(+), 41 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 252d6866a0af8..8de1653fc4c4f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3502,6 +3502,47 @@ static void __init hugetlb_hstate_alloc_pages_report(unsigned long allocated, st
}
}
+static unsigned long __init hugetlb_hstate_alloc_pages_gigantic(struct hstate *h)
+{
+ unsigned long i;
+
+ for (i = 0; i < h->max_huge_pages; ++i) {
+ /*
+ * gigantic pages not added to list as they are not
+ * added to pools now.
+ */
+ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
+ break;
+ cond_resched();
+ }
+
+ return i;
+}
+
+static unsigned long __init hugetlb_hstate_alloc_pages_non_gigantic(struct hstate *h)
+{
+ unsigned long i;
+ struct folio *folio;
+ LIST_HEAD(folio_list);
+ nodemask_t node_alloc_noretry;
+
+ /* Bit mask controlling how hard we retry per-node allocations.*/
+ nodes_clear(node_alloc_noretry);
+
+ for (i = 0; i < h->max_huge_pages; ++i) {
+ folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+ &node_alloc_noretry);
+ if (!folio)
+ break;
+ list_add(&folio->lru, &folio_list);
+ cond_resched();
+ }
+
+ prep_and_add_allocated_folios(h, &folio_list);
+
+ return i;
+}
+
/*
* NOTE: this routine is called in different contexts for gigantic and
* non-gigantic pages.
@@ -3515,10 +3556,7 @@ static void __init hugetlb_hstate_alloc_pages_report(unsigned long allocated, st
*/
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
- unsigned long i;
- struct folio *folio;
- LIST_HEAD(folio_list);
- nodemask_t *node_alloc_noretry;
+ unsigned long allocated;
/* skip gigantic hugepages allocation if hugetlb_cma enabled */
if (hstate_is_gigantic(h) && hugetlb_cma_size) {
@@ -3532,46 +3570,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
/* below will do all node balanced alloc */
if (!hstate_is_gigantic(h)) {
- /*
- * Bit mask controlling how hard we retry per-node allocations.
- * Ignore errors as lower level routines can deal with
- * node_alloc_noretry == NULL. If this kmalloc fails at boot
- * time, we are likely in bigger trouble.
- */
- node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
- GFP_KERNEL);
+ allocated = hugetlb_hstate_alloc_pages_non_gigantic(h);
} else {
- /* allocations done at boot time */
- node_alloc_noretry = NULL;
- }
-
- /* bit mask controlling how hard we retry per-node allocations */
- if (node_alloc_noretry)
- nodes_clear(*node_alloc_noretry);
-
- for (i = 0; i < h->max_huge_pages; ++i) {
- if (hstate_is_gigantic(h)) {
- /*
- * gigantic pages not added to list as they are not
- * added to pools now.
- */
- if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
- break;
- } else {
- folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
- node_alloc_noretry);
- if (!folio)
- break;
- list_add(&folio->lru, &folio_list);
- }
- cond_resched();
+ allocated = hugetlb_hstate_alloc_pages_gigantic(h);
}
- /* list will be empty if hstate_is_gigantic */
- prep_and_add_allocated_folios(h, &folio_list);
-
- hugetlb_hstate_alloc_pages_report(i, h);
- kfree(node_alloc_noretry);
+ hugetlb_hstate_alloc_pages_report(allocated, h);
}
static void __init hugetlb_init_hstates(void)
--
2.30.2
The readability of `hugetlb_hstate_alloc_pages` is poor. By cleaning the
code, its readability can be improved, facilitating future modifications.
This patch extracts two functions to reduce the complexity of
`hugetlb_hstate_alloc_pages` and has no functional changes.
- hugetlb_hstate_alloc_pages_node_specific() to iterate through each
online node and perform allocation if necessary.
- hugetlb_hstate_alloc_pages_report() to report errors during allocation.
The value of h->max_huge_pages is updated accordingly.
Signed-off-by: Gang Li <[email protected]>
---
mm/hugetlb.c | 46 +++++++++++++++++++++++++++++-----------------
1 file changed, 29 insertions(+), 17 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51f50bb3dc092..252d6866a0af8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3475,6 +3475,33 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
h->max_huge_pages_node[nid] = i;
}
+static bool __init hugetlb_hstate_alloc_pages_node_specific(struct hstate *h)
+{
+ int i;
+ bool node_specific_alloc = false;
+
+ for_each_online_node(i) {
+ if (h->max_huge_pages_node[i] > 0) {
+ hugetlb_hstate_alloc_pages_onenode(h, i);
+ node_specific_alloc = true;
+ }
+ }
+
+ return node_specific_alloc;
+}
+
+static void __init hugetlb_hstate_alloc_pages_report(unsigned long allocated, struct hstate *h)
+{
+ if (allocated < h->max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
+ h->max_huge_pages, buf, allocated);
+ h->max_huge_pages = allocated;
+ }
+}
+
/*
* NOTE: this routine is called in different contexts for gigantic and
* non-gigantic pages.
@@ -3492,7 +3519,6 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
struct folio *folio;
LIST_HEAD(folio_list);
nodemask_t *node_alloc_noretry;
- bool node_specific_alloc = false;
/* skip gigantic hugepages allocation if hugetlb_cma enabled */
if (hstate_is_gigantic(h) && hugetlb_cma_size) {
@@ -3501,14 +3527,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
}
/* do node specific alloc */
- for_each_online_node(i) {
- if (h->max_huge_pages_node[i] > 0) {
- hugetlb_hstate_alloc_pages_onenode(h, i);
- node_specific_alloc = true;
- }
- }
-
- if (node_specific_alloc)
+ if (hugetlb_hstate_alloc_pages_node_specific(h))
return;
/* below will do all node balanced alloc */
@@ -3551,14 +3570,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
/* list will be empty if hstate_is_gigantic */
prep_and_add_allocated_folios(h, &folio_list);
- if (i < h->max_huge_pages) {
- char buf[32];
-
- string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
- pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
- h->max_huge_pages, buf, i);
- h->max_huge_pages = i;
- }
+ hugetlb_hstate_alloc_pages_report(i, h);
kfree(node_alloc_noretry);
}
--
2.30.2
By distributing both the allocation and the initialization tasks across
multiple threads, the initialization of 2M hugetlb will be faster,
thereby improving the boot speed.
This patch can achieve 60% improvement in performance.
test no patch(ms) patched(ms) saved
------------------- -------------- ------------- --------
256c2t(4 node) 2M 2624 956 63.57%
128c1t(2 node) 2M 1788 684 61.74%
Signed-off-by: Gang Li <[email protected]>
---
mm/hugetlb.c | 71 ++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 52 insertions(+), 19 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8de1653fc4c4f..033e359fdb86b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,6 +35,7 @@
#include <linux/delayacct.h>
#include <linux/memory.h>
#include <linux/mm_inline.h>
+#include <linux/padata.h>
#include <asm/page.h>
#include <asm/pgalloc.h>
@@ -3502,6 +3503,37 @@ static void __init hugetlb_hstate_alloc_pages_report(unsigned long allocated, st
}
}
+static void __init hugetlb_alloc_node(unsigned long start, unsigned long end, void *arg)
+{
+ struct hstate *h = (struct hstate *)arg;
+ int i, num = end - start;
+ nodemask_t node_alloc_noretry;
+ unsigned long flags;
+
+ /* Bit mask controlling how hard we retry per-node allocations.*/
+ nodes_clear(node_alloc_noretry);
+
+ for (i = 0; i < num; ++i) {
+ struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+ &node_alloc_noretry);
+ if (!folio)
+ break;
+ spin_lock_irqsave(&hugetlb_lock, flags);
+ __prep_account_new_huge_page(h, folio_nid(folio));
+ enqueue_hugetlb_folio(h, folio);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
+ cond_resched();
+ }
+}
+
+static void __init hugetlb_vmemmap_optimize_node(unsigned long start, unsigned long end, void *arg)
+{
+ struct hstate *h = (struct hstate *)arg;
+ int nid = start;
+
+ hugetlb_vmemmap_optimize_folios(h, &h->hugepage_freelists[nid]);
+}
+
static unsigned long __init hugetlb_hstate_alloc_pages_gigantic(struct hstate *h)
{
unsigned long i;
@@ -3521,26 +3553,27 @@ static unsigned long __init hugetlb_hstate_alloc_pages_gigantic(struct hstate *h
static unsigned long __init hugetlb_hstate_alloc_pages_non_gigantic(struct hstate *h)
{
- unsigned long i;
- struct folio *folio;
- LIST_HEAD(folio_list);
- nodemask_t node_alloc_noretry;
-
- /* Bit mask controlling how hard we retry per-node allocations.*/
- nodes_clear(node_alloc_noretry);
-
- for (i = 0; i < h->max_huge_pages; ++i) {
- folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
- &node_alloc_noretry);
- if (!folio)
- break;
- list_add(&folio->lru, &folio_list);
- cond_resched();
- }
-
- prep_and_add_allocated_folios(h, &folio_list);
+ struct padata_mt_job job = {
+ .fn_arg = h,
+ .align = 1,
+ .numa_aware = true,
+ };
- return i;
+ job.thread_fn = hugetlb_alloc_node,
+ job.start = 0,
+ job.size = h->max_huge_pages,
+ job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2,
+ job.max_threads = num_node_state(N_MEMORY) * 2,
+ padata_do_multithreaded(&job);
+
+ job.thread_fn = hugetlb_vmemmap_optimize_node,
+ job.start = 0,
+ job.size = num_node_state(N_MEMORY),
+ job.min_chunk = 1,
+ job.max_threads = num_node_state(N_MEMORY),
+ padata_do_multithreaded(&job);
+
+ return h->nr_huge_pages;
}
/*
--
2.30.2
Optimizing the initialization speed of 1G huge pages through
parallelization.
1G hugetlbs are allocated from bootmem, a process that is already
very fast and does not currently require optimization. Therefore,
we focus on parallelizing only the initialization phase in
`gather_bootmem_prealloc`.
This patch can achieve 40%-50% improvement in performance.
test no patch(ms) patched(ms) saved
------------------- -------------- ------------- --------
256c2t(4 node) 1G 2679 1582 40.95%
128c1t(2 node) 1G 3160 1618 48.80%
Signed-off-by: Gang Li <[email protected]>
---
include/linux/hugetlb.h | 2 +-
mm/hugetlb.c | 40 +++++++++++++++++++++++++++++++++-------
2 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index d3acecc5db4b3..ca94c43a63b84 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -178,7 +178,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage);
extern int sysctl_hugetlb_shm_group;
-extern struct list_head huge_boot_pages;
+extern struct list_head huge_boot_pages[MAX_NUMNODES];
/* arch callbacks */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 033e359fdb86b..eb33cb15dce61 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -69,7 +69,7 @@ static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
#endif
static unsigned long hugetlb_cma_size __initdata;
-__initdata LIST_HEAD(huge_boot_pages);
+__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
@@ -3331,7 +3331,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
huge_page_size(h) - PAGE_SIZE);
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
- list_add(&m->list, &huge_boot_pages);
+ list_add(&m->list, &huge_boot_pages[node]);
m->hstate = h;
return 1;
}
@@ -3382,8 +3382,6 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
/* Send list for bulk vmemmap optimization processing */
hugetlb_vmemmap_optimize_folios(h, folio_list);
- /* Add all new pool pages to free lists in one lock cycle */
- spin_lock_irqsave(&hugetlb_lock, flags);
list_for_each_entry_safe(folio, tmp_f, folio_list, lru) {
if (!folio_test_hugetlb_vmemmap_optimized(folio)) {
/*
@@ -3396,23 +3394,27 @@ static void __init prep_and_add_bootmem_folios(struct hstate *h,
HUGETLB_VMEMMAP_RESERVE_PAGES,
pages_per_huge_page(h));
}
+ /* Subdivide locks to achieve better parallel performance */
+ spin_lock_irqsave(&hugetlb_lock, flags);
__prep_account_new_huge_page(h, folio_nid(folio));
enqueue_hugetlb_folio(h, folio);
+ spin_unlock_irqrestore(&hugetlb_lock, flags);
}
- spin_unlock_irqrestore(&hugetlb_lock, flags);
}
/*
* Put bootmem huge pages into the standard lists after mem_map is up.
* Note: This only applies to gigantic (order > MAX_ORDER) pages.
*/
-static void __init gather_bootmem_prealloc(void)
+static void __init __gather_bootmem_prealloc(unsigned long start, unsigned long end, void *arg)
+
{
+ int nid = start;
LIST_HEAD(folio_list);
struct huge_bootmem_page *m;
struct hstate *h = NULL, *prev_h = NULL;
- list_for_each_entry(m, &huge_boot_pages, list) {
+ list_for_each_entry(m, &huge_boot_pages[nid], list) {
struct page *page = virt_to_page(m);
struct folio *folio = (void *)page;
@@ -3445,6 +3447,22 @@ static void __init gather_bootmem_prealloc(void)
prep_and_add_bootmem_folios(h, &folio_list);
}
+static void __init gather_bootmem_prealloc(void)
+{
+ struct padata_mt_job job = {
+ .thread_fn = __gather_bootmem_prealloc,
+ .fn_arg = NULL,
+ .start = 0,
+ .size = num_node_state(N_MEMORY),
+ .align = 1,
+ .min_chunk = 1,
+ .max_threads = num_node_state(N_MEMORY),
+ .numa_aware = true,
+ };
+
+ padata_do_multithreaded(&job);
+}
+
static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
{
unsigned long i;
@@ -3597,6 +3615,14 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
return;
}
+ /* hugetlb_hstate_alloc_pages will be called many times, init huge_boot_pages once*/
+ if (huge_boot_pages[0].next == NULL) {
+ int i = 0;
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ INIT_LIST_HEAD(&huge_boot_pages[i]);
+ }
+
/* do node specific alloc */
if (hugetlb_hstate_alloc_pages_node_specific(h))
return;
--
2.30.2
Add timing to hugetlb allocations for further optimization.
Debug only.
Signed-off-by: Gang Li <[email protected]>
---
mm/hugetlb.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1169ef2f2176f..51f50bb3dc092 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4490,6 +4490,7 @@ static inline void hugetlb_sysctl_init(void) { }
static int __init hugetlb_init(void)
{
int i;
+ unsigned long start;
BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
__NR_HPAGEFLAGS);
@@ -4536,8 +4537,11 @@ static int __init hugetlb_init(void)
}
hugetlb_cma_check();
+ start = jiffies;
hugetlb_init_hstates();
gather_bootmem_prealloc();
+ pr_info("HugeTLB: 2M alloc, init and 1G init takes %u ms\n",
+ jiffies_to_msecs(jiffies - start));
report_hugepages();
hugetlb_sysfs_init();
@@ -4682,6 +4686,7 @@ static int __init hugepages_setup(char *s)
}
}
+ unsigned long start = jiffies;
/*
* Global state is always initialized later in hugetlb_init.
* But we need to allocate gigantic hstates here early to still
@@ -4692,6 +4697,7 @@ static int __init hugepages_setup(char *s)
last_mhp = mhp;
+ pr_info("HugeTLB: 1G alloc takes %u ms\n", jiffies_to_msecs(jiffies - start));
return 1;
invalid:
--
2.30.2
On 12/08/23 10:52, Gang Li wrote:
> Hi all, hugetlb init parallelization has now been updated to v2.
Thanks for your efforts, and sorry for my late comments.
> To David Hildenbrand: padata multithread utilities has been used to reduce
> code complexity.
>
> To David Rientjes: The patch for measuring time will be separately included
> in the reply. Please test during your free time, thanks.
>
> # Introduction
> Hugetlb initialization during boot takes up a considerable amount of time.
> For instance, on a 2TB system, initializing 1,800 1GB huge pages takes 1-2
> seconds out of 10 seconds. Initializing 11,776 1GB pages on a 12TB Intel
> host takes 65.2 seconds [1], which is 17.4% of the total 373.78 seconds boot
> time. This is a noteworthy figure.
One issue to be concerned with is hugetlb page allocation on systems with
unbalanced numa node memory. Commit f60858f9d327 ("hugetlbfs: don't retry
when pool page allocations start to fail") was added to deal with issues
reported on such systems. So, users are certainly using hugetlb pages
on systems with imbalances.
If performing allocations in parallel, I believe we would want the total
number of hugetlb pages allocated to be the same as today. For example,
consider a simple 2 node system with 16GB total memory:
node 0: 2GB
node 1: 14GB
With today's code, allocating 6656 2MB pages via the kernel command line
results in:
node 0: 924 pages
node 1: 5732 pages
total: 6656 pages
With code to parallel allocations in this series:
node 0: 924 pages
node 1: 1547 pages
total: 2471 pages
--
Mike Kravetz
On Fri, 8 Dec 2023, Gang Li wrote:
> Hi all, hugetlb init parallelization has now been updated to v2.
>
> To David Hildenbrand: padata multithread utilities has been used to reduce
> code complexity.
>
> To David Rientjes: The patch for measuring time will be separately included
> in the reply. Please test during your free time, thanks.
>
I'd love to, but what kernel is this based on? :) I can't get this to
apply to any kernels that I have recently benchmarked with.
> # Introduction
> Hugetlb initialization during boot takes up a considerable amount of time.
> For instance, on a 2TB system, initializing 1,800 1GB huge pages takes 1-2
> seconds out of 10 seconds. Initializing 11,776 1GB pages on a 12TB Intel
> host takes 65.2 seconds [1], which is 17.4% of the total 373.78 seconds boot
> time. This is a noteworthy figure.
>
> Inspired by [2] and [3], hugetlb initialization can also be accelerated
> through parallelization. Kernel already has infrastructure like
> padata_do_multithreaded, this patch uses it to achieve effective results
> by minimal modifications.
>
> [1] https://lore.kernel.org/all/[email protected]/
> [2] https://lore.kernel.org/all/[email protected]/
> [3] https://lore.kernel.org/all/[email protected]/
>
> # Test result
> test no patch(ms) patched(ms) saved
> ------------------- -------------- ------------- --------
> 256c2t(4 node) 2M 2624 956 63.57%
> 256c2t(4 node) 1G 2679 1582 40.95%
> 128c1t(2 node) 2M 1788 684 61.74%
> 128c1t(2 node) 1G 3160 1618 48.80%
>
> # Change log
> Changes in v2:
> - Reduce complexity with `padata_do_multithreaded`
> - Support 1G hugetlb
>
> v1:
> - https://lore.kernel.org/all/[email protected]/
> - parallelize 2M hugetlb initialization with workqueue
>
> Gang Li (5):
> hugetlb: code clean for hugetlb_hstate_alloc_pages
> hugetlb: split hugetlb_hstate_alloc_pages
> padata: dispatch works on different nodes
> hugetlb: parallelize 2M hugetlb allocation and initialization
> hugetlb: parallelize 1G hugetlb initialization
>
> include/linux/hugetlb.h | 2 +-
> include/linux/padata.h | 2 +
> kernel/padata.c | 8 +-
> mm/hugetlb.c | 201 +++++++++++++++++++++++++++-------------
> mm/mm_init.c | 1 +
> 5 files changed, 148 insertions(+), 66 deletions(-)
>
> --
> 2.30.2
>
>
On 12/12/23 14:14, David Rientjes wrote:
> On Fri, 8 Dec 2023, Gang Li wrote:
>
> > Hi all, hugetlb init parallelization has now been updated to v2.
> >
> > To David Hildenbrand: padata multithread utilities has been used to reduce
> > code complexity.
> >
> > To David Rientjes: The patch for measuring time will be separately included
> > in the reply. Please test during your free time, thanks.
> >
>
> I'd love to, but what kernel is this based on? :) I can't get this to
> apply to any kernels that I have recently benchmarked with.
I was able to apply and build on top of v6.7-rc5.
Gang Li,
Since hugetlb now depends on CONFIG_PADATA, the Kconfig file should be
updated to reflect this.
--
Mike Kravetz
>
> list_for_each_entry(pw, &works, pw_list)
> - queue_work(system_unbound_wq, &pw->pw_work);
> + if (job->numa_aware)
> + queue_work_node((++nid % num_node_state(N_MEMORY)),
The nid may fall on a NUMA node with only memory but no CPU. In that case you
may still put the work on the unbound queue. You could end up on one CPU node for work
from all memory nodes without CPU. Is this what you want? Or you would
like to spread them between CPU nodes?
Tim
> + system_unbound_wq, &pw->pw_work);
> + else
> + queue_work(system_unbound_wq, &pw->pw_work);
>
> /* Use the current thread, which saves starting a workqueue worker. */
> padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
On Tue, 12 Dec 2023, Mike Kravetz wrote:
> On 12/12/23 14:14, David Rientjes wrote:
> > On Fri, 8 Dec 2023, Gang Li wrote:
> >
> > > Hi all, hugetlb init parallelization has now been updated to v2.
> > >
> > > To David Hildenbrand: padata multithread utilities has been used to reduce
> > > code complexity.
> > >
> > > To David Rientjes: The patch for measuring time will be separately included
> > > in the reply. Please test during your free time, thanks.
> > >
> >
> > I'd love to, but what kernel is this based on? :) I can't get this to
> > apply to any kernels that I have recently benchmarked with.
>
> I was able to apply and build on top of v6.7-rc5.
>
> Gang Li,
> Since hugetlb now depends on CONFIG_PADATA, the Kconfig file should be
> updated to reflect this.
Gotcha, thanks.
I got this:
ld: error: undefined symbol: padata_do_multithreaded
referenced by hugetlb.c:3470 (./mm/hugetlb.c:3470)
vmlinux.o:(gather_bootmem_prealloc)
referenced by hugetlb.c:3592 (./mm/hugetlb.c:3592)
vmlinux.o:(hugetlb_hstate_alloc_pages_non_gigantic)
referenced by hugetlb.c:3599 (./mm/hugetlb.c:3599)
vmlinux.o:(hugetlb_hstate_alloc_pages_non_gigantic)
So, yeah we need to enable DEFERRED_STRUCT_PAGE_INIT for this to build.
On 6.6 I measured "hugepagesz=1G hugepages=11776" on as 12TB host to be
77s this time around.
A latest Linus build with this patch set does not boot successfully, so
I'll need to look into that and try to capture the failure. Not sure if
it's related to this patch or the latest Linus build in general.
Hi,
On 2023/12/13 08:10, David Rientjes wrote:
> On 6.6 I measured "hugepagesz=1G hugepages=11776" on as 12TB host to be
> 77s this time around.
Thanks for your test! Is this the total kernel boot time, or just the
hugetlb initialization time?
>
> A latest Linus build with this patch set does not boot successfully, so
Which branch/tag is it compiled on?
I test this patch on v6.7-rc4 and next-20231130.
> I'll need to look into that and try to capture the failure. Not sure if
> it's related to this patch or the latest Linus build in general.
>
On 2023/12/13 07:40, Tim Chen wrote:
>
>>
>> list_for_each_entry(pw, &works, pw_list)
>> - queue_work(system_unbound_wq, &pw->pw_work);
>> + if (job->numa_aware)
>> + queue_work_node((++nid % num_node_state(N_MEMORY)),
>
> The nid may fall on a NUMA node with only memory but no CPU. In that case you
> may still put the work on the unbound queue. You could end up on one CPU node for work
> from all memory nodes without CPU. Is this what you want? Or you would
> like to spread them between CPU nodes?
>
> Tim
Hi, thank you for your reminder. My intention was to fully utilize all
memory bandwidth.
For memory nodes without CPUs, I also hope to be able to spread them on
different CPUs.
On 2023/12/13 04:06, Mike Kravetz wrote:
> With today's code, allocating 6656 2MB pages via the kernel command line
> results in:
> node 0: 924 pages
> node 1: 5732 pages
> total: 6656 pages
>
> With code to parallel allocations in this series:
> node 0: 924 pages
> node 1: 1547 pages
> total: 2471 pages
Hi Mike,
Disable numa_aware for hugetlb_alloc_node should solve this problem.
I will fix it in v3.
On Mon, 18 Dec 2023, Gang Li wrote:
> Hi,
>
> On 2023/12/13 08:10, David Rientjes wrote:
> > On 6.6 I measured "hugepagesz=1G hugepages=11776" on as 12TB host to be
> > 77s this time around.
>
> Thanks for your test! Is this the total kernel boot time, or just the
> hugetlb initialization time?
>
Ah, sorry for not being specific. It's just the hugetlb preallocation of
11776 1GB hugetlb pages, total boot takes a few more minutes.
> > A latest Linus build with this patch set does not boot successfully, so
>
> Which branch/tag is it compiled on?
> I test this patch on v6.7-rc4 and next-20231130.
>
It was the latest Linus tip of tree. I'll continue to try again until I
get a successful boot and report back, serial console won't be possible
for unrelated reasons.
On Thu, 21 Dec 2023, David Rientjes wrote:
> > Hi,
> >
> > On 2023/12/13 08:10, David Rientjes wrote:
> > > On 6.6 I measured "hugepagesz=1G hugepages=11776" on as 12TB host to be
> > > 77s this time around.
> >
> > Thanks for your test! Is this the total kernel boot time, or just the
> > hugetlb initialization time?
> >
>
> Ah, sorry for not being specific. It's just the hugetlb preallocation of
> 11776 1GB hugetlb pages, total boot takes a few more minutes.
>
I had to apply this to get the patch series to compile on 6.7-rc7:
diff --git a/kernel/padata.c b/kernel/padata.c
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -485,7 +485,7 @@ void __init padata_do_multithreaded(struct padata_mt_job *job)
struct padata_work my_work, *pw;
struct padata_mt_job_state ps;
LIST_HEAD(works);
- int nworks, nid;
+ int nworks, nid = 0;
if (job->size == 0)
return;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3300,7 +3300,7 @@ int alloc_bootmem_huge_page(struct hstate *h, int nid)
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
{
struct huge_bootmem_page *m = NULL; /* initialize for clang */
- int nr_nodes, node;
+ int nr_nodes, node = NUMA_NO_NODE;
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
With that, I compared "hugepagesz=1G hugepages=11776" before and after on
a 12TB host with eight NUMA nodes.
Compared to 77s of total initialization time before, with this series I
measured 18.3s.
Feel free to add this into the changelog once the initialization issues
are fixed up and I'm happy to ack it.
Thanks!
On 2023/12/25 13:21, David Rientjes wrote:
> With that, I compared "hugepagesz=1G hugepages=11776" before and after on
> a 12TB host with eight NUMA nodes.
>
> Compared to 77s of total initialization time before, with this series I
> measured 18.3s.
>
> Feel free to add this into the changelog once the initialization issues
> are fixed up and I'm happy to ack it.
>
> Thanks!
Cool! Thank you ;)
Hi Tim,
According to queue_work_node, if there are no CPUs available on the
given node, it will schedule to any available CPU.
On 2023/12/18 14:46, Gang Li wrote:
> On 2023/12/13 07:40, Tim Chen wrote:
>>
>>> list_for_each_entry(pw, &works, pw_list)
>>> - queue_work(system_unbound_wq, &pw->pw_work);
>>> + if (job->numa_aware)
>>> + queue_work_node((++nid % num_node_state(N_MEMORY)),
>>
>> The nid may fall on a NUMA node with only memory but no CPU. In that
>> case you
>> may still put the work on the unbound queue. You could end up on one
>> CPU node for work
>> from all memory nodes without CPU. Is this what you want? Or you would
>> like to spread them between CPU nodes?
>>
>> Tim
>
> Hi, thank you for your reminder. My intention was to fully utilize all
> memory bandwidth.
>
> For memory nodes without CPUs, I also hope to be able to spread them on
> different CPUs.