init_reserved_page() calls early_pfn_to_nid(), which looks up the node
id of a PFN, once for every PFN it initialises. Consecutive PFNs likely
belong to the same memory region and therefore share the same node id,
so there is no need to repeat the lookup for each PFN.

Pass nid to reserve_bootmem_region() and drop the call to
early_pfn_to_nid() in init_reserved_page().

The biggest win is in memmap_init_reserved_pages() when
CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
The following numbers were measured on an x86 machine with 190GB of RAM:
before:
memmap_init_reserved_pages() 67ms
after:
memmap_init_reserved_pages() 20ms
Signed-off-by: Yajun Deng <[email protected]>
Reported-by: kernel test robot <[email protected]>
Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]
---
include/linux/mm.h | 3 ++-
mm/memblock.c | 9 ++++++---
mm/mm_init.c | 31 +++++++++++++++++++------------
3 files changed, 27 insertions(+), 16 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 17317b1673b0..39e72ca6bf22 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2964,7 +2964,8 @@ extern unsigned long free_reserved_area(void *start, void *end,
extern void adjust_managed_page_count(struct page *page, long count);
-extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
+extern void reserve_bootmem_region(phys_addr_t start,
+ phys_addr_t end, int nid);
/* Free the reserved page into the buddy system, so it gets managed. */
static inline void free_reserved_page(struct page *page)
diff --git a/mm/memblock.c b/mm/memblock.c
index ff0da1858778..6dc51dc845e5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2091,18 +2091,21 @@ static void __init memmap_init_reserved_pages(void)
{
struct memblock_region *region;
phys_addr_t start, end;
+ int nid;
u64 i;
/* initialize struct pages for the reserved regions */
- for_each_reserved_mem_range(i, &start, &end)
- reserve_bootmem_region(start, end);
+ __for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE,
+ MEMBLOCK_NONE, &start, &end, &nid)
+ reserve_bootmem_region(start, end, nid);
/* and also treat struct pages for the NOMAP regions as PageReserved */
for_each_mem_region(region) {
if (memblock_is_nomap(region)) {
start = region->base;
end = start + region->size;
- reserve_bootmem_region(start, end);
+ nid = memblock_get_region_node(region);
+ reserve_bootmem_region(start, end, nid);
}
}
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index d393631599a7..1499efbebc6f 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -646,10 +646,8 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
}
/* Returns true if the struct page for the pfn is initialised */
-static inline bool __meminit early_page_initialised(unsigned long pfn)
+static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
{
- int nid = early_pfn_to_nid(pfn);
-
if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
return false;
@@ -695,15 +693,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static void __meminit init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn, int nid)
{
pg_data_t *pgdat;
- int nid, zid;
+ int zid;
- if (early_page_initialised(pfn))
+ if (early_page_initialised(pfn, nid))
return;
- nid = early_pfn_to_nid(pfn);
pgdat = NODE_DATA(nid);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
@@ -717,7 +714,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
-static inline bool early_page_initialised(unsigned long pfn)
+static inline bool early_page_initialised(unsigned long pfn, int nid)
{
return true;
}
@@ -727,7 +724,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
return false;
}
-static inline void init_reserved_page(unsigned long pfn)
+static inline void init_reserved_page(unsigned long pfn, int nid)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
@@ -738,16 +735,20 @@ static inline void init_reserved_page(unsigned long pfn)
* marks the pages PageReserved. The remaining valid pages are later
* sent to the buddy page allocator.
*/
-void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
+void __meminit reserve_bootmem_region(phys_addr_t start,
+ phys_addr_t end, int nid)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
+ if (nid == MAX_NUMNODES)
+ nid = first_online_node;
+
for (; start_pfn < end_pfn; start_pfn++) {
if (pfn_valid(start_pfn)) {
struct page *page = pfn_to_page(start_pfn);
- init_reserved_page(start_pfn);
+ init_reserved_page(start_pfn, nid);
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
@@ -2579,7 +2580,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
- if (!early_page_initialised(pfn))
+ int nid = 0;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+ nid = early_pfn_to_nid(pfn);
+#endif
+
+ if (!early_page_initialised(pfn, nid))
return;
if (!kmsan_memblock_free_pages(page, order)) {
/* KMSAN will take care of these pages. */
--
2.25.1
On Fri, Jun 16, 2023 at 10:30:11AM +0800, Yajun Deng wrote:
> init_reserved_page() calls early_pfn_to_nid(), which looks up the node
> id of a PFN, once for every PFN it initialises. Consecutive PFNs likely
> belong to the same memory region and therefore share the same node id,
> so there is no need to repeat the lookup for each PFN.
>
> Pass nid to reserve_bootmem_region() and drop the call to
> early_pfn_to_nid() in init_reserved_page().
>
> The biggest win is in memmap_init_reserved_pages() when
> CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
> The following numbers were measured on an x86 machine with 190GB of RAM:
>
> before:
> memmap_init_reserved_pages() 67ms
>
> after:
> memmap_init_reserved_pages() 20ms
>
> Signed-off-by: Yajun Deng <[email protected]>
> Reported-by: kernel test robot <[email protected]>
> Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]
> ---
> include/linux/mm.h | 3 ++-
> mm/memblock.c | 9 ++++++---
> mm/mm_init.c | 31 +++++++++++++++++++------------
> 3 files changed, 27 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 17317b1673b0..39e72ca6bf22 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2964,7 +2964,8 @@ extern unsigned long free_reserved_area(void *start, void *end,
>
> extern void adjust_managed_page_count(struct page *page, long count);
>
> -extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
> +extern void reserve_bootmem_region(phys_addr_t start,
> + phys_addr_t end, int nid);
>
> /* Free the reserved page into the buddy system, so it gets managed. */
> static inline void free_reserved_page(struct page *page)
> diff --git a/mm/memblock.c b/mm/memblock.c
> index ff0da1858778..6dc51dc845e5 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -2091,18 +2091,21 @@ static void __init memmap_init_reserved_pages(void)
> {
> struct memblock_region *region;
> phys_addr_t start, end;
> + int nid;
> u64 i;
>
> /* initialize struct pages for the reserved regions */
> - for_each_reserved_mem_range(i, &start, &end)
> - reserve_bootmem_region(start, end);
> + __for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE,
> + MEMBLOCK_NONE, &start, &end, &nid)
> + reserve_bootmem_region(start, end, nid);
I'd prefer to see for_each_reserved_mem_region() loop here
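Something along these lines (untested sketch, assuming
for_each_reserved_mem_region() iterates struct memblock_region the same
way for_each_mem_region() does):

	for_each_reserved_mem_region(region) {
		nid = memblock_get_region_node(region);
		start = region->base;
		end = start + region->size;
		reserve_bootmem_region(start, end, nid);
	}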
>
> /* and also treat struct pages for the NOMAP regions as PageReserved */
> for_each_mem_region(region) {
> if (memblock_is_nomap(region)) {
> start = region->base;
> end = start + region->size;
> - reserve_bootmem_region(start, end);
> + nid = memblock_get_region_node(region);
> + reserve_bootmem_region(start, end, nid);
> }
> }
> }
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index d393631599a7..1499efbebc6f 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -646,10 +646,8 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
> }
>
> /* Returns true if the struct page for the pfn is initialised */
> -static inline bool __meminit early_page_initialised(unsigned long pfn)
> +static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
> {
> - int nid = early_pfn_to_nid(pfn);
> -
> if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
> return false;
>
> @@ -695,15 +693,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
> return false;
> }
>
> -static void __meminit init_reserved_page(unsigned long pfn)
> +static void __meminit init_reserved_page(unsigned long pfn, int nid)
> {
> pg_data_t *pgdat;
> - int nid, zid;
> + int zid;
>
> - if (early_page_initialised(pfn))
> + if (early_page_initialised(pfn, nid))
> return;
>
> - nid = early_pfn_to_nid(pfn);
> pgdat = NODE_DATA(nid);
>
> for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> @@ -717,7 +714,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
> #else
> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>
> -static inline bool early_page_initialised(unsigned long pfn)
> +static inline bool early_page_initialised(unsigned long pfn, int nid)
> {
> return true;
> }
> @@ -727,7 +724,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
> return false;
> }
>
> -static inline void init_reserved_page(unsigned long pfn)
> +static inline void init_reserved_page(unsigned long pfn, int nid)
> {
> }
> #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
> @@ -738,16 +735,20 @@ static inline void init_reserved_page(unsigned long pfn)
> * marks the pages PageReserved. The remaining valid pages are later
> * sent to the buddy page allocator.
> */
> -void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
> +void __meminit reserve_bootmem_region(phys_addr_t start,
> + phys_addr_t end, int nid)
> {
> unsigned long start_pfn = PFN_DOWN(start);
> unsigned long end_pfn = PFN_UP(end);
>
> + if (nid == MAX_NUMNODES)
> + nid = first_online_node;
How can this happen?
> +
> for (; start_pfn < end_pfn; start_pfn++) {
> if (pfn_valid(start_pfn)) {
> struct page *page = pfn_to_page(start_pfn);
>
> - init_reserved_page(start_pfn);
> + init_reserved_page(start_pfn, nid);
>
> /* Avoid false-positive PageTail() */
> INIT_LIST_HEAD(&page->lru);
> @@ -2579,7 +2580,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
> void __init memblock_free_pages(struct page *page, unsigned long pfn,
> unsigned int order)
> {
> - if (!early_page_initialised(pfn))
> + int nid = 0;
> +
> +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
> + nid = early_pfn_to_nid(pfn);
> +#endif
We can pass nid to memblock_free_pages(), no?
> +
> + if (!early_page_initialised(pfn, nid))
> return;
> if (!kmsan_memblock_free_pages(page, order)) {
> /* KMSAN will take care of these pages. */
> --
> 2.25.1
>
--
Sincerely yours,
Mike.
June 16, 2023 3:22 PM, "Mike Rapoport" <[email protected]> wrote:
> On Fri, Jun 16, 2023 at 10:30:11AM +0800, Yajun Deng wrote:
>
>> init_reserved_page() calls early_pfn_to_nid(), which looks up the node
>> id of a PFN, once for every PFN it initialises. Consecutive PFNs likely
>> belong to the same memory region and therefore share the same node id,
>> so there is no need to repeat the lookup for each PFN.
>>
>> Pass nid to reserve_bootmem_region() and drop the call to
>> early_pfn_to_nid() in init_reserved_page().
>>
>> The biggest win is in memmap_init_reserved_pages() when
>> CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
>> The following numbers were measured on an x86 machine with 190GB of RAM:
>>
>> before:
>> memmap_init_reserved_pages() 67ms
>>
>> after:
>> memmap_init_reserved_pages() 20ms
>>
>> Signed-off-by: Yajun Deng <[email protected]>
>> Reported-by: kernel test robot <[email protected]>
>> Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]
>> ---
>> include/linux/mm.h | 3 ++-
>> mm/memblock.c | 9 ++++++---
>> mm/mm_init.c | 31 +++++++++++++++++++------------
>> 3 files changed, 27 insertions(+), 16 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 17317b1673b0..39e72ca6bf22 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -2964,7 +2964,8 @@ extern unsigned long free_reserved_area(void *start, void *end,
>>
>> extern void adjust_managed_page_count(struct page *page, long count);
>>
>> -extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
>> +extern void reserve_bootmem_region(phys_addr_t start,
>> + phys_addr_t end, int nid);
>>
>> /* Free the reserved page into the buddy system, so it gets managed. */
>> static inline void free_reserved_page(struct page *page)
>> diff --git a/mm/memblock.c b/mm/memblock.c
>> index ff0da1858778..6dc51dc845e5 100644
>> --- a/mm/memblock.c
>> +++ b/mm/memblock.c
>> @@ -2091,18 +2091,21 @@ static void __init memmap_init_reserved_pages(void)
>> {
>> struct memblock_region *region;
>> phys_addr_t start, end;
>> + int nid;
>> u64 i;
>>
>> /* initialize struct pages for the reserved regions */
>> - for_each_reserved_mem_range(i, &start, &end)
>> - reserve_bootmem_region(start, end);
>> + __for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE,
>> + MEMBLOCK_NONE, &start, &end, &nid)
>> + reserve_bootmem_region(start, end, nid);
>
> I'd prefer to see for_each_reserved_mem_region() loop here
>
okay.
>> /* and also treat struct pages for the NOMAP regions as PageReserved */
>> for_each_mem_region(region) {
>> if (memblock_is_nomap(region)) {
>> start = region->base;
>> end = start + region->size;
>> - reserve_bootmem_region(start, end);
>> + nid = memblock_get_region_node(region);
>> + reserve_bootmem_region(start, end, nid);
>> }
>> }
>> }
>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>> index d393631599a7..1499efbebc6f 100644
>> --- a/mm/mm_init.c
>> +++ b/mm/mm_init.c
>> @@ -646,10 +646,8 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
>> }
>>
>> /* Returns true if the struct page for the pfn is initialised */
>> -static inline bool __meminit early_page_initialised(unsigned long pfn)
>> +static inline bool __meminit early_page_initialised(unsigned long pfn, int nid)
>> {
>> - int nid = early_pfn_to_nid(pfn);
>> -
>> if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
>> return false;
>>
>> @@ -695,15 +693,14 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
>> return false;
>> }
>>
>> -static void __meminit init_reserved_page(unsigned long pfn)
>> +static void __meminit init_reserved_page(unsigned long pfn, int nid)
>> {
>> pg_data_t *pgdat;
>> - int nid, zid;
>> + int zid;
>>
>> - if (early_page_initialised(pfn))
>> + if (early_page_initialised(pfn, nid))
>> return;
>>
>> - nid = early_pfn_to_nid(pfn);
>> pgdat = NODE_DATA(nid);
>>
>> for (zid = 0; zid < MAX_NR_ZONES; zid++) {
>> @@ -717,7 +714,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
>> #else
>> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>
>> -static inline bool early_page_initialised(unsigned long pfn)
>> +static inline bool early_page_initialised(unsigned long pfn, int nid)
>> {
>> return true;
>> }
>> @@ -727,7 +724,7 @@ static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
>> return false;
>> }
>>
>> -static inline void init_reserved_page(unsigned long pfn)
>> +static inline void init_reserved_page(unsigned long pfn, int nid)
>> {
>> }
>> #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
>> @@ -738,16 +735,20 @@ static inline void init_reserved_page(unsigned long pfn)
>> * marks the pages PageReserved. The remaining valid pages are later
>> * sent to the buddy page allocator.
>> */
>> -void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
>> +void __meminit reserve_bootmem_region(phys_addr_t start,
>> + phys_addr_t end, int nid)
>> {
>> unsigned long start_pfn = PFN_DOWN(start);
>> unsigned long end_pfn = PFN_UP(end);
>>
>> + if (nid == MAX_NUMNODES)
>> + nid = first_online_node;
>
> How can this happen?
>
Some reserved memory regions may not have a nid set. I found this while
debugging; it can be seen via memblock_debug_show().
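For context, memblock_reserve() itself never assigns a node, so a
reserved region keeps the default nid until someone calls
memblock_set_node() on it. Roughly, paraphrased from mm/memblock.c:

	/* memblock_reserve() adds the range with MAX_NUMNODES, and
	 * memblock_get_region_node() returns whatever is stored, so any
	 * region nobody ran memblock_set_node() on reports MAX_NUMNODES.
	 */
	int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
	{
		return memblock_add_range(&memblock.reserved, base, size,
					  MAX_NUMNODES, 0);
	}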
>> +
>> for (; start_pfn < end_pfn; start_pfn++) {
>> if (pfn_valid(start_pfn)) {
>> struct page *page = pfn_to_page(start_pfn);
>>
>> - init_reserved_page(start_pfn);
>> + init_reserved_page(start_pfn, nid);
>>
>> /* Avoid false-positive PageTail() */
>> INIT_LIST_HEAD(&page->lru);
>> @@ -2579,7 +2580,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
>> void __init memblock_free_pages(struct page *page, unsigned long pfn,
>> unsigned int order)
>> {
>> - if (!early_page_initialised(pfn))
>> + int nid = 0;
>> +
>> +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
>> + nid = early_pfn_to_nid(pfn);
>> +#endif
>
> We can pass nid to memblock_free_pages(), no?
>
memblock_free_pages() is called by __free_pages_memory() and memblock_free_late().
For the latter, I'm not sure we can pass a nid.
I think we can pass nid to reserve_bootmem_region() in this patch, and pass nid to
memblock_free_pages() in a follow-up once we can confirm this.
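E.g. something like this (untested, just to show the direction; the
callers would need auditing first):

	void __init memblock_free_pages(struct page *page, unsigned long pfn,
					unsigned int order, int nid)
	{
		if (nid == NUMA_NO_NODE)
			nid = early_pfn_to_nid(pfn);

		if (!early_page_initialised(pfn, nid))
			return;
		...
	}

with callers that already know the node passing it in, and
memblock_free_late() passing NUMA_NO_NODE.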
>> +
>> + if (!early_page_initialised(pfn, nid))
>> return;
>> if (!kmsan_memblock_free_pages(page, order)) {
>> /* KMSAN will take care of these pages. */
>> --
>> 2.25.1
>
> --
> Sincerely yours,
> Mike.
On Fri, Jun 16, 2023 at 07:51:21AM +0000, Yajun Deng wrote:
> June 16, 2023 3:22 PM, "Mike Rapoport" <[email protected]> wrote:
> > On Fri, Jun 16, 2023 at 10:30:11AM +0800, Yajun Deng wrote:
> >
> >> diff --git a/mm/mm_init.c b/mm/mm_init.c
> >> index d393631599a7..1499efbebc6f 100644
> >> --- a/mm/mm_init.c
> >> +++ b/mm/mm_init.c
> >> @@ -738,16 +735,20 @@ static inline void init_reserved_page(unsigned long pfn)
> >> * marks the pages PageReserved. The remaining valid pages are later
> >> * sent to the buddy page allocator.
> >> */
> >> -void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
> >> +void __meminit reserve_bootmem_region(phys_addr_t start,
> >> + phys_addr_t end, int nid)
> >> {
> >> unsigned long start_pfn = PFN_DOWN(start);
> >> unsigned long end_pfn = PFN_UP(end);
> >>
> >> + if (nid == MAX_NUMNODES)
> >> + nid = first_online_node;
> >
> > How can this happen?
> >
>
> Some reserved memory regions may not have a nid set. I found this while
> debugging; it can be seen via memblock_debug_show().
Hmm, indeed. But then it means that some struct pages for the reserved
regions will get the wrong nid, and if those pages are ever freed we'd
actually end up with pages on the wrong node.

Maybe it's time to set the nid on all the reserved regions, with something like:
diff --git a/mm/memblock.c b/mm/memblock.c
index 3feafea06ab2..fcd0987e2496 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -2084,6 +2084,14 @@ static void __init memmap_init_reserved_pages(void)
phys_addr_t start, end;
u64 i;
+ for_each_mem_region(region) {
+ int nid = memblock_get_region_node(region);
+
+ start = region->base;
+ end = start + region->size;
+ memblock_set_node(start, end, &memblock.reserved, nid);
+ }
+
/* initialize struct pages for the reserved regions */
for_each_reserved_mem_range(i, &start, &end)
reserve_bootmem_region(start, end);
> >> @@ -2579,7 +2580,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
> >> void __init memblock_free_pages(struct page *page, unsigned long pfn,
> >> unsigned int order)
> >> {
> >> - if (!early_page_initialised(pfn))
> >> + int nid = 0;
> >> +
> >> +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
> >> + nid = early_pfn_to_nid(pfn);
> >> +#endif
Please replace the #ifdef with

	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
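i.e. in memblock_free_pages() (untested):

	int nid = 0;

	if (IS_ENABLED(CONFIG_DEFERRED_STRUCT_PAGE_INIT))
		nid = early_pfn_to_nid(pfn);

	if (!early_page_initialised(pfn, nid))
		return;

early_pfn_to_nid() has definitions for both NUMA and !NUMA, so this
should compile in either configuration.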
> > We can pass nid to memblock_free_pages(), no?
> >
>
> memblock_free_pages() is called by __free_pages_memory() and memblock_free_late().
> For the latter, I'm not sure we can pass a nid.
>
> I think we can pass nid to reserve_bootmem_region() in this patch, and pass nid to
> memblock_free_pages() in a follow-up once we can confirm this.
Fair enough.
--
Sincerely yours,
Mike.