memmap_init_range() initializes the page count of all pages, but the page
count of free pages is then reset again in __free_pages_core(). These are
opposite operations, which is unnecessary and time-consuming in the
MEMINIT_EARLY context.

Instead, initialize the page count in reserve_bootmem_region() when in the
MEMINIT_EARLY context, and check the page count before resetting it.

At the same time, the INIT_LIST_HEAD in reserve_bootmem_region() isn't
needed, as it is already done in __init_single_page().
The following data was tested on an x86 machine with 190GB of RAM.

before:
free_low_memory_core_early() 341ms

after:
free_low_memory_core_early() 285ms
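
For reference, a minimal sketch of how such a number could be collected,
assuming a throwaway debug hunk inside mm/memblock.c and an early-usable
clock (e.g. the TSC-backed sched clock on x86); this is illustrative only
and not necessarily how the numbers above were measured:

    /*
     * Illustrative only: time free_low_memory_core_early() from within
     * mm/memblock.c.  Assumes local_clock() is already usable at this
     * point in boot; not part of this series.
     */
    #include <linux/sched/clock.h>  /* local_clock() */
    #include <linux/math64.h>       /* div_u64() */

    static unsigned long __init timed_free_low_memory_core_early(void)
    {
            u64 start = local_clock();      /* nanoseconds */
            unsigned long pages = free_low_memory_core_early();
            u64 delta = local_clock() - start;

            pr_info("free_low_memory_core_early() took %llu ms\n",
                    div_u64(delta, NSEC_PER_MSEC));
            return pages;
    }
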
Signed-off-by: Yajun Deng <[email protected]>
---
v4: same as v2.
v3: same as v2.
v2: check the page count instead of checking the context before resetting it.
v1: https://lore.kernel.org/all/[email protected]/
---
mm/mm_init.c | 18 +++++++++++++-----
mm/page_alloc.c | 20 ++++++++++++--------
2 files changed, 25 insertions(+), 13 deletions(-)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 9716c8a7ade9..3ab8861e1ef3 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
if (zone_spans_pfn(zone, pfn))
break;
}
- __init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
+ __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
}
#else
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
@@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
init_reserved_page(start_pfn, nid);
- /* Avoid false-positive PageTail() */
- INIT_LIST_HEAD(&page->lru);
+ /* Init page count for reserved region */
+ init_page_count(page);
/*
* no need for atomic set_bit because the struct
@@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
}
page = pfn_to_page(pfn);
- __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
- if (context == MEMINIT_HOTPLUG)
+
+ /* If the context is MEMINIT_EARLY, we will init page count and
+ * mark page reserved in reserve_bootmem_region, the free region
+ * wouldn't have page count and we will check the pages count
+ * in __free_pages_core.
+ */
+ __init_single_page(page, pfn, zone, nid, 0);
+ if (context == MEMINIT_HOTPLUG) {
+ init_page_count(page);
__SetPageReserved(page);
+ }
/*
* Usually, we want to mark the pageblock MIGRATE_MOVABLE,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 06be8821d833..b868caabe8dc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
unsigned int loop;
/*
- * When initializing the memmap, __init_single_page() sets the refcount
- * of all pages to 1 ("allocated"/"not free"). We have to set the
- * refcount of all involved pages to 0.
+ * When initializing the memmap, memmap_init_range sets the refcount
+ * of all pages to 1 ("reserved" and "free") in hotplug context. We
+ * have to set the refcount of all involved pages to 0. Otherwise,
+ * we don't do it, as reserve_bootmem_region only set the refcount on
+ * reserve region ("reserved") in early context.
*/
- prefetchw(p);
- for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
- prefetchw(p + 1);
+ if (page_count(page)) {
+ prefetchw(p);
+ for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
+ prefetchw(p + 1);
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ }
__ClearPageReserved(p);
set_page_count(p, 0);
}
- __ClearPageReserved(p);
- set_page_count(p, 0);
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
--
2.25.1
On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
> memmap_init_range() would init page count of all pages, but the free
> pages count would be reset in __free_pages_core(). There are opposite
> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
> context.
>
> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
> and check the page count before reset it.
>
> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
> need, as it already done in __init_single_page.
>
> The following data was tested on an x86 machine with 190GB of RAM.
>
> before:
> free_low_memory_core_early() 341ms
>
> after:
> free_low_memory_core_early() 285ms
>
> Signed-off-by: Yajun Deng <[email protected]>
> ---
> v4: same with v2.
> v3: same with v2.
> v2: check page count instead of check context before reset it.
> v1: https://lore.kernel.org/all/[email protected]/
> ---
> mm/mm_init.c | 18 +++++++++++++-----
> mm/page_alloc.c | 20 ++++++++++++--------
> 2 files changed, 25 insertions(+), 13 deletions(-)
>
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index 9716c8a7ade9..3ab8861e1ef3 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
> if (zone_spans_pfn(zone, pfn))
> break;
> }
> - __init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
> + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
> }
> #else
> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>
> init_reserved_page(start_pfn, nid);
>
> - /* Avoid false-positive PageTail() */
> - INIT_LIST_HEAD(&page->lru);
> + /* Init page count for reserved region */
Please add a comment that describes _why_ we initialize the page count here.
> + init_page_count(page);
>
> /*
> * no need for atomic set_bit because the struct
> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
> }
>
> page = pfn_to_page(pfn);
> - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
> - if (context == MEMINIT_HOTPLUG)
> +
> + /* If the context is MEMINIT_EARLY, we will init page count and
> + * mark page reserved in reserve_bootmem_region, the free region
> + * wouldn't have page count and we will check the pages count
> + * in __free_pages_core.
> + */
> + __init_single_page(page, pfn, zone, nid, 0);
> + if (context == MEMINIT_HOTPLUG) {
> + init_page_count(page);
> __SetPageReserved(page);
Rather than calling init_page_count() and __SetPageReserved() for
MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
and call __init_single_page() after the check for MEMINIT_HOTPLUG.
But more generally, I wonder if we have to differentiate HOTPLUG here at all.
@David, can you comment please?
> + }
>
> /*
> * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 06be8821d833..b868caabe8dc 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
> unsigned int loop;
>
> /*
> - * When initializing the memmap, __init_single_page() sets the refcount
> - * of all pages to 1 ("allocated"/"not free"). We have to set the
> - * refcount of all involved pages to 0.
> + * When initializing the memmap, memmap_init_range sets the refcount
> + * of all pages to 1 ("reserved" and "free") in hotplug context. We
> + * have to set the refcount of all involved pages to 0. Otherwise,
> + * we don't do it, as reserve_bootmem_region only set the refcount on
> + * reserve region ("reserved") in early context.
> */
Again, why should hotplug and early init be different?
> - prefetchw(p);
> - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> - prefetchw(p + 1);
> + if (page_count(page)) {
> + prefetchw(p);
> + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> + prefetchw(p + 1);
> + __ClearPageReserved(p);
> + set_page_count(p, 0);
> + }
> __ClearPageReserved(p);
> set_page_count(p, 0);
> }
> - __ClearPageReserved(p);
> - set_page_count(p, 0);
>
> atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
>
> --
> 2.25.1
>
--
Sincerely yours,
Mike.
On Fri, Sep 29, 2023 at 05:50:40PM +0800, Yajun Deng wrote:
>
> On 2023/9/29 16:30, Mike Rapoport wrote:
> > On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
> > > memmap_init_range() would init page count of all pages, but the free
> > > pages count would be reset in __free_pages_core(). There are opposite
> > > operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
> > > context.
> > >
> > > Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
> > > and check the page count before reset it.
> > >
> > > At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
> > > need, as it already done in __init_single_page.
> > >
> > > The following data was tested on an x86 machine with 190GB of RAM.
> > >
> > > before:
> > > free_low_memory_core_early() 341ms
> > >
> > > after:
> > > free_low_memory_core_early() 285ms
> > >
> > > Signed-off-by: Yajun Deng <[email protected]>
> > > ---
> > > v4: same with v2.
> > > v3: same with v2.
> > > v2: check page count instead of check context before reset it.
> > > v1: https://lore.kernel.org/all/[email protected]/
> > > ---
> > > mm/mm_init.c | 18 +++++++++++++-----
> > > mm/page_alloc.c | 20 ++++++++++++--------
> > > 2 files changed, 25 insertions(+), 13 deletions(-)
> > >
> > > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > > index 9716c8a7ade9..3ab8861e1ef3 100644
> > > --- a/mm/mm_init.c
> > > +++ b/mm/mm_init.c
> > > @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
> > > if (zone_spans_pfn(zone, pfn))
> > > break;
> > > }
> > > - __init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
> > > + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
> > > }
> > > #else
> > > static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
> > > @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
> > > init_reserved_page(start_pfn, nid);
> > > - /* Avoid false-positive PageTail() */
> > > - INIT_LIST_HEAD(&page->lru);
> > > + /* Init page count for reserved region */
> > Please add a comment that describes _why_ we initialize the page count here.
> Okay.
> >
> > > + init_page_count(page);
> > > /*
> > > * no need for atomic set_bit because the struct
> > > @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
> > > }
> > > page = pfn_to_page(pfn);
> > > - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
> > > - if (context == MEMINIT_HOTPLUG)
> > > +
> > > + /* If the context is MEMINIT_EARLY, we will init page count and
> > > + * mark page reserved in reserve_bootmem_region, the free region
> > > + * wouldn't have page count and we will check the pages count
> > > + * in __free_pages_core.
> > > + */
> > > + __init_single_page(page, pfn, zone, nid, 0);
> > > + if (context == MEMINIT_HOTPLUG) {
> > > + init_page_count(page);
> > > __SetPageReserved(page);
> > Rather than calling init_page_count() and __SetPageReserved() for
> > MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
> > an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>
> No, the following code would cost more time than the current code in
> memmap_init().
>
> if (context == MEMINIT_HOTPLUG)
>
> __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
> else
>
> __init_single_page(page, pfn, zone, nid, 0);
Sorry if I wasn't clear. What I meant was to have something along these lines:
	enum page_init_flags flags = 0;

	if (context == MEMINIT_HOTPLUG)
		flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
	__init_single_page(page, pfn, zone, nid, flags);
> > But more generally, I wonder if we have to differentiate HOTPLUG here at all.
> > @David, can you comment please?
> >
> > > + }
> > > /*
> > > * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index 06be8821d833..b868caabe8dc 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
> > > unsigned int loop;
> > > /*
> > > - * When initializing the memmap, __init_single_page() sets the refcount
> > > - * of all pages to 1 ("allocated"/"not free"). We have to set the
> > > - * refcount of all involved pages to 0.
> > > + * When initializing the memmap, memmap_init_range sets the refcount
> > > + * of all pages to 1 ("reserved" and "free") in hotplug context. We
> > > + * have to set the refcount of all involved pages to 0. Otherwise,
> > > + * we don't do it, as reserve_bootmem_region only set the refcount on
> > > + * reserve region ("reserved") in early context.
> > > */
> > Again, why hotplug and early init should be different?
> I will add a comment that describes it will save boot time.
But why do we need initialize struct pages differently at boot time vs
memory hotplug?
Is there a reason memory hotplug cannot have page count set to 0 just like
for pages reserved at boot time?
> > > - prefetchw(p);
> > > - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> > > - prefetchw(p + 1);
> > > + if (page_count(page)) {
> > > + prefetchw(p);
> > > + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> > > + prefetchw(p + 1);
> > > + __ClearPageReserved(p);
> > > + set_page_count(p, 0);
> > > + }
> > > __ClearPageReserved(p);
> > > set_page_count(p, 0);
> > > }
> > > - __ClearPageReserved(p);
> > > - set_page_count(p, 0);
> > > atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
> > > --
> > > 2.25.1
> > >
--
Sincerely yours,
Mike.
On 2023/9/29 18:02, Mike Rapoport wrote:
> On Fri, Sep 29, 2023 at 05:50:40PM +0800, Yajun Deng wrote:
>> On 2023/9/29 16:30, Mike Rapoport wrote:
>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>> memmap_init_range() would init page count of all pages, but the free
>>>> pages count would be reset in __free_pages_core(). There are opposite
>>>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>>>> context.
>>>>
>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
>>>> and check the page count before reset it.
>>>>
>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>> need, as it already done in __init_single_page.
>>>>
>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>
>>>> before:
>>>> free_low_memory_core_early() 341ms
>>>>
>>>> after:
>>>> free_low_memory_core_early() 285ms
>>>>
>>>> Signed-off-by: Yajun Deng <[email protected]>
>>>> ---
>>>> v4: same with v2.
>>>> v3: same with v2.
>>>> v2: check page count instead of check context before reset it.
>>>> v1: https://lore.kernel.org/all/[email protected]/
>>>> ---
>>>> mm/mm_init.c | 18 +++++++++++++-----
>>>> mm/page_alloc.c | 20 ++++++++++++--------
>>>> 2 files changed, 25 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>> --- a/mm/mm_init.c
>>>> +++ b/mm/mm_init.c
>>>> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
>>>> if (zone_spans_pfn(zone, pfn))
>>>> break;
>>>> }
>>>> - __init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
>>>> + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>> }
>>>> #else
>>>> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>>>> init_reserved_page(start_pfn, nid);
>>>> - /* Avoid false-positive PageTail() */
>>>> - INIT_LIST_HEAD(&page->lru);
>>>> + /* Init page count for reserved region */
>>> Please add a comment that describes _why_ we initialize the page count here.
>> Okay.
>>>> + init_page_count(page);
>>>> /*
>>>> * no need for atomic set_bit because the struct
>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
>>>> }
>>>> page = pfn_to_page(pfn);
>>>> - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>> - if (context == MEMINIT_HOTPLUG)
>>>> +
>>>> + /* If the context is MEMINIT_EARLY, we will init page count and
>>>> + * mark page reserved in reserve_bootmem_region, the free region
>>>> + * wouldn't have page count and we will check the pages count
>>>> + * in __free_pages_core.
>>>> + */
>>>> + __init_single_page(page, pfn, zone, nid, 0);
>>>> + if (context == MEMINIT_HOTPLUG) {
>>>> + init_page_count(page);
>>>> __SetPageReserved(page);
>>> Rather than calling init_page_count() and __SetPageReserved() for
>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>> No, the following code would cost more time than the current code in
>> memmap_init().
>>
>> if (context == MEMINIT_HOTPLUG)
>>
>> __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
>> else
>>
>> __init_single_page(page, pfn, zone, nid, 0);
> Sorry if I wasn't clear. What I meant was to have something along these lines:
>
> enum page_init_flags flags = 0;
>
> if (context == MEMINIT_HOTPLUG)
> flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
> __init_single_page(page, pfn, zone, nid, flags);
>
Okay, I'll test the time consumed in memmap_init().
>>> But more generally, I wonder if we have to differentiate HOTPLUG here at all.
>>> @David, can you comment please?
>>>
>>>> + }
>>>> /*
>>>> * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>> index 06be8821d833..b868caabe8dc 100644
>>>> --- a/mm/page_alloc.c
>>>> +++ b/mm/page_alloc.c
>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>>> unsigned int loop;
>>>> /*
>>>> - * When initializing the memmap, __init_single_page() sets the refcount
>>>> - * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>> - * refcount of all involved pages to 0.
>>>> + * When initializing the memmap, memmap_init_range sets the refcount
>>>> + * of all pages to 1 ("reserved" and "free") in hotplug context. We
>>>> + * have to set the refcount of all involved pages to 0. Otherwise,
>>>> + * we don't do it, as reserve_bootmem_region only set the refcount on
>>>> + * reserve region ("reserved") in early context.
>>>> */
>>> Again, why hotplug and early init should be different?
>> I will add a comment that describes it will save boot time.
> But why do we need initialize struct pages differently at boot time vs
> memory hotplug?
> Is there a reason memory hotplug cannot have page count set to 0 just like
> for pages reserved at boot time?
>
This patch just saves boot time in MEMINIT_EARLY. If someone finds out
that it can also save time in MEMINIT_HOTPLUG, I think that can be done
in another patch later. I am just keeping that path the same for now.
>>>> - prefetchw(p);
>>>> - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>> - prefetchw(p + 1);
>>>> + if (page_count(page)) {
>>>> + prefetchw(p);
>>>> + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>> + prefetchw(p + 1);
>>>> + __ClearPageReserved(p);
>>>> + set_page_count(p, 0);
>>>> + }
>>>> __ClearPageReserved(p);
>>>> set_page_count(p, 0);
>>>> }
>>>> - __ClearPageReserved(p);
>>>> - set_page_count(p, 0);
>>>> atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
>>>> --
>>>> 2.25.1
>>>>
On 2023/9/29 16:30, Mike Rapoport wrote:
> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>> memmap_init_range() would init page count of all pages, but the free
>> pages count would be reset in __free_pages_core(). There are opposite
>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>> context.
>>
>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
>> and check the page count before reset it.
>>
>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>> need, as it already done in __init_single_page.
>>
>> The following data was tested on an x86 machine with 190GB of RAM.
>>
>> before:
>> free_low_memory_core_early() 341ms
>>
>> after:
>> free_low_memory_core_early() 285ms
>>
>> Signed-off-by: Yajun Deng <[email protected]>
>> ---
>> v4: same with v2.
>> v3: same with v2.
>> v2: check page count instead of check context before reset it.
>> v1: https://lore.kernel.org/all/[email protected]/
>> ---
>> mm/mm_init.c | 18 +++++++++++++-----
>> mm/page_alloc.c | 20 ++++++++++++--------
>> 2 files changed, 25 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>> index 9716c8a7ade9..3ab8861e1ef3 100644
>> --- a/mm/mm_init.c
>> +++ b/mm/mm_init.c
>> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
>> if (zone_spans_pfn(zone, pfn))
>> break;
>> }
>> - __init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
>> + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>> }
>> #else
>> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>>
>> init_reserved_page(start_pfn, nid);
>>
>> - /* Avoid false-positive PageTail() */
>> - INIT_LIST_HEAD(&page->lru);
>> + /* Init page count for reserved region */
> Please add a comment that describes _why_ we initialize the page count here.
Okay.
>
>> + init_page_count(page);
>>
>> /*
>> * no need for atomic set_bit because the struct
>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
>> }
>>
>> page = pfn_to_page(pfn);
>> - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>> - if (context == MEMINIT_HOTPLUG)
>> +
>> + /* If the context is MEMINIT_EARLY, we will init page count and
>> + * mark page reserved in reserve_bootmem_region, the free region
>> + * wouldn't have page count and we will check the pages count
>> + * in __free_pages_core.
>> + */
>> + __init_single_page(page, pfn, zone, nid, 0);
>> + if (context == MEMINIT_HOTPLUG) {
>> + init_page_count(page);
>> __SetPageReserved(page);
> Rather than calling init_page_count() and __SetPageReserved() for
> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
No, the following code would cost more time than the current code in
memmap_init().
	if (context == MEMINIT_HOTPLUG)
		__init_single_page(page, pfn, zone, nid,
				   INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
	else
		__init_single_page(page, pfn, zone, nid, 0);
> But more generally, I wonder if we have to differentiate HOTPLUG here at all.
> @David, can you comment please?
>
>> + }
>>
>> /*
>> * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 06be8821d833..b868caabe8dc 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>> unsigned int loop;
>>
>> /*
>> - * When initializing the memmap, __init_single_page() sets the refcount
>> - * of all pages to 1 ("allocated"/"not free"). We have to set the
>> - * refcount of all involved pages to 0.
>> + * When initializing the memmap, memmap_init_range sets the refcount
>> + * of all pages to 1 ("reserved" and "free") in hotplug context. We
>> + * have to set the refcount of all involved pages to 0. Otherwise,
>> + * we don't do it, as reserve_bootmem_region only set the refcount on
>> + * reserve region ("reserved") in early context.
>> */
> Again, why hotplug and early init should be different?
I will add a comment describing that this saves boot time.
>
>> - prefetchw(p);
>> - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>> - prefetchw(p + 1);
>> + if (page_count(page)) {
>> + prefetchw(p);
>> + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>> + prefetchw(p + 1);
>> + __ClearPageReserved(p);
>> + set_page_count(p, 0);
>> + }
>> __ClearPageReserved(p);
>> set_page_count(p, 0);
>> }
>> - __ClearPageReserved(p);
>> - set_page_count(p, 0);
>>
>> atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
>>
>> --
>> 2.25.1
>>
On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
>
> On 2023/9/29 18:02, Mike Rapoport wrote:
> > On Fri, Sep 29, 2023 at 05:50:40PM +0800, Yajun Deng wrote:
> > > On 2023/9/29 16:30, Mike Rapoport wrote:
> > > > On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
> > > > > memmap_init_range() would init page count of all pages, but the free
> > > > > pages count would be reset in __free_pages_core(). There are opposite
> > > > > operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
> > > > > context.
> > > > >
> > > > > Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
> > > > > and check the page count before reset it.
> > > > >
> > > > > At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
> > > > > need, as it already done in __init_single_page.
> > > > >
> > > > > The following data was tested on an x86 machine with 190GB of RAM.
> > > > >
> > > > > before:
> > > > > free_low_memory_core_early() 341ms
> > > > >
> > > > > after:
> > > > > free_low_memory_core_early() 285ms
> > > > >
> > > > > Signed-off-by: Yajun Deng <[email protected]>
> > > > > ---
> > > > > v4: same with v2.
> > > > > v3: same with v2.
> > > > > v2: check page count instead of check context before reset it.
> > > > > v1: https://lore.kernel.org/all/[email protected]/
> > > > > ---
> > > > > mm/mm_init.c | 18 +++++++++++++-----
> > > > > mm/page_alloc.c | 20 ++++++++++++--------
> > > > > 2 files changed, 25 insertions(+), 13 deletions(-)
> > > > >
> > > > > diff --git a/mm/mm_init.c b/mm/mm_init.c
> > > > > index 9716c8a7ade9..3ab8861e1ef3 100644
> > > > > --- a/mm/mm_init.c
> > > > > +++ b/mm/mm_init.c
> > > > > @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
> > > > > if (zone_spans_pfn(zone, pfn))
> > > > > break;
> > > > > }
> > > > > - __init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
> > > > > + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
> > > > > }
> > > > > #else
> > > > > static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
> > > > > @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
> > > > > init_reserved_page(start_pfn, nid);
> > > > > - /* Avoid false-positive PageTail() */
> > > > > - INIT_LIST_HEAD(&page->lru);
> > > > > + /* Init page count for reserved region */
> > > > Please add a comment that describes _why_ we initialize the page count here.
> > > Okay.
> > > > > + init_page_count(page);
> > > > > /*
> > > > > * no need for atomic set_bit because the struct
> > > > > @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
> > > > > }
> > > > > page = pfn_to_page(pfn);
> > > > > - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
> > > > > - if (context == MEMINIT_HOTPLUG)
> > > > > +
> > > > > + /* If the context is MEMINIT_EARLY, we will init page count and
> > > > > + * mark page reserved in reserve_bootmem_region, the free region
> > > > > + * wouldn't have page count and we will check the pages count
> > > > > + * in __free_pages_core.
> > > > > + */
> > > > > + __init_single_page(page, pfn, zone, nid, 0);
> > > > > + if (context == MEMINIT_HOTPLUG) {
> > > > > + init_page_count(page);
> > > > > __SetPageReserved(page);
> > > > Rather than calling init_page_count() and __SetPageReserved() for
> > > > MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
> > > > an call __init_single_page() after the check for MEMINIT_HOTPLUG.
> > > No, the following code would cost more time than the current code in
> > > memmap_init().
> > >
> > > if (context == MEMINIT_HOTPLUG)
> > >
> > > __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
> > > else
> > >
> > > __init_single_page(page, pfn, zone, nid, 0);
> > Sorry if I wasn't clear. What I meant was to have something along these lines:
> >
> > enum page_init_flags flags = 0;
> >
> > if (context == MEMINIT_HOTPLUG)
> > flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
> > __init_single_page(page, pfn, zone, nid, flags);
> >
> Okay, I'll test the time consumed in memmap_init().
> > > > But more generally, I wonder if we have to differentiate HOTPLUG here at all.
> > > > @David, can you comment please?
> > > >
> > > > > + }
> > > > > /*
> > > > > * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
> > > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > > index 06be8821d833..b868caabe8dc 100644
> > > > > --- a/mm/page_alloc.c
> > > > > +++ b/mm/page_alloc.c
> > > > > @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
> > > > > unsigned int loop;
> > > > > /*
> > > > > - * When initializing the memmap, __init_single_page() sets the refcount
> > > > > - * of all pages to 1 ("allocated"/"not free"). We have to set the
> > > > > - * refcount of all involved pages to 0.
> > > > > + * When initializing the memmap, memmap_init_range sets the refcount
> > > > > + * of all pages to 1 ("reserved" and "free") in hotplug context. We
> > > > > + * have to set the refcount of all involved pages to 0. Otherwise,
> > > > > + * we don't do it, as reserve_bootmem_region only set the refcount on
> > > > > + * reserve region ("reserved") in early context.
> > > > > */
> > > > Again, why hotplug and early init should be different?
> > > I will add a comment that describes it will save boot time.
> > But why do we need initialize struct pages differently at boot time vs
> > memory hotplug?
> > Is there a reason memory hotplug cannot have page count set to 0 just like
> > for pages reserved at boot time?
>
> This patch just save boot time in MEMINIT_EARLY. If someone finds out that
> it can save time in
>
> MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
> keeping it in the same.
But it's not the same. It becomes slower after your patch and the code that
frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
for no apparent reason.
The if (page_count(page)) is really non-obvious...
> > > > > - prefetchw(p);
> > > > > - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> > > > > - prefetchw(p + 1);
> > > > > + if (page_count(page)) {
> > > > > + prefetchw(p);
> > > > > + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
> > > > > + prefetchw(p + 1);
> > > > > + __ClearPageReserved(p);
> > > > > + set_page_count(p, 0);
> > > > > + }
> > > > > __ClearPageReserved(p);
> > > > > set_page_count(p, 0);
> > > > > }
> > > > > - __ClearPageReserved(p);
> > > > > - set_page_count(p, 0);
> > > > > atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
> > > > > --
> > > > > 2.25.1
> > > > >
--
Sincerely yours,
Mike.
On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
>
> On 2023/10/2 02:59, Mike Rapoport wrote:
> > On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
> > > On 2023/9/29 18:02, Mike Rapoport wrote:
> > > > > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > > > > index 06be8821d833..b868caabe8dc 100644
> > > > > > > --- a/mm/page_alloc.c
> > > > > > > +++ b/mm/page_alloc.c
> > > > > > > @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
> > > > > > > unsigned int loop;
> > > > > > > /*
> > > > > > > - * When initializing the memmap, __init_single_page() sets the refcount
> > > > > > > - * of all pages to 1 ("allocated"/"not free"). We have to set the
> > > > > > > - * refcount of all involved pages to 0.
> > > > > > > + * When initializing the memmap, memmap_init_range sets the refcount
> > > > > > > + * of all pages to 1 ("reserved" and "free") in hotplug context. We
> > > > > > > + * have to set the refcount of all involved pages to 0. Otherwise,
> > > > > > > + * we don't do it, as reserve_bootmem_region only set the refcount on
> > > > > > > + * reserve region ("reserved") in early context.
> > > > > > > */
> > > > > > Again, why hotplug and early init should be different?
> > > > > I will add a comment that describes it will save boot time.
> > > > But why do we need initialize struct pages differently at boot time vs
> > > > memory hotplug?
> > > > Is there a reason memory hotplug cannot have page count set to 0 just like
> > > > for pages reserved at boot time?
> > > This patch just save boot time in MEMINIT_EARLY. If someone finds out that
> > > it can save time in
> > >
> > > MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
> > > keeping it in the same.
> > But it's not the same. It becomes slower after your patch and the code that
> > frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
> > for no apparent reason.
>
> __free_pages_core will also be called by others, such as:
> deferred_free_range, do_collection and memblock_free_late.
>
> We couldn't remove 'if (page_count(page))' even if we set page count to 0
> when MEMINIT_HOTPLUG.
That 'if' breaks the invariant that __free_pages_core is always called for
pages with an initialized page count. Adding it may lead to subtle bugs and
random memory corruption, so we don't want to add it in the first place.
--
Sincerely yours,
Mike.
On 2023/10/2 02:59, Mike Rapoport wrote:
> On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
>> On 2023/9/29 18:02, Mike Rapoport wrote:
>>> On Fri, Sep 29, 2023 at 05:50:40PM +0800, Yajun Deng wrote:
>>>> On 2023/9/29 16:30, Mike Rapoport wrote:
>>>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>>>> memmap_init_range() would init page count of all pages, but the free
>>>>>> pages count would be reset in __free_pages_core(). There are opposite
>>>>>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>>>>>> context.
>>>>>>
>>>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
>>>>>> and check the page count before reset it.
>>>>>>
>>>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>>>> need, as it already done in __init_single_page.
>>>>>>
>>>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>>>
>>>>>> before:
>>>>>> free_low_memory_core_early() 341ms
>>>>>>
>>>>>> after:
>>>>>> free_low_memory_core_early() 285ms
>>>>>>
>>>>>> Signed-off-by: Yajun Deng <[email protected]>
>>>>>> ---
>>>>>> v4: same with v2.
>>>>>> v3: same with v2.
>>>>>> v2: check page count instead of check context before reset it.
>>>>>> v1: https://lore.kernel.org/all/[email protected]/
>>>>>> ---
>>>>>> mm/mm_init.c | 18 +++++++++++++-----
>>>>>> mm/page_alloc.c | 20 ++++++++++++--------
>>>>>> 2 files changed, 25 insertions(+), 13 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>>>> --- a/mm/mm_init.c
>>>>>> +++ b/mm/mm_init.c
>>>>>> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
>>>>>> if (zone_spans_pfn(zone, pfn))
>>>>>> break;
>>>>>> }
>>>>>> - __init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
>>>>>> + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>>>> }
>>>>>> #else
>>>>>> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>>>> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>>>>>> init_reserved_page(start_pfn, nid);
>>>>>> - /* Avoid false-positive PageTail() */
>>>>>> - INIT_LIST_HEAD(&page->lru);
>>>>>> + /* Init page count for reserved region */
>>>>> Please add a comment that describes _why_ we initialize the page count here.
>>>> Okay.
>>>>>> + init_page_count(page);
>>>>>> /*
>>>>>> * no need for atomic set_bit because the struct
>>>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
>>>>>> }
>>>>>> page = pfn_to_page(pfn);
>>>>>> - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>>>> - if (context == MEMINIT_HOTPLUG)
>>>>>> +
>>>>>> + /* If the context is MEMINIT_EARLY, we will init page count and
>>>>>> + * mark page reserved in reserve_bootmem_region, the free region
>>>>>> + * wouldn't have page count and we will check the pages count
>>>>>> + * in __free_pages_core.
>>>>>> + */
>>>>>> + __init_single_page(page, pfn, zone, nid, 0);
>>>>>> + if (context == MEMINIT_HOTPLUG) {
>>>>>> + init_page_count(page);
>>>>>> __SetPageReserved(page);
>>>>> Rather than calling init_page_count() and __SetPageReserved() for
>>>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
>>>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>>> No, the following code would cost more time than the current code in
>>>> memmap_init().
>>>>
>>>> if (context == MEMINIT_HOTPLUG)
>>>>
>>>> __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT | INIT_PAGE_RESERVED);
>>>> else
>>>>
>>>> __init_single_page(page, pfn, zone, nid, 0);
>>> Sorry if I wasn't clear. What I meant was to have something along these lines:
>>>
>>> enum page_init_flags flags = 0;
>>>
>>> if (context == MEMINIT_HOTPLUG)
>>> flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
>>> __init_single_page(page, pfn, zone, nid, flags);
>>>
>> Okay, I'll test the time consumed in memmap_init().
>>>>> But more generally, I wonder if we have to differentiate HOTPLUG here at all.
>>>>> @David, can you comment please?
>>>>>
>>>>>> + }
>>>>>> /*
>>>>>> * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>> --- a/mm/page_alloc.c
>>>>>> +++ b/mm/page_alloc.c
>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>>>>> unsigned int loop;
>>>>>> /*
>>>>>> - * When initializing the memmap, __init_single_page() sets the refcount
>>>>>> - * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>>>> - * refcount of all involved pages to 0.
>>>>>> + * When initializing the memmap, memmap_init_range sets the refcount
>>>>>> + * of all pages to 1 ("reserved" and "free") in hotplug context. We
>>>>>> + * have to set the refcount of all involved pages to 0. Otherwise,
>>>>>> + * we don't do it, as reserve_bootmem_region only set the refcount on
>>>>>> + * reserve region ("reserved") in early context.
>>>>>> */
>>>>> Again, why hotplug and early init should be different?
>>>> I will add a comment that describes it will save boot time.
>>> But why do we need initialize struct pages differently at boot time vs
>>> memory hotplug?
>>> Is there a reason memory hotplug cannot have page count set to 0 just like
>>> for pages reserved at boot time?
>> This patch just save boot time in MEMINIT_EARLY. If someone finds out that
>> it can save time in
>>
>> MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
>> keeping it in the same.
> But it's not the same. It becomes slower after your patch and the code that
> frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
> for no apparent reason.
__free_pages_core will also be called by others, such as:
deferred_free_range, do_collection and memblock_free_late.
We couldn't remove 'if (page_count(page))' even if we set page count to
0 when MEMINIT_HOTPLUG.
The following data is the time consumed in online_pages_range().
start_pfn nr_pages before after
0x680000 0x80000 4.315ms 4.461ms
0x700000 0x80000 4.468ms 4.440ms
0x780000 0x80000 4.295ms 3.835ms
0x800000 0x80000 3.928ms 4.377ms
0x880000 0x80000 4.321ms 4.378ms
0x900000 0x80000 4.448ms 3.964ms
0x980000 0x80000 4.000ms 4.381ms
0xa00000 0x80000 4.459ms 4.255ms
sum 34.234ms 34.091ms
As we can see, it doesn't become slower with 'if (page_count(page))'
when MEMINIT_HOTPLUG.
> The if (page_count(page)) is really non-obvious...
>
>>>>>> - prefetchw(p);
>>>>>> - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>>> - prefetchw(p + 1);
>>>>>> + if (page_count(page)) {
>>>>>> + prefetchw(p);
>>>>>> + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>>> + prefetchw(p + 1);
>>>>>> + __ClearPageReserved(p);
>>>>>> + set_page_count(p, 0);
>>>>>> + }
>>>>>> __ClearPageReserved(p);
>>>>>> set_page_count(p, 0);
>>>>>> }
>>>>>> - __ClearPageReserved(p);
>>>>>> - set_page_count(p, 0);
>>>>>> atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
>>>>>> --
>>>>>> 2.25.1
>>>>>>
On Mon, Oct 02, 2023 at 10:56:51AM +0200, David Hildenbrand wrote:
> On 02.10.23 10:47, Mike Rapoport wrote:
> > On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
> > >
> > > On 2023/10/2 02:59, Mike Rapoport wrote:
> > > > On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
> > > > > On 2023/9/29 18:02, Mike Rapoport wrote:
> > > > > > > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > > > > > > > index 06be8821d833..b868caabe8dc 100644
> > > > > > > > > --- a/mm/page_alloc.c
> > > > > > > > > +++ b/mm/page_alloc.c
> > > > > > > > > @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
> > > > > > > > > unsigned int loop;
> > > > > > > > > /*
> > > > > > > > > - * When initializing the memmap, __init_single_page() sets the refcount
> > > > > > > > > - * of all pages to 1 ("allocated"/"not free"). We have to set the
> > > > > > > > > - * refcount of all involved pages to 0.
> > > > > > > > > + * When initializing the memmap, memmap_init_range sets the refcount
> > > > > > > > > + * of all pages to 1 ("reserved" and "free") in hotplug context. We
> > > > > > > > > + * have to set the refcount of all involved pages to 0. Otherwise,
> > > > > > > > > + * we don't do it, as reserve_bootmem_region only set the refcount on
> > > > > > > > > + * reserve region ("reserved") in early context.
> > > > > > > > > */
> > > > > > > > Again, why hotplug and early init should be different?
> > > > > > > I will add a comment that describes it will save boot time.
> > > > > > But why do we need initialize struct pages differently at boot time vs
> > > > > > memory hotplug?
> > > > > > Is there a reason memory hotplug cannot have page count set to 0 just like
> > > > > > for pages reserved at boot time?
> > > > > This patch just save boot time in MEMINIT_EARLY. If someone finds out that
> > > > > it can save time in
> > > > >
> > > > > MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
> > > > > keeping it in the same.
> > > > But it's not the same. It becomes slower after your patch and the code that
> > > > frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
> > > > for no apparent reason.
> > >
> > > __free_pages_core will also be called by others, such as:
> > > deferred_free_range, do_collection and memblock_free_late.
> > >
> > > We couldn't remove 'if (page_count(page))' even if we set page count to 0
> > > when MEMINIT_HOTPLUG.
> >
> > That 'if' breaks the invariant that __free_pages_core is always called for
> > pages with initialized page count. Adding it may lead to subtle bugs and
> > random memory corruption so we don't want to add it at the first place.
>
> As long as we have to special-case memory hotplug, we know that we are
> always coming via generic_online_page() in that case. We could either move
> some logic over there, or let __free_pages_core() know what it should do.
Looks like the patch rather special-cases MEMINIT_EARLY, although I didn't
thoroughly check the other code paths.
Anyway, relying on page_count() to be correct in different ways for
different callers of __free_pages_core() does not sound right to me.
> --
> Cheers,
>
> David / dhildenb
>
--
Sincerely yours,
Mike.
On 29.09.23 10:30, Mike Rapoport wrote:
> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>> memmap_init_range() would init page count of all pages, but the free
>> pages count would be reset in __free_pages_core(). There are opposite
>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>> context.
>>
>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY context,
>> and check the page count before reset it.
>>
>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>> need, as it already done in __init_single_page.
>>
>> The following data was tested on an x86 machine with 190GB of RAM.
>>
>> before:
>> free_low_memory_core_early() 341ms
>>
>> after:
>> free_low_memory_core_early() 285ms
>>
>> Signed-off-by: Yajun Deng <[email protected]>
>> ---
>> v4: same with v2.
>> v3: same with v2.
>> v2: check page count instead of check context before reset it.
>> v1: https://lore.kernel.org/all/[email protected]/
>> ---
>> mm/mm_init.c | 18 +++++++++++++-----
>> mm/page_alloc.c | 20 ++++++++++++--------
>> 2 files changed, 25 insertions(+), 13 deletions(-)
>>
>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>> index 9716c8a7ade9..3ab8861e1ef3 100644
>> --- a/mm/mm_init.c
>> +++ b/mm/mm_init.c
>> @@ -718,7 +718,7 @@ static void __meminit init_reserved_page(unsigned long pfn, int nid)
>> if (zone_spans_pfn(zone, pfn))
>> break;
>> }
>> - __init_single_page(pfn_to_page(pfn), pfn, zid, nid, INIT_PAGE_COUNT);
>> + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>> }
>> #else
>> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>> @@ -756,8 +756,8 @@ void __meminit reserve_bootmem_region(phys_addr_t start,
>>
>> init_reserved_page(start_pfn, nid);
>>
>> - /* Avoid false-positive PageTail() */
>> - INIT_LIST_HEAD(&page->lru);
>> + /* Init page count for reserved region */
>
> Please add a comment that describes _why_ we initialize the page count here.
>
>> + init_page_count(page);
>>
>> /*
>> * no need for atomic set_bit because the struct
>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
>> }
>>
>> page = pfn_to_page(pfn);
>> - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>> - if (context == MEMINIT_HOTPLUG)
>> +
>> + /* If the context is MEMINIT_EARLY, we will init page count and
>> + * mark page reserved in reserve_bootmem_region, the free region
>> + * wouldn't have page count and we will check the pages count
>> + * in __free_pages_core.
>> + */
>> + __init_single_page(page, pfn, zone, nid, 0);
>> + if (context == MEMINIT_HOTPLUG) {
>> + init_page_count(page);
>> __SetPageReserved(page);
>
> Rather than calling init_page_count() and __SetPageReserved() for
> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT | INIT_PAGE_RESERVED
> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>
> But more generally, I wonder if we have to differentiate HOTPLUG here at all.
> @David, can you comment please?
There are a lot of details to that, and I'll share some I can briefly think of.
1) __SetPageReserved()
I tried removing that a while ago, but there was a blocker (IIRC something about
ZONE_DEVICE). I still have the patches at [1] and I could probably take a look
if that blocker still exists (I recall that something changed at some point, but
I never had the time to follow up).
But once we stop setting the pages reserved, we might run into issues with ...
2) init_page_count()
virtio-mem, XEN balloon and HV-balloon add memory blocks that can contain holes.
set_online_page_callback() is used to intercept memory onlining and to expose
only the pages that are not holes to the buddy: calling generic_online_page() on !hole.
Holes are PageReserved but with an initialized page count. Memory offlining will fail on
PageReserved pages -- has_unmovable_pages().
At least virtio-mem clears the PageReserved flag of holes when onlining memory,
and currently relies on the page count being reasonable (so memory offlining can work).
static void virtio_mem_set_fake_offline(unsigned long pfn,
unsigned long nr_pages, bool onlined)
{
page_offline_begin();
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
__SetPageOffline(page);
if (!onlined) {
SetPageDirty(page);
/* FIXME: remove after cleanups */
ClearPageReserved(page);
}
}
page_offline_end();
}
For virtio-mem, we could initialize the page count there instead. The other PV drivers
might require a bit more thought.
[1] https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup
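
A minimal sketch of that idea for virtio-mem, assuming the hole pages simply
get their refcount initialized alongside clearing PageReserved once the
memmap init path no longer does it (illustrative only, not an actual patch):

    static void virtio_mem_set_fake_offline(unsigned long pfn,
                                            unsigned long nr_pages, bool onlined)
    {
            page_offline_begin();
            for (; nr_pages--; pfn++) {
                    struct page *page = pfn_to_page(pfn);

                    __SetPageOffline(page);
                    if (!onlined) {
                            SetPageDirty(page);
                            /*
                             * Hypothetical: give holes a sane refcount here
                             * so memory offlining keeps working even when
                             * memmap_init_range() stops setting it.
                             */
                            init_page_count(page);
                            /* FIXME: remove after cleanups */
                            ClearPageReserved(page);
                    }
            }
            page_offline_end();
    }
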
>
>> + }
>>
>> /*
>> * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 06be8821d833..b868caabe8dc 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>> unsigned int loop;
>>
>> /*
>> - * When initializing the memmap, __init_single_page() sets the refcount
>> - * of all pages to 1 ("allocated"/"not free"). We have to set the
>> - * refcount of all involved pages to 0.
>> + * When initializing the memmap, memmap_init_range sets the refcount
>> + * of all pages to 1 ("reserved" and "free") in hotplug context. We
>> + * have to set the refcount of all involved pages to 0. Otherwise,
>> + * we don't do it, as reserve_bootmem_region only set the refcount on
>> + * reserve region ("reserved") in early context.
>> */
>
> Again, why hotplug and early init should be different?
>
>> - prefetchw(p);
>> - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>> - prefetchw(p + 1);
>> + if (page_count(page)) {
>> + prefetchw(p);
>> + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>> + prefetchw(p + 1);
>> + __ClearPageReserved(p);
>> + set_page_count(p, 0);
>> + }
>> __ClearPageReserved(p);
>> set_page_count(p, 0);
That looks wrong: if the page count happened to already be 0 for hotplugged
memory, you wouldn't clear the reserved flag.
These changes make me a bit nervous.
--
Cheers,
David / dhildenb
On 02.10.23 13:10, Mike Rapoport wrote:
> On Mon, Oct 02, 2023 at 10:56:51AM +0200, David Hildenbrand wrote:
>> On 02.10.23 10:47, Mike Rapoport wrote:
>>> On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
>>>>
>>>> On 2023/10/2 02:59, Mike Rapoport wrote:
>>>>> On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
>>>>>> On 2023/9/29 18:02, Mike Rapoport wrote:
>>>>>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>>>>>> --- a/mm/page_alloc.c
>>>>>>>>>> +++ b/mm/page_alloc.c
>>>>>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>>>>>>>>> unsigned int loop;
>>>>>>>>>> /*
>>>>>>>>>> - * When initializing the memmap, __init_single_page() sets the refcount
>>>>>>>>>> - * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>>>>>>>> - * refcount of all involved pages to 0.
>>>>>>>>>> + * When initializing the memmap, memmap_init_range sets the refcount
>>>>>>>>>> + * of all pages to 1 ("reserved" and "free") in hotplug context. We
>>>>>>>>>> + * have to set the refcount of all involved pages to 0. Otherwise,
>>>>>>>>>> + * we don't do it, as reserve_bootmem_region only set the refcount on
>>>>>>>>>> + * reserve region ("reserved") in early context.
>>>>>>>>>> */
>>>>>>>>> Again, why hotplug and early init should be different?
>>>>>>>> I will add a comment that describes it will save boot time.
>>>>>>> But why do we need initialize struct pages differently at boot time vs
>>>>>>> memory hotplug?
>>>>>>> Is there a reason memory hotplug cannot have page count set to 0 just like
>>>>>>> for pages reserved at boot time?
>>>>>> This patch just save boot time in MEMINIT_EARLY. If someone finds out that
>>>>>> it can save time in
>>>>>>
>>>>>> MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
>>>>>> keeping it in the same.
>>>>> But it's not the same. It becomes slower after your patch and the code that
>>>>> frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
>>>>> for no apparent reason.
>>>>
>>>> __free_pages_core will also be called by others, such as:
>>>> deferred_free_range, do_collection and memblock_free_late.
>>>>
>>>> We couldn't remove 'if (page_count(page))' even if we set page count to 0
>>>> when MEMINIT_HOTPLUG.
>>>
>>> That 'if' breaks the invariant that __free_pages_core is always called for
>>> pages with initialized page count. Adding it may lead to subtle bugs and
>>> random memory corruption so we don't want to add it at the first place.
>>
>> As long as we have to special-case memory hotplug, we know that we are
>> always coming via generic_online_page() in that case. We could either move
>> some logic over there, or let __free_pages_core() know what it should do.
>
> Looks like the patch rather special cases MEMINIT_EARLY, although I didn't
> check throughfully other code paths.
> Anyway, relying on page_count() to be correct in different ways for
> different callers of __free_pages_core() does not sound right to me.
Absolutely agreed.
--
Cheers,
David / dhildenb
On 02.10.23 10:47, Mike Rapoport wrote:
> On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
>>
>> On 2023/10/2 02:59, Mike Rapoport wrote:
>>> On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
>>>> On 2023/9/29 18:02, Mike Rapoport wrote:
>>>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>>>> --- a/mm/page_alloc.c
>>>>>>>> +++ b/mm/page_alloc.c
>>>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page, unsigned int order)
>>>>>>>> unsigned int loop;
>>>>>>>> /*
>>>>>>>> - * When initializing the memmap, __init_single_page() sets the refcount
>>>>>>>> - * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>>>>>> - * refcount of all involved pages to 0.
>>>>>>>> + * When initializing the memmap, memmap_init_range sets the refcount
>>>>>>>> + * of all pages to 1 ("reserved" and "free") in hotplug context. We
>>>>>>>> + * have to set the refcount of all involved pages to 0. Otherwise,
>>>>>>>> + * we don't do it, as reserve_bootmem_region only set the refcount on
>>>>>>>> + * reserve region ("reserved") in early context.
>>>>>>>> */
>>>>>>> Again, why hotplug and early init should be different?
>>>>>> I will add a comment that describes it will save boot time.
>>>>> But why do we need initialize struct pages differently at boot time vs
>>>>> memory hotplug?
>>>>> Is there a reason memory hotplug cannot have page count set to 0 just like
>>>>> for pages reserved at boot time?
>>>> This patch just save boot time in MEMINIT_EARLY. If someone finds out that
>>>> it can save time in
>>>>
>>>> MEMINIT_HOTPLUG, I think it can be done in another patch later. I just
>>>> keeping it in the same.
>>> But it's not the same. It becomes slower after your patch and the code that
>>> frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes non-uniform
>>> for no apparent reason.
>>
>> __free_pages_core will also be called by others, such as:
>> deferred_free_range, do_collection and memblock_free_late.
>>
>> We couldn't remove 'if (page_count(page))' even if we set page count to 0
>> when MEMINIT_HOTPLUG.
>
> That 'if' breaks the invariant that __free_pages_core is always called for
> pages with initialized page count. Adding it may lead to subtle bugs and
> random memory corruption so we don't want to add it at the first place.
As long as we have to special-case memory hotplug, we know that we are
always coming via generic_online_page() in that case. We could either
move some logic over there, or let __free_pages_core() know what it
should do.
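
A rough sketch of that second option, assuming __free_pages_core() grew an
explicit context argument; the extra parameter and the reshuffled loop below
are hypothetical, not existing kernel code:

    void __free_pages_core(struct page *page, unsigned int order,
                           enum meminit_context context)
    {
            unsigned int nr_pages = 1 << order;
            struct page *p = page;
            unsigned int loop;

            if (context == MEMINIT_HOTPLUG) {
                    /*
                     * Only hotplugged ranges are still initialized with a
                     * refcount of 1 and PageReserved set, so undo that
                     * before handing the pages to the buddy.
                     */
                    for (loop = 0; loop < nr_pages; loop++, p++) {
                            __ClearPageReserved(p);
                            set_page_count(p, 0);
                    }
            }

            atomic_long_add(nr_pages, &page_zone(page)->managed_pages);

            /* ... remainder of __free_pages_core() unchanged ... */
    }
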
--
Cheers,
David / dhildenb
On 2023/10/2 19:25, David Hildenbrand wrote:
> On 02.10.23 13:10, Mike Rapoport wrote:
>> On Mon, Oct 02, 2023 at 10:56:51AM +0200, David Hildenbrand wrote:
>>> On 02.10.23 10:47, Mike Rapoport wrote:
>>>> On Mon, Oct 02, 2023 at 03:03:56PM +0800, Yajun Deng wrote:
>>>>>
>>>>> On 2023/10/2 02:59, Mike Rapoport wrote:
>>>>>> On Fri, Sep 29, 2023 at 06:27:25PM +0800, Yajun Deng wrote:
>>>>>>> On 2023/9/29 18:02, Mike Rapoport wrote:
>>>>>>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>>>>>>> --- a/mm/page_alloc.c
>>>>>>>>>>> +++ b/mm/page_alloc.c
>>>>>>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page
>>>>>>>>>>> *page, unsigned int order)
>>>>>>>>>>> unsigned int loop;
>>>>>>>>>>> /*
>>>>>>>>>>> - * When initializing the memmap, __init_single_page()
>>>>>>>>>>> sets the refcount
>>>>>>>>>>> - * of all pages to 1 ("allocated"/"not free"). We have
>>>>>>>>>>> to set the
>>>>>>>>>>> - * refcount of all involved pages to 0.
>>>>>>>>>>> + * When initializing the memmap, memmap_init_range sets
>>>>>>>>>>> the refcount
>>>>>>>>>>> + * of all pages to 1 ("reserved" and "free") in hotplug
>>>>>>>>>>> context. We
>>>>>>>>>>> + * have to set the refcount of all involved pages to 0.
>>>>>>>>>>> Otherwise,
>>>>>>>>>>> + * we don't do it, as reserve_bootmem_region only set
>>>>>>>>>>> the refcount on
>>>>>>>>>>> + * reserve region ("reserved") in early context.
>>>>>>>>>>> */
>>>>>>>>>> Again, why hotplug and early init should be different?
>>>>>>>>> I will add a comment that describes it will save boot time.
>>>>>>>> But why do we need initialize struct pages differently at boot
>>>>>>>> time vs
>>>>>>>> memory hotplug?
>>>>>>>> Is there a reason memory hotplug cannot have page count set to
>>>>>>>> 0 just like
>>>>>>>> for pages reserved at boot time?
>>>>>>> This patch just save boot time in MEMINIT_EARLY. If someone
>>>>>>> finds out that
>>>>>>> it can save time in
>>>>>>>
>>>>>>> MEMINIT_HOTPLUG, I think it can be done in another patch later.
>>>>>>> I just
>>>>>>> keeping it in the same.
>>>>>> But it's not the same. It becomes slower after your patch and the
>>>>>> code that
>>>>>> frees the pages for MEMINIT_EARLY and MEMINIT_HOTPLUG becomes
>>>>>> non-uniform
>>>>>> for no apparent reason.
>>>>>
>>>>> __free_pages_core will also be called by others, such as:
>>>>> deferred_free_range, do_collection and memblock_free_late.
>>>>>
>>>>> We couldn't remove 'if (page_count(page))' even if we set page
>>>>> count to 0
>>>>> when MEMINIT_HOTPLUG.
>>>>
>>>> That 'if' breaks the invariant that __free_pages_core is always
>>>> called for
>>>> pages with initialized page count. Adding it may lead to subtle
>>>> bugs and
>>>> random memory corruption so we don't want to add it at the first
>>>> place.
>>>
>>> As long as we have to special-case memory hotplug, we know that we are
>>> always coming via generic_online_page() in that case. We could
>>> either move
>>> some logic over there, or let __free_pages_core() know what it
>>> should do.
>>
>> Looks like the patch rather special cases MEMINIT_EARLY, although I
>> didn't
>> check throughfully other code paths.
>> Anyway, relying on page_count() to be correct in different ways for
>> different callers of __free_pages_core() does not sound right to me.
>
> Absolutely agreed.
>
I already sent v5 a few days ago. Comments, please...
On 2023/10/5 13:06, Mike Rapoport wrote:
> On Tue, Oct 03, 2023 at 10:38:09PM +0800, Yajun Deng wrote:
>> On 2023/10/2 19:25, David Hildenbrand wrote:
>>> On 02.10.23 13:10, Mike Rapoport wrote:
>>>>>> That 'if' breaks the invariant that __free_pages_core is
>>>>>> always called for
>>>>>> pages with initialized page count. Adding it may lead to
>>>>>> subtle bugs and
>>>>>> random memory corruption so we don't want to add it at the
>>>>>> first place.
>>>>> As long as we have to special-case memory hotplug, we know that we are
>>>>> always coming via generic_online_page() in that case. We could
>>>>> either move
>>>>> some logic over there, or let __free_pages_core() know what it
>>>>> should do.
>>>> Looks like the patch rather special cases MEMINIT_EARLY, although I
>>>> didn't
>>>> check throughfully other code paths.
>>>> Anyway, relying on page_count() to be correct in different ways for
>>>> different callers of __free_pages_core() does not sound right to me.
>>> Absolutely agreed.
>>>
>> I already sent v5 a few days ago. Comments, please...
> Does it address all the feedback from this thread?
>
Except hotplug. As far as I know, we only clear the page count in
MEMINIT_EARLY and for the tail pages of compound pages.
So adding 'if (page_count(page))' has no actual effect for the other
cases. According to the previous data, it didn't become slower in the
hotplug case.
On 2023/10/2 16:30, David Hildenbrand wrote:
> On 29.09.23 10:30, Mike Rapoport wrote:
>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>> memmap_init_range() would init page count of all pages, but the free
>>> pages count would be reset in __free_pages_core(). There are opposite
>>> operations. It's unnecessary and time-consuming when it's MEMINIT_EARLY
>>> context.
>>>
>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY
>>> context,
>>> and check the page count before reset it.
>>>
>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>> need, as it already done in __init_single_page.
>>>
>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>
>>> before:
>>> free_low_memory_core_early() 341ms
>>>
>>> after:
>>> free_low_memory_core_early() 285ms
>>>
>>> Signed-off-by: Yajun Deng <[email protected]>
>>> ---
>>> v4: same with v2.
>>> v3: same with v2.
>>> v2: check page count instead of check context before reset it.
>>> v1:
>>> https://lore.kernel.org/all/[email protected]/
>>> ---
>>> mm/mm_init.c | 18 +++++++++++++-----
>>> mm/page_alloc.c | 20 ++++++++++++--------
>>> 2 files changed, 25 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>> --- a/mm/mm_init.c
>>> +++ b/mm/mm_init.c
>>> @@ -718,7 +718,7 @@ static void __meminit
>>> init_reserved_page(unsigned long pfn, int nid)
>>> if (zone_spans_pfn(zone, pfn))
>>> break;
>>> }
>>> - __init_single_page(pfn_to_page(pfn), pfn, zid, nid,
>>> INIT_PAGE_COUNT);
>>> + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>> }
>>> #else
>>> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>> @@ -756,8 +756,8 @@ void __meminit
>>> reserve_bootmem_region(phys_addr_t start,
>>> init_reserved_page(start_pfn, nid);
>>> - /* Avoid false-positive PageTail() */
>>> - INIT_LIST_HEAD(&page->lru);
>>> + /* Init page count for reserved region */
>>
>> Please add a comment that describes _why_ we initialize the page
>> count here.
>>
>>> + init_page_count(page);
>>> /*
>>> * no need for atomic set_bit because the struct
>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long
>>> size, int nid, unsigned long zone
>>> }
>>> page = pfn_to_page(pfn);
>>> - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>> - if (context == MEMINIT_HOTPLUG)
>>> +
>>> + /* If the context is MEMINIT_EARLY, we will init page count
>>> and
>>> + * mark page reserved in reserve_bootmem_region, the free
>>> region
>>> + * wouldn't have page count and we will check the pages count
>>> + * in __free_pages_core.
>>> + */
>>> + __init_single_page(page, pfn, zone, nid, 0);
>>> + if (context == MEMINIT_HOTPLUG) {
>>> + init_page_count(page);
>>> __SetPageReserved(page);
>>
>> Rather than calling init_page_count() and __SetPageReserved() for
>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT |
>> INIT_PAGE_RESERVED
>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>
>> But more generally, I wonder if we have to differentiate HOTPLUG here
>> at all.
>> @David, can you comment please?
>
> There are a lot of details to that, and I'll share some I can briefly
> think of.
>
> 1) __SetPageReserved()
>
> I tried removing that a while ago, but there was a blocker (IIRC
> something about
> ZONE_DEVICE). I still have the patches at [1] and I could probably
> take a look
> if that blocker still exists (I recall that something changed at some
> point, but
> I never had the time to follow up).
>
> But once we stop setting the pages reserved, we might run into issues
> with ...
>
>
> 2) init_page_count()
>
> virtio-mem, XEN balloon and HV-balloon add memory blocks that can
> contain holes.
> set_online_page_callback() is used to intercept memory onlining and to
> expose
> only the pages that are not holes to the buddy: calling
> generic_online_page() on !hole.
>
> Holes are PageReserved but with an initialized page count. Memory
> offlining will fail on
> PageReserved pages -- has_unmovable_pages().
>
>
> At least virtio-mem clears the PageReserved flag of holes when
> onlining memory,
> and currently relies in the page count to be reasonable (so memory
> offlining can work).
>
> static void virtio_mem_set_fake_offline(unsigned long pfn,
> unsigned long nr_pages, bool onlined)
> {
> page_offline_begin();
> for (; nr_pages--; pfn++) {
> struct page *page = pfn_to_page(pfn);
>
> __SetPageOffline(page);
> if (!onlined) {
> SetPageDirty(page);
> /* FIXME: remove after cleanups */
> ClearPageReserved(page);
> }
> }
> page_offline_end();
> }
>
>
> For virtio-mem, we could initialize the page count there instead. The
> other PV drivers
> might require a bit more thought.
>
>
> [1]
> https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup
>
>>
>>> + }
>>> /*
>>> * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 06be8821d833..b868caabe8dc 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page,
>>> unsigned int order)
>>> unsigned int loop;
>>> /*
>>> - * When initializing the memmap, __init_single_page() sets the
>>> refcount
>>> - * of all pages to 1 ("allocated"/"not free"). We have to set the
>>> - * refcount of all involved pages to 0.
>>> + * When initializing the memmap, memmap_init_range sets the
>>> refcount
>>> + * of all pages to 1 ("reserved" and "free") in hotplug
>>> context. We
>>> + * have to set the refcount of all involved pages to 0. Otherwise,
>>> + * we don't do it, as reserve_bootmem_region only set the
>>> refcount on
>>> + * reserve region ("reserved") in early context.
>>> */
>>
>> Again, why hotplug and early init should be different?
>>
>>> - prefetchw(p);
>>> - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>> - prefetchw(p + 1);
>>> + if (page_count(page)) {
>>> + prefetchw(p);
>>> + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>> + prefetchw(p + 1);
>>> + __ClearPageReserved(p);
>>> + set_page_count(p, 0);
>>> + }
>>> __ClearPageReserved(p);
>>> set_page_count(p, 0);
>
> That looks wrong. if the page count would by pure luck be 0 already
> for hotplugged memory,
> you wouldn't clear the reserved flag.
>
> These changes make me a bit nervous.
Would 'if (page_count(page) || PageReserved(page))' be safer? Or do I need
to do something else?
On 2023/10/8 16:57, Yajun Deng wrote:
>
> On 2023/10/2 16:30, David Hildenbrand wrote:
>> On 29.09.23 10:30, Mike Rapoport wrote:
>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>> memmap_init_range() would init page count of all pages, but the free
>>>> pages count would be reset in __free_pages_core(). There are opposite
>>>> operations. It's unnecessary and time-consuming when it's
>>>> MEMINIT_EARLY
>>>> context.
>>>>
>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY
>>>> context,
>>>> and check the page count before reset it.
>>>>
>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>> need, as it already done in __init_single_page.
>>>>
>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>
>>>> before:
>>>> free_low_memory_core_early() 341ms
>>>>
>>>> after:
>>>> free_low_memory_core_early() 285ms
>>>>
>>>> Signed-off-by: Yajun Deng <[email protected]>
>>>> ---
>>>> v4: same with v2.
>>>> v3: same with v2.
>>>> v2: check page count instead of check context before reset it.
>>>> v1:
>>>> https://lore.kernel.org/all/[email protected]/
>>>> ---
>>>> mm/mm_init.c | 18 +++++++++++++-----
>>>> mm/page_alloc.c | 20 ++++++++++++--------
>>>> 2 files changed, 25 insertions(+), 13 deletions(-)
>>>>
>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>> --- a/mm/mm_init.c
>>>> +++ b/mm/mm_init.c
>>>> @@ -718,7 +718,7 @@ static void __meminit
>>>> init_reserved_page(unsigned long pfn, int nid)
>>>> if (zone_spans_pfn(zone, pfn))
>>>> break;
>>>> }
>>>> - __init_single_page(pfn_to_page(pfn), pfn, zid, nid,
>>>> INIT_PAGE_COUNT);
>>>> + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>> }
>>>> #else
>>>> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>> @@ -756,8 +756,8 @@ void __meminit
>>>> reserve_bootmem_region(phys_addr_t start,
>>>> init_reserved_page(start_pfn, nid);
>>>> - /* Avoid false-positive PageTail() */
>>>> - INIT_LIST_HEAD(&page->lru);
>>>> + /* Init page count for reserved region */
>>>
>>> Please add a comment that describes _why_ we initialize the page
>>> count here.
>>>
>>>> + init_page_count(page);
>>>> /*
>>>> * no need for atomic set_bit because the struct
>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long
>>>> size, int nid, unsigned long zone
>>>> }
>>>> page = pfn_to_page(pfn);
>>>> - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>> - if (context == MEMINIT_HOTPLUG)
>>>> +
>>>> + /* If the context is MEMINIT_EARLY, we will init page
>>>> count and
>>>> + * mark page reserved in reserve_bootmem_region, the free
>>>> region
>>>> + * wouldn't have page count and we will check the pages count
>>>> + * in __free_pages_core.
>>>> + */
>>>> + __init_single_page(page, pfn, zone, nid, 0);
>>>> + if (context == MEMINIT_HOTPLUG) {
>>>> + init_page_count(page);
>>>> __SetPageReserved(page);
>>>
>>> Rather than calling init_page_count() and __SetPageReserved() for
>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT |
>>> INIT_PAGE_RESERVED
>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>>
>>> But more generally, I wonder if we have to differentiate HOTPLUG
>>> here at all.
>>> @David, can you comment please?
>>
>> There are a lot of details to that, and I'll share some I can briefly
>> think of.
>>
>> 1) __SetPageReserved()
>>
>> I tried removing that a while ago, but there was a blocker (IIRC
>> something about
>> ZONE_DEVICE). I still have the patches at [1] and I could probably
>> take a look
>> if that blocker still exists (I recall that something changed at some
>> point, but
>> I never had the time to follow up).
>>
>> But once we stop setting the pages reserved, we might run into issues
>> with ...
>>
>>
>> 2) init_page_count()
>>
>> virtio-mem, XEN balloon and HV-balloon add memory blocks that can
>> contain holes.
>> set_online_page_callback() is used to intercept memory onlining and
>> to expose
>> only the pages that are not holes to the buddy: calling
>> generic_online_page() on !hole.
>>
>> Holes are PageReserved but with an initialized page count. Memory
>> offlining will fail on
>> PageReserved pages -- has_unmovable_pages().
>>
>>
>> At least virtio-mem clears the PageReserved flag of holes when
>> onlining memory,
>> and currently relies in the page count to be reasonable (so memory
>> offlining can work).
>>
>> static void virtio_mem_set_fake_offline(unsigned long pfn,
>> unsigned long nr_pages, bool onlined)
>> {
>> page_offline_begin();
>> for (; nr_pages--; pfn++) {
>> struct page *page = pfn_to_page(pfn);
>>
>> __SetPageOffline(page);
>> if (!onlined) {
>> SetPageDirty(page);
>> /* FIXME: remove after cleanups */
>> ClearPageReserved(page);
>> }
>> }
>> page_offline_end();
>> }
>>
>>
>> For virtio-mem, we could initialize the page count there instead. The
>> other PV drivers
>> might require a bit more thought.
>>
>>
>> [1]
>> https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup
>>
>>>
>>>> + }
>>>> /*
>>>> * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>> index 06be8821d833..b868caabe8dc 100644
>>>> --- a/mm/page_alloc.c
>>>> +++ b/mm/page_alloc.c
>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page,
>>>> unsigned int order)
>>>> unsigned int loop;
>>>> /*
>>>> - * When initializing the memmap, __init_single_page() sets the
>>>> refcount
>>>> - * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>> - * refcount of all involved pages to 0.
>>>> + * When initializing the memmap, memmap_init_range sets the
>>>> refcount
>>>> + * of all pages to 1 ("reserved" and "free") in hotplug
>>>> context. We
>>>> + * have to set the refcount of all involved pages to 0.
>>>> Otherwise,
>>>> + * we don't do it, as reserve_bootmem_region only set the
>>>> refcount on
>>>> + * reserve region ("reserved") in early context.
>>>> */
>>>
>>> Again, why hotplug and early init should be different?
>>>
>>>> - prefetchw(p);
>>>> - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>> - prefetchw(p + 1);
>>>> + if (page_count(page)) {
>>>> + prefetchw(p);
>>>> + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>> + prefetchw(p + 1);
>>>> + __ClearPageReserved(p);
>>>> + set_page_count(p, 0);
>>>> + }
>>>> __ClearPageReserved(p);
>>>> set_page_count(p, 0);
>>
>> That looks wrong. if the page count would by pure luck be 0 already
>> for hotplugged memory,
>> you wouldn't clear the reserved flag.
>>
>> These changes make me a bit nervous.
>
>
> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
> need to do something else?
>
How about the following if statement? It would need an additional patch
like the one in v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
It'll be safer, but more complex. Please comment...
if (context != MEMINIT_EARLY || (page_count(page) || PageReserved(page)) {
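Roughly, and assuming __free_pages_core() learns the caller's context
(for example via the MEMINIT_LATE approach from v1), the hunk would
become something like this untested sketch:

	/*
	 * Only MEMINIT_EARLY callers may skip the reset, and even then only
	 * for pages that never got a refcount or PG_reserved.
	 */
	if (context != MEMINIT_EARLY ||
	    page_count(page) || PageReserved(page)) {
		prefetchw(p);
		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
			prefetchw(p + 1);
			__ClearPageReserved(p);
			set_page_count(p, 0);
		}
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}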
On Thu, Oct 05, 2023 at 10:04:28PM +0800, Yajun Deng wrote:
>
> > > > > > > That 'if' breaks the invariant that __free_pages_core is
> > > > > > > always called for pages with initialized page count. Adding
> > > > > > > it may lead to subtle bugs and random memory corruption so we
> > > > > > > don't want to add it at the first place.
> > > > > >
> > > > > > As long as we have to special-case memory hotplug, we know that
> > > > > > we are always coming via generic_online_page() in that case. We
> > > > > > could either move some logic over there, or let
> > > > > > __free_pages_core() know what it should do.
> > > > >
> > > > > Looks like the patch rather special cases MEMINIT_EARLY, although
> > > > > I didn't check throughfully other code paths. Anyway, relying on
> > > > > page_count() to be correct in different ways for different
> > > > > callers of __free_pages_core() does not sound right to me.
> > > >
> > > > Absolutely agreed.
> > > >
> > > I already sent v5 a few days ago. Comments, please...
> >
> > Does it address all the feedback from this thread?
>
> Except hotplug.
Please reread carefully the last comments from me and from David above.
--
Sincerely yours,
Mike.
On 10.10.23 04:31, Yajun Deng wrote:
>
> On 2023/10/8 16:57, Yajun Deng wrote:
>>
>> On 2023/10/2 16:30, David Hildenbrand wrote:
>>> On 29.09.23 10:30, Mike Rapoport wrote:
>>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>>> memmap_init_range() would init page count of all pages, but the free
>>>>> pages count would be reset in __free_pages_core(). There are opposite
>>>>> operations. It's unnecessary and time-consuming when it's
>>>>> MEMINIT_EARLY
>>>>> context.
>>>>>
>>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY
>>>>> context,
>>>>> and check the page count before reset it.
>>>>>
>>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>>> need, as it already done in __init_single_page.
>>>>>
>>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>>
>>>>> before:
>>>>> free_low_memory_core_early() 341ms
>>>>>
>>>>> after:
>>>>> free_low_memory_core_early() 285ms
>>>>>
>>>>> Signed-off-by: Yajun Deng <[email protected]>
>>>>> ---
>>>>> v4: same with v2.
>>>>> v3: same with v2.
>>>>> v2: check page count instead of check context before reset it.
>>>>> v1:
>>>>> https://lore.kernel.org/all/[email protected]/
>>>>> ---
>>>>> mm/mm_init.c | 18 +++++++++++++-----
>>>>> mm/page_alloc.c | 20 ++++++++++++--------
>>>>> 2 files changed, 25 insertions(+), 13 deletions(-)
>>>>>
>>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>>> --- a/mm/mm_init.c
>>>>> +++ b/mm/mm_init.c
>>>>> @@ -718,7 +718,7 @@ static void __meminit
>>>>> init_reserved_page(unsigned long pfn, int nid)
>>>>> if (zone_spans_pfn(zone, pfn))
>>>>> break;
>>>>> }
>>>>> - __init_single_page(pfn_to_page(pfn), pfn, zid, nid,
>>>>> INIT_PAGE_COUNT);
>>>>> + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>>> }
>>>>> #else
>>>>> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>>> @@ -756,8 +756,8 @@ void __meminit
>>>>> reserve_bootmem_region(phys_addr_t start,
>>>>> init_reserved_page(start_pfn, nid);
>>>>> - /* Avoid false-positive PageTail() */
>>>>> - INIT_LIST_HEAD(&page->lru);
>>>>> + /* Init page count for reserved region */
>>>>
>>>> Please add a comment that describes _why_ we initialize the page
>>>> count here.
>>>>
>>>>> + init_page_count(page);
>>>>> /*
>>>>> * no need for atomic set_bit because the struct
>>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long
>>>>> size, int nid, unsigned long zone
>>>>> }
>>>>> page = pfn_to_page(pfn);
>>>>> - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>>> - if (context == MEMINIT_HOTPLUG)
>>>>> +
>>>>> + /* If the context is MEMINIT_EARLY, we will init page
>>>>> count and
>>>>> + * mark page reserved in reserve_bootmem_region, the free
>>>>> region
>>>>> + * wouldn't have page count and we will check the pages count
>>>>> + * in __free_pages_core.
>>>>> + */
>>>>> + __init_single_page(page, pfn, zone, nid, 0);
>>>>> + if (context == MEMINIT_HOTPLUG) {
>>>>> + init_page_count(page);
>>>>> __SetPageReserved(page);
>>>>
>>>> Rather than calling init_page_count() and __SetPageReserved() for
>>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT |
>>>> INIT_PAGE_RESERVED
>>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>>>
>>>> But more generally, I wonder if we have to differentiate HOTPLUG
>>>> here at all.
>>>> @David, can you comment please?
>>>
>>> There are a lot of details to that, and I'll share some I can briefly
>>> think of.
>>>
>>> 1) __SetPageReserved()
>>>
>>> I tried removing that a while ago, but there was a blocker (IIRC
>>> something about
>>> ZONE_DEVICE). I still have the patches at [1] and I could probably
>>> take a look
>>> if that blocker still exists (I recall that something changed at some
>>> point, but
>>> I never had the time to follow up).
>>>
>>> But once we stop setting the pages reserved, we might run into issues
>>> with ...
>>>
>>>
>>> 2) init_page_count()
>>>
>>> virtio-mem, XEN balloon and HV-balloon add memory blocks that can
>>> contain holes.
>>> set_online_page_callback() is used to intercept memory onlining and
>>> to expose
>>> only the pages that are not holes to the buddy: calling
>>> generic_online_page() on !hole.
>>>
>>> Holes are PageReserved but with an initialized page count. Memory
>>> offlining will fail on
>>> PageReserved pages -- has_unmovable_pages().
>>>
>>>
>>> At least virtio-mem clears the PageReserved flag of holes when
>>> onlining memory,
>>> and currently relies in the page count to be reasonable (so memory
>>> offlining can work).
>>>
>>> static void virtio_mem_set_fake_offline(unsigned long pfn,
>>> unsigned long nr_pages, bool onlined)
>>> {
>>> page_offline_begin();
>>> for (; nr_pages--; pfn++) {
>>> struct page *page = pfn_to_page(pfn);
>>>
>>> __SetPageOffline(page);
>>> if (!onlined) {
>>> SetPageDirty(page);
>>> /* FIXME: remove after cleanups */
>>> ClearPageReserved(page);
>>> }
>>> }
>>> page_offline_end();
>>> }
>>>
>>>
>>> For virtio-mem, we could initialize the page count there instead. The
>>> other PV drivers
>>> might require a bit more thought.
>>>
>>>
>>> [1]
>>> https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup
>>>
>>>>
>>>>> + }
>>>>> /*
>>>>> * Usually, we want to mark the pageblock MIGRATE_MOVABLE,
>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>> --- a/mm/page_alloc.c
>>>>> +++ b/mm/page_alloc.c
>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page,
>>>>> unsigned int order)
>>>>> unsigned int loop;
>>>>> /*
>>>>> - * When initializing the memmap, __init_single_page() sets the
>>>>> refcount
>>>>> - * of all pages to 1 ("allocated"/"not free"). We have to set the
>>>>> - * refcount of all involved pages to 0.
>>>>> + * When initializing the memmap, memmap_init_range sets the
>>>>> refcount
>>>>> + * of all pages to 1 ("reserved" and "free") in hotplug
>>>>> context. We
>>>>> + * have to set the refcount of all involved pages to 0.
>>>>> Otherwise,
>>>>> + * we don't do it, as reserve_bootmem_region only set the
>>>>> refcount on
>>>>> + * reserve region ("reserved") in early context.
>>>>> */
>>>>
>>>> Again, why hotplug and early init should be different?
>>>>
>>>>> - prefetchw(p);
>>>>> - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>> - prefetchw(p + 1);
>>>>> + if (page_count(page)) {
>>>>> + prefetchw(p);
>>>>> + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>> + prefetchw(p + 1);
>>>>> + __ClearPageReserved(p);
>>>>> + set_page_count(p, 0);
>>>>> + }
>>>>> __ClearPageReserved(p);
>>>>> set_page_count(p, 0);
>>>
>>> That looks wrong. if the page count would by pure luck be 0 already
>>> for hotplugged memory,
>>> you wouldn't clear the reserved flag.
>>>
>>> These changes make me a bit nervous.
>>
>>
>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>> need to do something else?
>>
>
> How about the following if statement? But it needs to add more patch
> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>
> It'll be safer, but more complex. Please comment...
>
> if (context != MEMINIT_EARLY || (page_count(page) || PageReserved(page)) {
>
Ideally we could make initialization only depend on the context, and not
check for count or the reserved flag.
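A rough, untested sketch of that direction (the extra context argument is
hypothetical, and callers like memblock_free_late() and the deferred init
path would need a matching context, e.g. the MEMINIT_LATE idea from v1):

void __free_pages_core(struct page *page, unsigned int order,
		       enum meminit_context context)
{
	unsigned int nr_pages = 1 << order;
	struct page *p = page;
	unsigned int loop;

	/*
	 * Decide purely on the caller's context; never look at the page
	 * count or PG_reserved of the pages themselves.
	 */
	if (context != MEMINIT_EARLY) {
		for (loop = 0; loop < nr_pages; loop++, p++) {
			__ClearPageReserved(p);
			set_page_count(p, 0);
		}
	}

	atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
	/* ... rest of __free_pages_core() unchanged ... */
}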
--
Cheers,
David / dhildenb
On 2023/10/12 17:19, Mike Rapoport wrote:
> On Thu, Oct 05, 2023 at 10:04:28PM +0800, Yajun Deng wrote:
>>>>>>>> That 'if' breaks the invariant that __free_pages_core is
>>>>>>>> always called for pages with initialized page count. Adding
>>>>>>>> it may lead to subtle bugs and random memory corruption so we
>>>>>>>> don't want to add it at the first place.
>>>>>>> As long as we have to special-case memory hotplug, we know that
>>>>>>> we are always coming via generic_online_page() in that case. We
>>>>>>> could either move some logic over there, or let
>>>>>>> __free_pages_core() know what it should do.
>>>>>> Looks like the patch rather special cases MEMINIT_EARLY, although
>>>>>> I didn't check throughfully other code paths. Anyway, relying on
>>>>>> page_count() to be correct in different ways for different
>>>>>> callers of __free_pages_core() does not sound right to me.
>>>>> Absolutely agreed.
>>>>>
>>>> I already sent v5 a few days ago. Comments, please...
>>> Does it address all the feedback from this thread?
>> Except hotplug.
> Please reread carefully the last comments from me and from David above.
>
I replied about that 'if' statement in another thread. David has just
replied to me there, so let's continue the discussion in that thread.
On 2023/10/12 17:23, David Hildenbrand wrote:
> On 10.10.23 04:31, Yajun Deng wrote:
>>
>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>
>>> On 2023/10/2 16:30, David Hildenbrand wrote:
>>>> On 29.09.23 10:30, Mike Rapoport wrote:
>>>>> On Thu, Sep 28, 2023 at 04:33:02PM +0800, Yajun Deng wrote:
>>>>>> memmap_init_range() would init page count of all pages, but the free
>>>>>> pages count would be reset in __free_pages_core(). There are
>>>>>> opposite
>>>>>> operations. It's unnecessary and time-consuming when it's
>>>>>> MEMINIT_EARLY
>>>>>> context.
>>>>>>
>>>>>> Init page count in reserve_bootmem_region when in MEMINIT_EARLY
>>>>>> context,
>>>>>> and check the page count before reset it.
>>>>>>
>>>>>> At the same time, the INIT_LIST_HEAD in reserve_bootmem_region isn't
>>>>>> need, as it already done in __init_single_page.
>>>>>>
>>>>>> The following data was tested on an x86 machine with 190GB of RAM.
>>>>>>
>>>>>> before:
>>>>>> free_low_memory_core_early() 341ms
>>>>>>
>>>>>> after:
>>>>>> free_low_memory_core_early() 285ms
>>>>>>
>>>>>> Signed-off-by: Yajun Deng <[email protected]>
>>>>>> ---
>>>>>> v4: same with v2.
>>>>>> v3: same with v2.
>>>>>> v2: check page count instead of check context before reset it.
>>>>>> v1:
>>>>>> https://lore.kernel.org/all/[email protected]/
>>>>>>
>>>>>> ---
>>>>>> mm/mm_init.c | 18 +++++++++++++-----
>>>>>> mm/page_alloc.c | 20 ++++++++++++--------
>>>>>> 2 files changed, 25 insertions(+), 13 deletions(-)
>>>>>>
>>>>>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>>>>>> index 9716c8a7ade9..3ab8861e1ef3 100644
>>>>>> --- a/mm/mm_init.c
>>>>>> +++ b/mm/mm_init.c
>>>>>> @@ -718,7 +718,7 @@ static void __meminit
>>>>>> init_reserved_page(unsigned long pfn, int nid)
>>>>>> if (zone_spans_pfn(zone, pfn))
>>>>>> break;
>>>>>> }
>>>>>> - __init_single_page(pfn_to_page(pfn), pfn, zid, nid,
>>>>>> INIT_PAGE_COUNT);
>>>>>> + __init_single_page(pfn_to_page(pfn), pfn, zid, nid, 0);
>>>>>> }
>>>>>> #else
>>>>>> static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
>>>>>> @@ -756,8 +756,8 @@ void __meminit
>>>>>> reserve_bootmem_region(phys_addr_t start,
>>>>>> init_reserved_page(start_pfn, nid);
>>>>>> - /* Avoid false-positive PageTail() */
>>>>>> - INIT_LIST_HEAD(&page->lru);
>>>>>> + /* Init page count for reserved region */
>>>>>
>>>>> Please add a comment that describes _why_ we initialize the page
>>>>> count here.
>>>>>
>>>>>> + init_page_count(page);
>>>>>> /*
>>>>>> * no need for atomic set_bit because the struct
>>>>>> @@ -888,9 +888,17 @@ void __meminit memmap_init_range(unsigned long
>>>>>> size, int nid, unsigned long zone
>>>>>> }
>>>>>> page = pfn_to_page(pfn);
>>>>>> - __init_single_page(page, pfn, zone, nid, INIT_PAGE_COUNT);
>>>>>> - if (context == MEMINIT_HOTPLUG)
>>>>>> +
>>>>>> + /* If the context is MEMINIT_EARLY, we will init page
>>>>>> count and
>>>>>> + * mark page reserved in reserve_bootmem_region, the free
>>>>>> region
>>>>>> + * wouldn't have page count and we will check the pages
>>>>>> count
>>>>>> + * in __free_pages_core.
>>>>>> + */
>>>>>> + __init_single_page(page, pfn, zone, nid, 0);
>>>>>> + if (context == MEMINIT_HOTPLUG) {
>>>>>> + init_page_count(page);
>>>>>> __SetPageReserved(page);
>>>>>
>>>>> Rather than calling init_page_count() and __SetPageReserved() for
>>>>> MEMINIT_HOTPLUG you can set flags to INIT_PAGE_COUNT |
>>>>> INIT_PAGE_RESERVED
>>>>> an call __init_single_page() after the check for MEMINIT_HOTPLUG.
>>>>>
>>>>> But more generally, I wonder if we have to differentiate HOTPLUG
>>>>> here at all.
>>>>> @David, can you comment please?
>>>>
>>>> There are a lot of details to that, and I'll share some I can briefly
>>>> think of.
>>>>
>>>> 1) __SetPageReserved()
>>>>
>>>> I tried removing that a while ago, but there was a blocker (IIRC
>>>> something about
>>>> ZONE_DEVICE). I still have the patches at [1] and I could probably
>>>> take a look
>>>> if that blocker still exists (I recall that something changed at some
>>>> point, but
>>>> I never had the time to follow up).
>>>>
>>>> But once we stop setting the pages reserved, we might run into issues
>>>> with ...
>>>>
>>>>
>>>> 2) init_page_count()
>>>>
>>>> virtio-mem, XEN balloon and HV-balloon add memory blocks that can
>>>> contain holes.
>>>> set_online_page_callback() is used to intercept memory onlining and
>>>> to expose
>>>> only the pages that are not holes to the buddy: calling
>>>> generic_online_page() on !hole.
>>>>
>>>> Holes are PageReserved but with an initialized page count. Memory
>>>> offlining will fail on
>>>> PageReserved pages -- has_unmovable_pages().
>>>>
>>>>
>>>> At least virtio-mem clears the PageReserved flag of holes when
>>>> onlining memory,
>>>> and currently relies in the page count to be reasonable (so memory
>>>> offlining can work).
>>>>
>>>> static void virtio_mem_set_fake_offline(unsigned long pfn,
>>>> unsigned long nr_pages, bool onlined)
>>>> {
>>>> page_offline_begin();
>>>> for (; nr_pages--; pfn++) {
>>>> struct page *page = pfn_to_page(pfn);
>>>>
>>>> __SetPageOffline(page);
>>>> if (!onlined) {
>>>> SetPageDirty(page);
>>>> /* FIXME: remove after cleanups */
>>>> ClearPageReserved(page);
>>>> }
>>>> }
>>>> page_offline_end();
>>>> }
>>>>
>>>>
>>>> For virtio-mem, we could initialize the page count there instead. The
>>>> other PV drivers
>>>> might require a bit more thought.
>>>>
>>>>
>>>> [1]
>>>> https://github.com/davidhildenbrand/linux/tree/online_reserved_cleanup
>>>>
>>>>>
>>>>>> + }
>>>>>> /*
>>>>>> * Usually, we want to mark the pageblock
>>>>>> MIGRATE_MOVABLE,
>>>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>>>> index 06be8821d833..b868caabe8dc 100644
>>>>>> --- a/mm/page_alloc.c
>>>>>> +++ b/mm/page_alloc.c
>>>>>> @@ -1285,18 +1285,22 @@ void __free_pages_core(struct page *page,
>>>>>> unsigned int order)
>>>>>> unsigned int loop;
>>>>>> /*
>>>>>> - * When initializing the memmap, __init_single_page() sets the
>>>>>> refcount
>>>>>> - * of all pages to 1 ("allocated"/"not free"). We have to
>>>>>> set the
>>>>>> - * refcount of all involved pages to 0.
>>>>>> + * When initializing the memmap, memmap_init_range sets the
>>>>>> refcount
>>>>>> + * of all pages to 1 ("reserved" and "free") in hotplug
>>>>>> context. We
>>>>>> + * have to set the refcount of all involved pages to 0.
>>>>>> Otherwise,
>>>>>> + * we don't do it, as reserve_bootmem_region only set the
>>>>>> refcount on
>>>>>> + * reserve region ("reserved") in early context.
>>>>>> */
>>>>>
>>>>> Again, why hotplug and early init should be different?
>>>>>
>>>>>> - prefetchw(p);
>>>>>> - for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>>> - prefetchw(p + 1);
>>>>>> + if (page_count(page)) {
>>>>>> + prefetchw(p);
>>>>>> + for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
>>>>>> + prefetchw(p + 1);
>>>>>> + __ClearPageReserved(p);
>>>>>> + set_page_count(p, 0);
>>>>>> + }
>>>>>> __ClearPageReserved(p);
>>>>>> set_page_count(p, 0);
>>>>
>>>> That looks wrong. if the page count would by pure luck be 0 already
>>>> for hotplugged memory,
>>>> you wouldn't clear the reserved flag.
>>>>
>>>> These changes make me a bit nervous.
>>>
>>>
>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>>> need to do something else?
>>>
>>
>> How about the following if statement? But it needs to add more patch
>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>
>> It'll be safer, but more complex. Please comment...
>>
>> if (context != MEMINIT_EARLY || (page_count(page) ||
>> PageReserved(page)) {
>>
>
> Ideally we could make initialization only depend on the context, and
> not check for count or the reserved flag.
>
This link is v1,
https://lore.kernel.org/all/[email protected]/
If we could make initialization only depend on the context, I'll modify
it based on v1.
@Mike, By the way, this code will cost more time:
if (context == MEMINIT_HOTPLUG)
flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
__init_single_page(page, pfn, zone, nid, flags);
[ 0.014999] On node 0, zone DMA32: 31679 pages in unavailable ranges
[ 0.311560] ACPI: PM-Timer IO Port: 0x508
This code will cost less time:
__init_single_page(page, pfn, zone, nid, 0);
if (context == MEMINIT_HOTPLUG) {
init_page_count(page);
__SetPageReserved(page);
[ 0.014299] On node 0, zone DMA32: 31679 pages in unavailable ranges
[ 0.250223] ACPI: PM-Timer IO Port: 0x508
On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>
> On 2023/10/12 17:23, David Hildenbrand wrote:
> > On 10.10.23 04:31, Yajun Deng wrote:
> > >
> > > On 2023/10/8 16:57, Yajun Deng wrote:
> > > > >
> > > > > That looks wrong. if the page count would by pure luck be 0
> > > > > already for hotplugged memory, you wouldn't clear the reserved
> > > > > flag.
> > > > >
> > > > > These changes make me a bit nervous.
> > > >
> > > > Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
> > > > need to do something else?
> > > >
> > >
> > > How about the following if statement? But it needs to add more patch
> > > like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
> > >
> > > It'll be safer, but more complex. Please comment...
> > >
> > > if (context != MEMINIT_EARLY || (page_count(page) ||
> > > PageReserved(page)) {
> > >
> >
> > Ideally we could make initialization only depend on the context, and not
> > check for count or the reserved flag.
> >
>
> This link is v1,
> https://lore.kernel.org/all/[email protected]/
>
> If we could make initialization only depend on the context, I'll modify it
> based on v1.
Although a ~20% improvement looks impressive, this only optimizes a
fraction of the boot time, and realistically, how much does 56 msec save
from the total boot time when you boot a machine with 190G of RAM?
I still think the improvement does not justify the churn, added complexity
and special casing of different code paths of initialization of struct pages.
> @Mike, By the way, this code will cost more time:
>
> if (context == MEMINIT_HOTPLUG)
> flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
> __init_single_page(page, pfn, zone, nid, flags);
>
>
> [ 0.014999] On node 0, zone DMA32: 31679 pages in unavailable ranges
> [ 0.311560] ACPI: PM-Timer IO Port: 0x508
>
>
> This code will cost less time:
>
> __init_single_page(page, pfn, zone, nid, 0);
> if (context == MEMINIT_HOTPLUG) {
> init_page_count(page);
> __SetPageReserved(page);
>
> [ 0.014299] On node 0, zone DMA32: 31679 pages in unavailable ranges
> [ 0.250223] ACPI: PM-Timer IO Port: 0x508
>
--
Sincerely yours,
Mike.
On 2023/10/13 16:48, Mike Rapoport wrote:
> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>> flag.
>>>>>>
>>>>>> These changes make me a bit nervous.
>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>>>>> need to do something else?
>>>>>
>>>> How about the following if statement? But it needs to add more patch
>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>
>>>> It'll be safer, but more complex. Please comment...
>>>>
>>>> if (context != MEMINIT_EARLY || (page_count(page) ||
>>>> PageReserved(page)) {
>>>>
>>> Ideally we could make initialization only depend on the context, and not
>>> check for count or the reserved flag.
>>>
>> This link is v1,
>> https://lore.kernel.org/all/[email protected]/
>>
>> If we could make initialization only depend on the context, I'll modify it
>> based on v1.
> Although ~20% improvement looks impressive, this is only optimization of a
> fraction of the boot time, and realistically, how much 56 msec saves from
> the total boot time when you boot a machine with 190G of RAM?
There are a lot of factors that can affect the total boot time, so
saving 56 msec may be insignificant on its own.
But if we look at the boot log, we'll see there's a significant time jump.
before:
[ 0.250334] ACPI: PM-Timer IO Port: 0x508
[ 0.618994] Memory: 173413056K/199884452K available (18440K kernel
code, 4204K rwdata, 5608K rodata, 3188K init, 17024K bss, 5499616K
reserved, 20971520K cma-reserved)
after:
[ 0.260229] software IO TLB: area num 32.
[ 0.563497] Memory: 173413056K/199884452K available (18440K kernel
code, 4204K rwdata, 5608K rodata, 3188K init, 17024K bss, 5499616K
reserved, 20971520K cma-reserved)
Memory initialization is time consuming in the boot log.
> I still think the improvement does not justify the churn, added complexity
> and special casing of different code paths of initialization of struct pages.
>
Because there is a loop: if the order is MAX_ORDER, the loop will run
1024 times. The following 'if' would be safer:
'if (context != MEMINIT_EARLY || page_count(page) || PageReserved(page)) {'
This is only a foundation. We can change it once it becomes safe to
remove the page init in the hotplug context one day, so the special
case for the early context is only temporary.
>> @Mike, By the way, this code will cost more time:
>>
>> if (context == MEMINIT_HOTPLUG)
>> flags = INIT_PAGE_COUNT | INIT_PAGE_RESERVED;
>> __init_single_page(page, pfn, zone, nid, flags);
>>
>>
>> [ 0.014999] On node 0, zone DMA32: 31679 pages in unavailable ranges
>> [ 0.311560] ACPI: PM-Timer IO Port: 0x508
>>
>>
>> This code will cost less time:
>>
>> __init_single_page(page, pfn, zone, nid, 0);
>> if (context == MEMINIT_HOTPLUG) {
>> init_page_count(page);
>> __SetPageReserved(page);
>>
>> [ 0.014299] On node 0, zone DMA32: 31679 pages in unavailable ranges
>> [ 0.250223] ACPI: PM-Timer IO Port: 0x508
>>
On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>
> On 2023/10/13 16:48, Mike Rapoport wrote:
> > On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
> > > On 2023/10/12 17:23, David Hildenbrand wrote:
> > > > On 10.10.23 04:31, Yajun Deng wrote:
> > > > > On 2023/10/8 16:57, Yajun Deng wrote:
> > > > > > > That looks wrong. if the page count would by pure luck be 0
> > > > > > > already for hotplugged memory, you wouldn't clear the reserved
> > > > > > > flag.
> > > > > > >
> > > > > > > These changes make me a bit nervous.
> > > > > > Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
> > > > > > need to do something else?
> > > > > >
> > > > > How about the following if statement? But it needs to add more patch
> > > > > like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
> > > > >
> > > > > It'll be safer, but more complex. Please comment...
> > > > >
> > > > > if (context != MEMINIT_EARLY || (page_count(page) ||
> > > > > PageReserved(page)) {
> > > > >
> > > > Ideally we could make initialization only depend on the context, and not
> > > > check for count or the reserved flag.
> > > >
> > > This link is v1,
> > > https://lore.kernel.org/all/[email protected]/
> > >
> > > If we could make initialization only depend on the context, I'll modify it
> > > based on v1.
> > Although ~20% improvement looks impressive, this is only optimization of a
> > fraction of the boot time, and realistically, how much 56 msec saves from
> > the total boot time when you boot a machine with 190G of RAM?
>
> There are a lot of factors that can affect the total boot time. 56 msec
> saves may be insignificant.
>
> But if we look at the boot log, we'll see there's a significant time jump.
>
> before:
>
> [ 0.250334] ACPI: PM-Timer IO Port: 0x508
> [ 0.618994] Memory: 173413056K/199884452K available (18440K kernel code,
>
> after:
>
> [ 0.260229] software IO TLB: area num 32.
> [ 0.563497] Memory: 173413056K/199884452K available (18440K kernel code,
>
> Memory initialization is time consuming in the boot log.
You just confirmed that 56 msec is insignificant, and then you again cite
the ~60 msec improvement in memory initialization.
What does this improvement gain as a percentage of the total boot time?
> > I still think the improvement does not justify the churn, added complexity
> > and special casing of different code paths of initialization of struct pages.
>
>
> Because there is a loop, if the order is MAX_ORDER, the loop will run 1024
> times. The following 'if' would be safer:
>
> 'if (context != MEMINIT_EARLY || (page_count(page) || >> PageReserved(page))
> {'
No, it will not.
As a matter of fact, any condition here won't be 'safer', because it makes
the code more complex and less maintainable.
Any future change in __free_pages_core() or one of its callers will have
to reason about what will happen with that condition after the change.
--
Sincerely yours,
Mike.
On 2023/10/16 14:33, Mike Rapoport wrote:
> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>>>> flag.
>>>>>>>>
>>>>>>>> These changes make me a bit nervous.
>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>>>>>>> need to do something else?
>>>>>>>
>>>>>> How about the following if statement? But it needs to add more patch
>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>
>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>
>>>>>> if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>> PageReserved(page)) {
>>>>>>
>>>>> Ideally we could make initialization only depend on the context, and not
>>>>> check for count or the reserved flag.
>>>>>
>>>> This link is v1,
>>>> https://lore.kernel.org/all/[email protected]/
>>>>
>>>> If we could make initialization only depend on the context, I'll modify it
>>>> based on v1.
>>> Although ~20% improvement looks impressive, this is only optimization of a
>>> fraction of the boot time, and realistically, how much 56 msec saves from
>>> the total boot time when you boot a machine with 190G of RAM?
>> There are a lot of factors that can affect the total boot time. 56 msec
>> saves may be insignificant.
>>
>> But if we look at the boot log, we'll see there's a significant time jump.
>>
>> before:
>>
>> [ 0.250334] ACPI: PM-Timer IO Port: 0x508
>> [ 0.618994] Memory: 173413056K/199884452K available (18440K kernel code,
>>
>> after:
>>
>> [ 0.260229] software IO TLB: area num 32.
>> [ 0.563497] Memory: 173413056K/199884452K available (18440K kernel code,
>> Memory initialization is time consuming in the boot log.
> You just confirmed that 56 msec is insignificant and then you send again
> the improvement of ~60 msec in memory initialization.
>
> What does this improvement gain in percentage of total boot time?
before:
[ 10.692708] Run /init as init process
after:
[ 10.666290] Run /init as init process
About 0.25%. The total boot time is variable, depending on how many
drivers need to be initialized.
>
>>> I still think the improvement does not justify the churn, added complexity
>>> and special casing of different code paths of initialization of struct pages.
>>
>> Because there is a loop, if the order is MAX_ORDER, the loop will run 1024
>> times. The following 'if' would be safer:
>>
>> 'if (context != MEMINIT_EARLY || (page_count(page) || >> PageReserved(page))
>> {'
> No, it will not.
>
> As the matter of fact any condition here won't be 'safer' because it makes
> the code more complex and less maintainable.
> Any future change in __free_pages_core() or one of it's callers will have
> to reason what will happen with that condition after the change.
To avoid introducing a MEMINIT_LATE context and to keep the code
simpler, this might be a better option:
if (page_count(page) || PageReserved(page))
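Relative to the v4 hunk in __free_pages_core(), that would be roughly
(untested):

	if (page_count(page) || PageReserved(page)) {
		prefetchw(p);
		for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
			prefetchw(p + 1);
			__ClearPageReserved(p);
			set_page_count(p, 0);
		}
		__ClearPageReserved(p);
		set_page_count(p, 0);
	}

The extra PageReserved() check would also cover the case David pointed
out earlier, where the count happens to be 0 already but PG_reserved is
still set.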
On 16.10.23 10:10, Yajun Deng wrote:
>
> On 2023/10/16 14:33, Mike Rapoport wrote:
>> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>>>>> flag.
>>>>>>>>>
>>>>>>>>> These changes make me a bit nervous.
>>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or do I
>>>>>>>> need to do something else?
>>>>>>>>
>>>>>>> How about the following if statement? But it needs to add more patch
>>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>>
>>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>>
>>>>>>> if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>>> PageReserved(page)) {
>>>>>>>
>>>>>> Ideally we could make initialization only depend on the context, and not
>>>>>> check for count or the reserved flag.
>>>>>>
>>>>> This link is v1,
>>>>> https://lore.kernel.org/all/[email protected]/
>>>>>
>>>>> If we could make initialization only depend on the context, I'll modify it
>>>>> based on v1.
>>>> Although ~20% improvement looks impressive, this is only optimization of a
>>>> fraction of the boot time, and realistically, how much 56 msec saves from
>>>> the total boot time when you boot a machine with 190G of RAM?
>>> There are a lot of factors that can affect the total boot time. 56 msec
>>> saves may be insignificant.
>>>
>>> But if we look at the boot log, we'll see there's a significant time jump.
>>>
>>> before:
>>>
>>> [ 0.250334] ACPI: PM-Timer IO Port: 0x508
>>> [ 0.618994] Memory: 173413056K/199884452K available (18440K kernel code,
>>>
>>> after:
>>>
>>> [ 0.260229] software IO TLB: area num 32.
>>> [ 0.563497] Memory: 173413056K/199884452K available (18440K kernel code,
>>> Memory initialization is time consuming in the boot log.
>> You just confirmed that 56 msec is insignificant and then you send again
>> the improvement of ~60 msec in memory initialization.
>>
>> What does this improvement gain in percentage of total boot time?
>
>
> before:
>
> [ 10.692708] Run /init as init process
>
>
> after:
>
> [ 10.666290] Run /init as init process
>
>
> About 0.25%. The total boot time is variable, depending on how many
> drivers need to be initialized.
>
>
>>
>>>> I still think the improvement does not justify the churn, added complexity
>>>> and special casing of different code paths of initialization of struct pages.
>>>
>>> Because there is a loop, if the order is MAX_ORDER, the loop will run 1024
>>> times. The following 'if' would be safer:
>>>
>>> 'if (context != MEMINIT_EARLY || (page_count(page) || >> PageReserved(page))
>>> {'
>> No, it will not.
>>
>> As the matter of fact any condition here won't be 'safer' because it makes
>> the code more complex and less maintainable.
>> Any future change in __free_pages_core() or one of it's callers will have
>> to reason what will happen with that condition after the change.
>
>
> To avoid introducing MEMINIT_LATE context and make code simpler. This
> might be a better option.
>
> if (page_count(page) || PageReserved(page))
I'll have to side with Mike here; this change might not be worth it.
--
Cheers,
David / dhildenb
On 2023/10/16 16:16, David Hildenbrand wrote:
> On 16.10.23 10:10, Yajun Deng wrote:
>>
>> On 2023/10/16 14:33, Mike Rapoport wrote:
>>> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>>>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>>>>>> flag.
>>>>>>>>>>
>>>>>>>>>> These changes make me a bit nervous.
>>>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or
>>>>>>>>> do I
>>>>>>>>> need to do something else?
>>>>>>>>>
>>>>>>>> How about the following if statement? But it needs to add more
>>>>>>>> patch
>>>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>>>
>>>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>>>
>>>>>>>> if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>>>> PageReserved(page)) {
>>>>>>>>
>>>>>>> Ideally we could make initialization only depend on the context,
>>>>>>> and not
>>>>>>> check for count or the reserved flag.
>>>>>>>
>>>>>> This link is v1,
>>>>>> https://lore.kernel.org/all/[email protected]/
>>>>>>
>>>>>>
>>>>>> If we could make initialization only depend on the context, I'll
>>>>>> modify it
>>>>>> based on v1.
>>>>> Although ~20% improvement looks impressive, this is only
>>>>> optimization of a
>>>>> fraction of the boot time, and realistically, how much 56 msec
>>>>> saves from
>>>>> the total boot time when you boot a machine with 190G of RAM?
>>>> There are a lot of factors that can affect the total boot time. 56
>>>> msec
>>>> saves may be insignificant.
>>>>
>>>> But if we look at the boot log, we'll see there's a significant
>>>> time jump.
>>>>
>>>> before:
>>>>
>>>> [ 0.250334] ACPI: PM-Timer IO Port: 0x508
>>>> [ 0.618994] Memory: 173413056K/199884452K available (18440K
>>>> kernel code,
>>>>
>>>> after:
>>>>
>>>> [ 0.260229] software IO TLB: area num 32.
>>>> [ 0.563497] Memory: 173413056K/199884452K available (18440K
>>>> kernel code,
>>>> Memory initialization is time consuming in the boot log.
>>> You just confirmed that 56 msec is insignificant and then you send
>>> again
>>> the improvement of ~60 msec in memory initialization.
>>>
>>> What does this improvement gain in percentage of total boot time?
>>
>>
>> before:
>>
>> [ 10.692708] Run /init as init process
>>
>>
>> after:
>>
>> [ 10.666290] Run /init as init process
>>
>>
>> About 0.25%. The total boot time is variable, depending on how many
>> drivers need to be initialized.
>>
>>
>>>>> I still think the improvement does not justify the churn, added
>>>>> complexity
>>>>> and special casing of different code paths of initialization of
>>>>> struct pages.
>>>>
>>>> Because there is a loop, if the order is MAX_ORDER, the loop will
>>>> run 1024
>>>> times. The following 'if' would be safer:
>>>>
>>>> 'if (context != MEMINIT_EARLY || (page_count(page) || >>
>>>> PageReserved(page))
>>>> {'
>>> No, it will not.
>>>
>>> As the matter of fact any condition here won't be 'safer' because it
>>> makes
>>> the code more complex and less maintainable.
>>> Any future change in __free_pages_core() or one of it's callers will
>>> have
>>> to reason what will happen with that condition after the change.
>>
>>
>> To avoid introducing MEMINIT_LATE context and make code simpler. This
>> might be a better option.
>>
>> if (page_count(page) || PageReserved(page))
>
> I'll have to side with Mike here; this change might not be worth it.
>
Okay, I got it. Thanks!
On 16.10.23 10:32, Yajun Deng wrote:
>
> On 2023/10/16 16:16, David Hildenbrand wrote:
>> On 16.10.23 10:10, Yajun Deng wrote:
>>>
>>> On 2023/10/16 14:33, Mike Rapoport wrote:
>>>> On Fri, Oct 13, 2023 at 05:29:19PM +0800, Yajun Deng wrote:
>>>>> On 2023/10/13 16:48, Mike Rapoport wrote:
>>>>>> On Thu, Oct 12, 2023 at 05:53:22PM +0800, Yajun Deng wrote:
>>>>>>> On 2023/10/12 17:23, David Hildenbrand wrote:
>>>>>>>> On 10.10.23 04:31, Yajun Deng wrote:
>>>>>>>>> On 2023/10/8 16:57, Yajun Deng wrote:
>>>>>>>>>>> That looks wrong. if the page count would by pure luck be 0
>>>>>>>>>>> already for hotplugged memory, you wouldn't clear the reserved
>>>>>>>>>>> flag.
>>>>>>>>>>>
>>>>>>>>>>> These changes make me a bit nervous.
>>>>>>>>>> Is 'if (page_count(page) || PageReserved(page))' be safer? Or
>>>>>>>>>> do I
>>>>>>>>>> need to do something else?
>>>>>>>>>>
>>>>>>>>> How about the following if statement? But it needs to add more
>>>>>>>>> patch
>>>>>>>>> like v1 ([PATCH 2/4] mm: Introduce MEMINIT_LATE context).
>>>>>>>>>
>>>>>>>>> It'll be safer, but more complex. Please comment...
>>>>>>>>>
>>>>>>>>> if (context != MEMINIT_EARLY || (page_count(page) ||
>>>>>>>>> PageReserved(page)) {
>>>>>>>>>
>>>>>>>> Ideally we could make initialization only depend on the context,
>>>>>>>> and not
>>>>>>>> check for count or the reserved flag.
>>>>>>>>
>>>>>>> This link is v1,
>>>>>>> https://lore.kernel.org/all/[email protected]/
>>>>>>>
>>>>>>>
>>>>>>> If we could make initialization only depend on the context, I'll
>>>>>>> modify it
>>>>>>> based on v1.
>>>>>> Although ~20% improvement looks impressive, this is only
>>>>>> optimization of a
>>>>>> fraction of the boot time, and realistically, how much 56 msec
>>>>>> saves from
>>>>>> the total boot time when you boot a machine with 190G of RAM?
>>>>> There are a lot of factors that can affect the total boot time. 56
>>>>> msec
>>>>> saves may be insignificant.
>>>>>
>>>>> But if we look at the boot log, we'll see there's a significant
>>>>> time jump.
>>>>>
>>>>> before:
>>>>>
>>>>> [ 0.250334] ACPI: PM-Timer IO Port: 0x508
>>>>> [ 0.618994] Memory: 173413056K/199884452K available (18440K
>>>>> kernel code,
>>>>>
>>>>> after:
>>>>>
>>>>> [ 0.260229] software IO TLB: area num 32.
>>>>> [ 0.563497] Memory: 173413056K/199884452K available (18440K
>>>>> kernel code,
>>>>> Memory initialization is time consuming in the boot log.
>>>> You just confirmed that 56 msec is insignificant and then you send
>>>> again
>>>> the improvement of ~60 msec in memory initialization.
>>>>
>>>> What does this improvement gain in percentage of total boot time?
>>>
>>>
>>> before:
>>>
>>> [ 10.692708] Run /init as init process
>>>
>>>
>>> after:
>>>
>>> [ 10.666290] Run /init as init process
>>>
>>>
>>> About 0.25%. The total boot time is variable, depending on how many
>>> drivers need to be initialized.
>>>
>>>
>>>>>> I still think the improvement does not justify the churn, added
>>>>>> complexity
>>>>>> and special casing of different code paths of initialization of
>>>>>> struct pages.
>>>>>
>>>>> Because there is a loop, if the order is MAX_ORDER, the loop will
>>>>> run 1024
>>>>> times. The following 'if' would be safer:
>>>>>
>>>>> 'if (context != MEMINIT_EARLY || (page_count(page) || >>
>>>>> PageReserved(page))
>>>>> {'
>>>> No, it will not.
>>>>
>>>> As a matter of fact, any condition here won't be 'safer', because it
>>>> makes the code more complex and less maintainable.
>>>> Any future change in __free_pages_core() or one of its callers will have
>>>> to reason about what will happen with that condition after the change.
>>>
>>>
>>> To avoid introducing a MEMINIT_LATE context and to keep the code simpler,
>>> this might be a better option:
>>>
>>> if (page_count(page) || PageReserved(page))
>>
>> I'll have to side with Mike here; this change might not be worth it.
>>
>
> Okay, I got it. Thanks!
IMHO instead of adding more checks to that code we should try to unify
that handling such that we can just remove it. As expressed, at least
from the memory hotplug perspective there are still reasons why we need
that; I can provide some guidance on how to eventually achieve that, but
it might end up in a bit of work ...
Anyhow, thanks for bringing up that topic; it reminded me that I still
have pending cleanups to not rely on PageReserved on the memory hotplug
path.
--
Cheers,
David / dhildenb
On 2023/10/16 16:36, David Hildenbrand wrote:
> On 16.10.23 10:32, Yajun Deng wrote:
>> [...]
>>>> To avoid introducing a MEMINIT_LATE context and to keep the code simpler,
>>>> this might be a better option:
>>>>
>>>> if (page_count(page) || PageReserved(page))
>>>
>>> I'll have to side with Mike here; this change might not be worth it.
>>>
>>
>> Okay, I got it. Thanks!
>
> IMHO instead of adding more checks to that code we should try to unify
> that handling such that we can just remove it. As expressed, at least
> from the memory hotplug perspective there are still reasons why we
> need that; I can provide some guidance on how to eventually achieve
> that, but it might end up in a bit of work ...
Yes, we can't remove it right now. If we want to do that, we have to
clean up the reliance on the page count and PageReserved first.
>
> Anyhow, thanks for bringing up that topic; it reminded me that I still
> have pending cleanups to not rely on PageReserved on the memory
> hotplug path.
>
On 2023/10/16 18:17, Yajun Deng wrote:
>
> On 2023/10/16 16:36, David Hildenbrand wrote:
>> [...]
>>>>> To avoid introducing a MEMINIT_LATE context and to keep the code simpler,
>>>>> this might be a better option:
>>>>>
>>>>> if (page_count(page) || PageReserved(page))
>>>>
>>>> I'll have to side with Mike here; this change might not be worth it.
>>>>
>>>
>>> Okay, I got it. Thanks!
>>
>> IMHO instead of adding more checks to that code we should try to
>> unify that handling such that we can just remove it. As expressed, at
>> least from the memory hotplug perspective there are still reasons why
>> we need that; I can provide some guidance on how to eventually
>> achieve that, but it might end up in a bit of work ...
>
>
> Yes, we can't remove it right now. If we want to do that, we have to
> clean up the reliance on the page count and PageReserved first.
How about splitting __free_pages_core into two functions, like this:
void __init __free_pages_core_early(struct page *page, unsigned int order)
{
        unsigned int nr_pages = 1 << order;

        atomic_long_add(nr_pages, &page_zone(page)->managed_pages);

        if (page_contains_unaccepted(page, order)) {
                if (order == MAX_ORDER && __free_unaccepted(page))
                        return;

                accept_page(page, order);
        }

        /*
         * Bypass PCP and place fresh pages right to the tail, primarily
         * relevant for memory onlining.
         */
        __free_pages_ok(page, order, FPI_TO_TAIL);
}

void __free_pages_core(struct page *page, unsigned int order)
{
        unsigned int nr_pages = 1 << order;
        struct page *p = page;
        unsigned int loop;

        /*
         * When initializing the memmap, __init_single_page() sets the refcount
         * of all pages to 1 ("allocated"/"not free"). We have to set the
         * refcount of all involved pages to 0.
         */
        prefetchw(p);
        for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
                prefetchw(p + 1);
                __ClearPageReserved(p);
                set_page_count(p, 0);
        }
        __ClearPageReserved(p);
        set_page_count(p, 0);

        __free_pages_core_early(page, order);
}
We would only convert the callers that need this to __free_pages_core_early;
the other callers wouldn't be affected.
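
For illustration, a minimal sketch of what switching the callers over could
look like, assuming the early-boot path still enters through
memblock_free_pages() and the hotplug path through generic_online_page();
the bodies below are reduced to the relevant call, and the exact call sites
would have to be confirmed against the current tree:

/*
 * Early boot: the free ranges were initialized with a refcount of 0 and
 * without PG_reserved, so there is nothing to clear before handing the
 * pages to the buddy.
 */
void __init memblock_free_pages(struct page *page, unsigned long pfn,
                                unsigned int order)
{
        /* the early checks of the real function are elided here */
        __free_pages_core_early(page, order);
}

/*
 * Memory hotplug: the struct pages still come in with PG_reserved set and
 * a refcount of 1, so the full __free_pages_core() path is kept.
 */
void generic_online_page(struct page *page, unsigned int order)
{
        /* other bookkeeping of the real function is elided here */
        __free_pages_core(page, order);
}

Whether the hotplug path could eventually drop that loop as well would then
depend on the pending cleanups to stop relying on PageReserved there.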
>
>>
>> Anyhow, thanks for bringing up that topic; it reminded me that I
>> still have pending cleanups to not rely on PageReserved on the memory
>> hotplug path.
>>