When pages are freed in pageblock_order units, the time spent on
coalescing pages by the buddy allocator can be reduced. With a
section size of 256MB, the hot add latency of a single section
shows an improvement from 50-60 ms to less than 1 ms, hence
improving the hot add latency by 60%.
If this looks okay, I'll modify the users of set_online_page_callback
and resend a clean patch.
Signed-off-by: Arun KS <[email protected]>
---
include/linux/memory_hotplug.h | 1 +
mm/memory_hotplug.c | 52 ++++++++++++++++++++++++++++++++++++------
2 files changed, 46 insertions(+), 7 deletions(-)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 34a2822..447047d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -88,6 +88,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
extern void __offline_isolated_pages(unsigned long, unsigned long);
typedef void (*online_page_callback_t)(struct page *page);
+typedef int (*online_pages_callback_t)(struct page *page, unsigned int order);
extern int set_online_page_callback(online_page_callback_t callback);
extern int restore_online_page_callback(online_page_callback_t callback);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 38d94b7..853104d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -662,19 +662,57 @@ static void generic_online_page(struct page *page)
__online_page_free(page);
}
+static int generic_online_pages(struct page *page, unsigned int order);
+static online_pages_callback_t online_pages_callback = generic_online_pages;
+
+static int generic_online_pages(struct page *page, unsigned int order)
+{
+ unsigned long nr_pages = 1 << order;
+ struct page *p = page;
+ unsigned int loop;
+
+ for (loop = 0 ; loop < nr_pages ; loop++, p++) {
+ __ClearPageReserved(p);
+ set_page_count(p, 0);
+ }
+ adjust_managed_page_count(page, nr_pages);
+ init_page_count(page);
+ __free_pages(page, order);
+
+ return 0;
+}
+
+static int online_pages_blocks(unsigned long start_pfn, unsigned long nr_pages)
+{
+ unsigned long pages_per_block = (1 << pageblock_order);
+ unsigned long nr_pageblocks = nr_pages / pages_per_block;
+// unsigned long rem_pages = nr_pages % pages_per_block;
+ int i, ret, onlined_pages = 0;
+ struct page *page;
+
+ for (i = 0 ; i < nr_pageblocks ; i++) {
+ page = pfn_to_page(start_pfn + (i * pages_per_block));
+ ret = (*online_pages_callback)(page, pageblock_order);
+ if (!ret)
+ onlined_pages += pages_per_block;
+ else if (ret > 0)
+ onlined_pages += ret;
+ }
+/*
+ if (rem_pages)
+ onlined_pages += online_page_single(start_pfn + i, rem_pages);
+*/
+
+ return onlined_pages;
+}
+
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long i;
unsigned long onlined_pages = *(unsigned long *)arg;
- struct page *page;
if (PageReserved(pfn_to_page(start_pfn)))
- for (i = 0; i < nr_pages; i++) {
- page = pfn_to_page(start_pfn + i);
- (*online_page_callback)(page);
- onlined_pages++;
- }
+ onlined_pages = online_pages_blocks(start_pfn, nr_pages);
online_mem_sections(start_pfn, start_pfn + nr_pages);
--
1.9.1
On Wed 12-09-18 14:56:45, Arun KS wrote:
> When free pages are done with pageblock_order, time spend on
> coalescing pages by buddy allocator can be reduced. With
> section size of 256MB, hot add latency of a single section
> shows improvement from 50-60 ms to less than 1 ms, hence
> improving the hot add latency by 60%.
Where does the improvement come from? You are still doing the same
amount of work except that the number of callbacks is lower. Is this the
real source of 60% improvement?
>
> If this looks okey, I'll modify users of set_online_page_callback
> and resend clean patch.
[...]
> +static int generic_online_pages(struct page *page, unsigned int order);
> +static online_pages_callback_t online_pages_callback = generic_online_pages;
> +
> +static int generic_online_pages(struct page *page, unsigned int order)
> +{
> + unsigned long nr_pages = 1 << order;
> + struct page *p = page;
> + unsigned int loop;
> +
> + for (loop = 0 ; loop < nr_pages ; loop++, p++) {
> + __ClearPageReserved(p);
> + set_page_count(p, 0);
> + }
> + adjust_managed_page_count(page, nr_pages);
> + init_page_count(page);
> + __free_pages(page, order);
> +
> + return 0;
> +}
> +
> +static int online_pages_blocks(unsigned long start_pfn, unsigned long nr_pages)
> +{
> + unsigned long pages_per_block = (1 << pageblock_order);
> + unsigned long nr_pageblocks = nr_pages / pages_per_block;
> +// unsigned long rem_pages = nr_pages % pages_per_block;
> + int i, ret, onlined_pages = 0;
> + struct page *page;
> +
> + for (i = 0 ; i < nr_pageblocks ; i++) {
> + page = pfn_to_page(start_pfn + (i * pages_per_block));
> + ret = (*online_pages_callback)(page, pageblock_order);
> + if (!ret)
> + onlined_pages += pages_per_block;
> + else if (ret > 0)
> + onlined_pages += ret;
> + }
Could you explain why the pages_per_block step makes any sense? Why
don't you simply handle the full nr_pages worth of memory range
instead?
> +/*
> + if (rem_pages)
> + onlined_pages += online_page_single(start_pfn + i, rem_pages);
> +*/
> +
> + return onlined_pages;
> +}
> +
> static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
> void *arg)
> {
> - unsigned long i;
> unsigned long onlined_pages = *(unsigned long *)arg;
> - struct page *page;
>
> if (PageReserved(pfn_to_page(start_pfn)))
> - for (i = 0; i < nr_pages; i++) {
> - page = pfn_to_page(start_pfn + i);
> - (*online_page_callback)(page);
> - onlined_pages++;
> - }
> + onlined_pages = online_pages_blocks(start_pfn, nr_pages);
>
> online_mem_sections(start_pfn, start_pfn + nr_pages);
>
> --
> 1.9.1
>
--
Michal Hocko
SUSE Labs
On Wed, Sep 12, 2018 at 12:38:53PM +0200, Michal Hocko wrote:
> On Wed 12-09-18 14:56:45, Arun KS wrote:
> > When free pages are done with pageblock_order, time spend on
> > coalescing pages by buddy allocator can be reduced. With
> > section size of 256MB, hot add latency of a single section
> > shows improvement from 50-60 ms to less than 1 ms, hence
> > improving the hot add latency by 60%.
>
> Where does the improvement come from? You are still doing the same
> amount of work except that the number of callbacks is lower. Is this the
> real source of 60% improvement?
>
It looks like only the first page of the pageblock is initialized. Is
some of the cost amortized by doing one initialization for
the page with order (order) and then relying on split_page and helpers
to do the rest? Of course, the number of callbacks is reduced by a
significant amount as well.
> >
> > If this looks okey, I'll modify users of set_online_page_callback
> > and resend clean patch.
>
> [...]
>
> > +static int generic_online_pages(struct page *page, unsigned int order);
> > +static online_pages_callback_t online_pages_callback = generic_online_pages;
> > +
> > +static int generic_online_pages(struct page *page, unsigned int order)
> > +{
> > + unsigned long nr_pages = 1 << order;
> > + struct page *p = page;
> > + unsigned int loop;
> > +
> > + for (loop = 0 ; loop < nr_pages ; loop++, p++) {
> > + __ClearPageReserved(p);
> > + set_page_count(p, 0);
> > + }
> > + adjust_managed_page_count(page, nr_pages);
> > + init_page_count(page);
> > + __free_pages(page, order);
> > +
> > + return 0;
> > +}
> > +
> > +static int online_pages_blocks(unsigned long start_pfn, unsigned long nr_pages)
> > +{
> > + unsigned long pages_per_block = (1 << pageblock_order);
> > + unsigned long nr_pageblocks = nr_pages / pages_per_block;
> > +// unsigned long rem_pages = nr_pages % pages_per_block;
> > + int i, ret, onlined_pages = 0;
> > + struct page *page;
> > +
> > + for (i = 0 ; i < nr_pageblocks ; i++) {
> > + page = pfn_to_page(start_pfn + (i * pages_per_block));
> > + ret = (*online_pages_callback)(page, pageblock_order);
> > + if (!ret)
> > + onlined_pages += pages_per_block;
> > + else if (ret > 0)
> > + onlined_pages += ret;
> > + }
>
> Could you explain why does the pages_per_block step makes any sense? Why
> don't you simply apply handle the full nr_pages worth of memory range
> instead?
>
> > +/*
> > + if (rem_pages)
> > + onlined_pages += online_page_single(start_pfn + i, rem_pages);
> > +*/
Do we expect no rem_pages with this patch?
> > +
> > + return onlined_pages;
> > +}
> > +
> > static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
> > void *arg)
> > {
> > - unsigned long i;
> > unsigned long onlined_pages = *(unsigned long *)arg;
> > - struct page *page;
> >
> > if (PageReserved(pfn_to_page(start_pfn)))
> > - for (i = 0; i < nr_pages; i++) {
> > - page = pfn_to_page(start_pfn + i);
> > - (*online_page_callback)(page);
> > - onlined_pages++;
> > - }
> > + onlined_pages = online_pages_blocks(start_pfn, nr_pages);
> >
> > online_mem_sections(start_pfn, start_pfn + nr_pages);
Balbir Singh.
On Wed 12-09-18 22:57:43, Balbir Singh wrote:
> On Wed, Sep 12, 2018 at 12:38:53PM +0200, Michal Hocko wrote:
> > On Wed 12-09-18 14:56:45, Arun KS wrote:
> > > When free pages are done with pageblock_order, time spend on
> > > coalescing pages by buddy allocator can be reduced. With
> > > section size of 256MB, hot add latency of a single section
> > > shows improvement from 50-60 ms to less than 1 ms, hence
> > > improving the hot add latency by 60%.
> >
> > Where does the improvement come from? You are still doing the same
> > amount of work except that the number of callbacks is lower. Is this the
> > real source of 60% improvement?
> >
>
> It looks like only the first page of the pageblock is initialized, is
> some of the cost amortized in terms of doing one initialization for
> the page with order (order) and then relying on split_page and helpers
> to do the rest? Of course the number of callbacks reduce by a significant
> number as well.
Ohh, I have missed that part. Now when re-reading I can see the reason
for the perf improvement. It is most likely the higher order free which
ends up being much cheaper. This part makes some sense.
How feasible this is, is another question. Do not forget we have
those external providers of the online callback and those would need to
be updated as well.
Btw. the normal memmap init code path does the same per-page free as
well. If we really want to speed up the hotplug path then I guess the init
one would see a bigger improvement and those two should be in sync.
> > >
> > > If this looks okey, I'll modify users of set_online_page_callback
> > > and resend clean patch.
> >
> > [...]
> >
> > > +static int generic_online_pages(struct page *page, unsigned int order);
> > > +static online_pages_callback_t online_pages_callback = generic_online_pages;
> > > +
> > > +static int generic_online_pages(struct page *page, unsigned int order)
> > > +{
> > > + unsigned long nr_pages = 1 << order;
> > > + struct page *p = page;
> > > + unsigned int loop;
> > > +
> > > + for (loop = 0 ; loop < nr_pages ; loop++, p++) {
> > > + __ClearPageReserved(p);
> > > + set_page_count(p, 0);
btw. you want init_page_count here.
--
Michal Hocko
SUSE Labs
Hello Michal and Balbir,
Thanks for reviewing.
On 2018-09-12 18:27, Balbir Singh wrote:
> On Wed, Sep 12, 2018 at 12:38:53PM +0200, Michal Hocko wrote:
>> On Wed 12-09-18 14:56:45, Arun KS wrote:
>> > When free pages are done with pageblock_order, time spend on
>> > coalescing pages by buddy allocator can be reduced. With
>> > section size of 256MB, hot add latency of a single section
>> > shows improvement from 50-60 ms to less than 1 ms, hence
>> > improving the hot add latency by 60%.
>>
>> Where does the improvement come from? You are still doing the same
>> amount of work except that the number of callbacks is lower. Is this
>> the
>> real source of 60% improvement?
>>
>
> It looks like only the first page of the pageblock is initialized, is
> some of the cost amortized in terms of doing one initialization for
> the page with order (order) and then relying on split_page and helpers
> to do the rest? Of course the number of callbacks reduce by a
> significant
> number as well.
Currently, order-zero pages are freed one by one. They go to the pcp list
and later, when pcp->count >= pcp->high, the kernel calls __free_one_page()
in a loop. __free_one_page() tries to merge these pages to create a
higher-order page.
But when we free a higher-order page (pageblock_order), this merging
is not done. AFAIU, this is the reason for the improvement in hot add
latency.
>
>
>> >
>> > If this looks okey, I'll modify users of set_online_page_callback
>> > and resend clean patch.
>>
>> [...]
>>
>> > +static int generic_online_pages(struct page *page, unsigned int order);
>> > +static online_pages_callback_t online_pages_callback = generic_online_pages;
>> > +
>> > +static int generic_online_pages(struct page *page, unsigned int order)
>> > +{
>> > + unsigned long nr_pages = 1 << order;
>> > + struct page *p = page;
>> > + unsigned int loop;
>> > +
>> > + for (loop = 0 ; loop < nr_pages ; loop++, p++) {
>> > + __ClearPageReserved(p);
>> > + set_page_count(p, 0);
>> > + }
>> > + adjust_managed_page_count(page, nr_pages);
>> > + init_page_count(page);
>> > + __free_pages(page, order);
>> > +
>> > + return 0;
>> > +}
>> > +
>> > +static int online_pages_blocks(unsigned long start_pfn, unsigned long nr_pages)
>> > +{
>> > + unsigned long pages_per_block = (1 << pageblock_order);
>> > + unsigned long nr_pageblocks = nr_pages / pages_per_block;
>> > +// unsigned long rem_pages = nr_pages % pages_per_block;
>> > + int i, ret, onlined_pages = 0;
>> > + struct page *page;
>> > +
>> > + for (i = 0 ; i < nr_pageblocks ; i++) {
>> > + page = pfn_to_page(start_pfn + (i * pages_per_block));
>> > + ret = (*online_pages_callback)(page, pageblock_order);
>> > + if (!ret)
>> > + onlined_pages += pages_per_block;
>> > + else if (ret > 0)
>> > + onlined_pages += ret;
>> > + }
>>
>> Could you explain why does the pages_per_block step makes any sense?
>> Why
>> don't you simply apply handle the full nr_pages worth of memory range
>> instead?
Yes. We can move this loop to generic_online_pages and do
__free_pages() of pageblock_order.
>>
>> > +/*
>> > + if (rem_pages)
>> > + onlined_pages += online_page_single(start_pfn + i, rem_pages);
>> > +*/
>
> Do we expect no rem_pages with this patch?
I'll remove this code, on the assumption that the section size will
always be a multiple of the pageblock size.
Regards,
Arun
>
>> > +
>> > + return onlined_pages;
>> > +}
>> > +
>> > static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
>> > void *arg)
>> > {
>> > - unsigned long i;
>> > unsigned long onlined_pages = *(unsigned long *)arg;
>> > - struct page *page;
>> >
>> > if (PageReserved(pfn_to_page(start_pfn)))
>> > - for (i = 0; i < nr_pages; i++) {
>> > - page = pfn_to_page(start_pfn + i);
>> > - (*online_page_callback)(page);
>> > - onlined_pages++;
>> > - }
>> > + onlined_pages = online_pages_blocks(start_pfn, nr_pages);
>> >
>> > online_mem_sections(start_pfn, start_pfn + nr_pages);
>
>
> Balbir Singh.
On 2018-09-12 18:47, Michal Hocko wrote:
> On Wed 12-09-18 22:57:43, Balbir Singh wrote:
>> On Wed, Sep 12, 2018 at 12:38:53PM +0200, Michal Hocko wrote:
>> > On Wed 12-09-18 14:56:45, Arun KS wrote:
>> > > When free pages are done with pageblock_order, time spend on
>> > > coalescing pages by buddy allocator can be reduced. With
>> > > section size of 256MB, hot add latency of a single section
>> > > shows improvement from 50-60 ms to less than 1 ms, hence
>> > > improving the hot add latency by 60%.
>> >
>> > Where does the improvement come from? You are still doing the same
>> > amount of work except that the number of callbacks is lower. Is this the
>> > real source of 60% improvement?
>> >
>>
>> It looks like only the first page of the pageblock is initialized, is
>> some of the cost amortized in terms of doing one initialization for
>> the page with order (order) and then relying on split_page and helpers
>> to do the rest? Of course the number of callbacks reduce by a
>> significant
>> number as well.
>
> Ohh, I have missed that part. Now when re-reading I can see the reason
> for the perf improvement. It is most likely the higher order free which
> ends up being much cheaper. This part makes some sense.
>
> How much is this feasible is another question. Do not forget we have
> those external providers of the online callback and those would need to
> be updated as well.
Sure Michal, I'll look into this.
>
> Btw. the normal memmap init code path does the same per-page free as
> well. If we really want to speed the hotplug path then I guess the init
> one would see a bigger improvement and those two should be in sync.
Thanks for the pointers, will look further.
>
>> > >
>> > > If this looks okey, I'll modify users of set_online_page_callback
>> > > and resend clean patch.
>> >
>> > [...]
>> >
>> > > +static int generic_online_pages(struct page *page, unsigned int order);
>> > > +static online_pages_callback_t online_pages_callback = generic_online_pages;
>> > > +
>> > > +static int generic_online_pages(struct page *page, unsigned int order)
>> > > +{
>> > > + unsigned long nr_pages = 1 << order;
>> > > + struct page *p = page;
>> > > + unsigned int loop;
>> > > +
>> > > + for (loop = 0 ; loop < nr_pages ; loop++, p++) {
>> > > + __ClearPageReserved(p);
>> > > + set_page_count(p, 0);
>
> btw. you want init_page_count here.
Do you mean replace set_page_count(p, 0) with init_page_count(page)?
Because init_page_count is setting the page _refcount to 1
static inline void init_page_count(struct page *page)
{
set_page_count(page, 1);
}
I thought that in the case of higher-order pages only the first struct page
should have its _refcount set to 1 before calling __free_pages(). Please
correct me if I am wrong.
Regards,
Arun
On Wed 12-09-18 20:12:30, Arun KS wrote:
> On 2018-09-12 18:47, Michal Hocko wrote:
> > On Wed 12-09-18 22:57:43, Balbir Singh wrote:
> > > On Wed, Sep 12, 2018 at 12:38:53PM +0200, Michal Hocko wrote:
> > > > On Wed 12-09-18 14:56:45, Arun KS wrote:
> > > > > When free pages are done with pageblock_order, time spend on
> > > > > coalescing pages by buddy allocator can be reduced. With
> > > > > section size of 256MB, hot add latency of a single section
> > > > > shows improvement from 50-60 ms to less than 1 ms, hence
> > > > > improving the hot add latency by 60%.
> > > >
> > > > Where does the improvement come from? You are still doing the same
> > > > amount of work except that the number of callbacks is lower. Is this the
> > > > real source of 60% improvement?
> > > >
> > >
> > > It looks like only the first page of the pageblock is initialized, is
> > > some of the cost amortized in terms of doing one initialization for
> > > the page with order (order) and then relying on split_page and helpers
> > > to do the rest? Of course the number of callbacks reduce by a
> > > significant
> > > number as well.
> >
> > Ohh, I have missed that part. Now when re-reading I can see the reason
> > for the perf improvement. It is most likely the higher order free which
> > ends up being much cheaper. This part makes some sense.
> >
> > How much is this feasible is another question. Do not forget we have
> > those external providers of the online callback and those would need to
> > be updated as well.
> Sure Michal, I ll look into this.
>
> >
> > Btw. the normal memmap init code path does the same per-page free as
> > well. If we really want to speed the hotplug path then I guess the init
> > one would see a bigger improvement and those two should be in sync.
> Thanks for pointers, Will look further.
I haven't looked closer and I will be travelling next week so just a hint.
Have a look at the nobootmem and how it frees pages to the page
allocator in __free_pages_boot_core. Seems exactly what you want and it
also answers your question about reference counting.
--
Michal Hocko
SUSE Labs
On 2018-09-14 14:40, Michal Hocko wrote:
> On Wed 12-09-18 20:12:30, Arun KS wrote:
>> On 2018-09-12 18:47, Michal Hocko wrote:
>> > On Wed 12-09-18 22:57:43, Balbir Singh wrote:
>> > > On Wed, Sep 12, 2018 at 12:38:53PM +0200, Michal Hocko wrote:
>> > > > On Wed 12-09-18 14:56:45, Arun KS wrote:
>> > > > > When free pages are done with pageblock_order, time spend on
>> > > > > coalescing pages by buddy allocator can be reduced. With
>> > > > > section size of 256MB, hot add latency of a single section
>> > > > > shows improvement from 50-60 ms to less than 1 ms, hence
>> > > > > improving the hot add latency by 60%.
>> > > >
>> > > > Where does the improvement come from? You are still doing the same
>> > > > amount of work except that the number of callbacks is lower. Is this the
>> > > > real source of 60% improvement?
>> > > >
>> > >
>> > > It looks like only the first page of the pageblock is initialized, is
>> > > some of the cost amortized in terms of doing one initialization for
>> > > the page with order (order) and then relying on split_page and helpers
>> > > to do the rest? Of course the number of callbacks reduce by a
>> > > significant
>> > > number as well.
>> >
>> > Ohh, I have missed that part. Now when re-reading I can see the reason
>> > for the perf improvement. It is most likely the higher order free which
>> > ends up being much cheaper. This part makes some sense.
>> >
>> > How much is this feasible is another question. Do not forget we have
>> > those external providers of the online callback and those would need to
>> > be updated as well.
>> Sure Michal, I ll look into this.
>>
>> >
>> > Btw. the normal memmap init code path does the same per-page free as
>> > well. If we really want to speed the hotplug path then I guess the init
>> > one would see a bigger improvement and those two should be in sync.
>> Thanks for pointers, Will look further.
>
> I haven't looked closer and I will be travelling next week so just
> hint.
> Have a look at the nobootmem and how it frees pages to the page
> allocator in __free_pages_boot_core. Seems exactly what you want and it
> also answers your question about reference counting.
Thanks Michal. Will send a new version after testing.
Regards,
Arun