This change makes populate_section_memmap()/depopulate_section_memmap()
much simpler.
Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: Baoquan He <[email protected]>
---
v1->v2:
The old version only replaced alloc_pages() with __get_free_pages()
in populate_section_memmap().
http://lkml.kernel.org/r/[email protected]
mm/sparse.c | 27 +++------------------------
1 file changed, 3 insertions(+), 24 deletions(-)
diff --git a/mm/sparse.c b/mm/sparse.c
index bf6c00a28045..362018e82e22 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -734,35 +734,14 @@ static void free_map_bootmem(struct page *memmap)
struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
- struct page *page, *ret;
- unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
-
- page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
- if (page)
- goto got_map_page;
-
- ret = vmalloc(memmap_size);
- if (ret)
- goto got_map_ptr;
-
- return NULL;
-got_map_page:
- ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
-got_map_ptr:
-
- return ret;
+ return kvmalloc_node(sizeof(struct page) * PAGES_PER_SECTION,
+ GFP_KERNEL|__GFP_NOWARN, nid);
}
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- struct page *memmap = pfn_to_page(pfn);
-
- if (is_vmalloc_addr(memmap))
- vfree(memmap);
- else
- free_pages((unsigned long)memmap,
- get_order(sizeof(struct page) * PAGES_PER_SECTION));
+ kvfree(pfn_to_page(pfn));
}
static void free_map_bootmem(struct page *memmap)
--
2.17.2
On Thu, Mar 12, 2020 at 09:08:22PM +0800, Baoquan He wrote:
> This change makes populate_section_memmap()/depopulate_section_memmap()
> much simpler.
> [...]
> + return kvmalloc_node(sizeof(struct page) * PAGES_PER_SECTION,
> + GFP_KERNEL|__GFP_NOWARN, nid);
Use of NOWARN here is inappropriate, because there's no fallback.
Also, I'd use array_size(sizeof(struct page), PAGES_PER_SECTION).
On 03/12/20 at 06:34am, Matthew Wilcox wrote:
> On Thu, Mar 12, 2020 at 09:08:22PM +0800, Baoquan He wrote:
> > This change makes populate_section_memmap()/depopulate_section_memmap()
> > much simpler.
> > [...]
> > + return kvmalloc_node(sizeof(struct page) * PAGES_PER_SECTION,
> > + GFP_KERNEL|__GFP_NOWARN, nid);
>
> Use of NOWARN here is inappropriate, because there's no fallback.
kvmalloc_node() already adds __GFP_NOWARN internally when it tries to
allocate contiguous pages. I will remove it.
> Also, I'd use array_size(sizeof(struct page), PAGES_PER_SECTION).
That's fine with me, even though we know this multiplication has no risk
of overflowing. I will use array_size(). Thanks.
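For reference, array_size() is the overflow-checked multiply from
include/linux/overflow.h; roughly (a simplified sketch, not the exact header):

static inline size_t array_size(size_t a, size_t b)
{
	size_t bytes;

	/* saturate to SIZE_MAX on overflow so any allocator will refuse it */
	if (check_mul_overflow(a, b, &bytes))
		return SIZE_MAX;
	return bytes;
}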
This change makes populate_section_memmap()/depopulate_section_memmap()
much simpler.
Suggested-by: Michal Hocko <[email protected]>
Signed-off-by: Baoquan He <[email protected]>
---
v2->v3:
Remove __GFP_NOWARN and use array_size when calling kvmalloc_node()
per Matthew's comments.
mm/sparse.c | 27 +++------------------------
1 file changed, 3 insertions(+), 24 deletions(-)
diff --git a/mm/sparse.c b/mm/sparse.c
index bf6c00a28045..bb99633575b5 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -734,35 +734,14 @@ static void free_map_bootmem(struct page *memmap)
struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
- struct page *page, *ret;
- unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
-
- page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
- if (page)
- goto got_map_page;
-
- ret = vmalloc(memmap_size);
- if (ret)
- goto got_map_ptr;
-
- return NULL;
-got_map_page:
- ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
-got_map_ptr:
-
- return ret;
+ return kvmalloc_node(array_size(sizeof(struct page),
+ PAGES_PER_SECTION), GFP_KERNEL, nid);
}
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- struct page *memmap = pfn_to_page(pfn);
-
- if (is_vmalloc_addr(memmap))
- vfree(memmap);
- else
- free_pages((unsigned long)memmap,
- get_order(sizeof(struct page) * PAGES_PER_SECTION));
+ kvfree(pfn_to_page(pfn));
}
static void free_map_bootmem(struct page *memmap)
--
2.17.2
On Thu, Mar 12, 2020 at 06:34:16AM -0700, Matthew Wilcox wrote:
>On Thu, Mar 12, 2020 at 09:08:22PM +0800, Baoquan He wrote:
>> This change makes populate_section_memmap()/depopulate_section_memmap()
>> much simpler.
>> [...]
>> + return kvmalloc_node(sizeof(struct page) * PAGES_PER_SECTION,
>> + GFP_KERNEL|__GFP_NOWARN, nid);
>
>Use of NOWARN here is inappropriate, because there's no fallback.
Hmm... this replacement is a little tricky.
When you look into kvmalloc_node(), it will do the fallback only if the size is
bigger than PAGE_SIZE. This means the change here may not be equivalent to the
old behavior if memmap_size is less than PAGE_SIZE.
For example:
PAGE_SIZE = 64K
SECTION_SIZE = 128M
would lead to memmap_size = 2K, which is less than PAGE_SIZE.
Not sure whether this combination can happen?
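For reference, the decision in kvmalloc_node() looks roughly like this (a
simplified sketch of the mm/util.c logic, not the exact code):

void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
	gfp_t kmalloc_flags = flags;
	void *ret;

	/* large requests get NOWARN/NORETRY because vmalloc is the fallback */
	if (size > PAGE_SIZE) {
		kmalloc_flags |= __GFP_NOWARN;
		if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
			kmalloc_flags |= __GFP_NORETRY;
	}

	ret = kmalloc_node(size, kmalloc_flags, node);

	/* no vmalloc fallback for sub-page requests */
	if (ret || size <= PAGE_SIZE)
		return ret;

	return __vmalloc_node_flags_caller(size, node, flags,
					   __builtin_return_address(0));
}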
>Also, I'd use array_size(sizeof(struct page), PAGES_PER_SECTION).
--
Wei Yang
Help you, Help me
On Thu, Mar 12, 2020 at 02:18:26PM +0000, Wei Yang wrote:
> On Thu, Mar 12, 2020 at 06:34:16AM -0700, Matthew Wilcox wrote:
> >On Thu, Mar 12, 2020 at 09:08:22PM +0800, Baoquan He wrote:
> >> This change makes populate_section_memmap()/depopulate_section_memmap()
> >> much simpler.
> >> [...]
> >> + return kvmalloc_node(sizeof(struct page) * PAGES_PER_SECTION,
> >> + GFP_KERNEL|__GFP_NOWARN, nid);
> >
> >Use of NOWARN here is inappropriate, because there's no fallback.
>
> Hmm... this replacement is a little tricky.
>
> When you look into kvmalloc_node(), it will do the fallback only if the size is
> bigger than PAGE_SIZE. This means the change here may not be equivalent to the
> old behavior if memmap_size is less than PAGE_SIZE.
>
> For example:
> PAGE_SIZE = 64K
> SECTION_SIZE = 128M
>
> would lead to memmap_size = 2K, which is less than PAGE_SIZE.
Yes, I thought about that. I decided it wasn't a problem, as long as
the struct page remains aligned, and we now have a guarantee that allocations
above 512 bytes in size are aligned. With a 64 byte struct page, as long
as we're allocating at least 8 pages, we know it'll be naturally aligned.
Your calculation doesn't take into account the size of struct page.
128M / 64k is indeed 2k, but you forgot to multiply by 64, which takes
us to 128kB.
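Spelled out, assuming a 64-byte struct page:

/*
 * PAGES_PER_SECTION = 128M / 64K = 2048
 * memmap_size = 2048 * sizeof(struct page) = 2048 * 64 = 128K,
 * i.e. well above a 64K PAGE_SIZE, so the vmalloc fallback in
 * kvmalloc_node() is still reachable in this configuration.
 */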
On Thu, Mar 12, 2020 at 07:25:35AM -0700, Matthew Wilcox wrote:
>On Thu, Mar 12, 2020 at 02:18:26PM +0000, Wei Yang wrote:
>> On Thu, Mar 12, 2020 at 06:34:16AM -0700, Matthew Wilcox wrote:
>> >On Thu, Mar 12, 2020 at 09:08:22PM +0800, Baoquan He wrote:
>> >> This change makes populate_section_memmap()/depopulate_section_memmap()
>> >> much simpler.
>> >> [...]
>> >> + return kvmalloc_node(sizeof(struct page) * PAGES_PER_SECTION,
>> >> + GFP_KERNEL|__GFP_NOWARN, nid);
>> >
>> >Use of NOWARN here is inappropriate, because there's no fallback.
>>
>> Hmm... this replacement is a little tricky.
>>
>> When you look into kvmalloc_node(), it will do the fallback only if the size is
>> bigger than PAGE_SIZE. This means the change here may not be equivalent to the
>> old behavior if memmap_size is less than PAGE_SIZE.
>>
>> For example:
>> PAGE_SIZE = 64K
>> SECTION_SIZE = 128M
>>
>> would lead to memmap_size = 2K, which is less than PAGE_SIZE.
>
>Yes, I thought about that. I decided it wasn't a problem, as long as
>the struct page remains aligned, and we now have a guarantee that allocations
>above 512 bytes in size are aligned. With a 64 byte struct page, as long
Where does this 512-byte condition come from?
>as we're allocating at least 8 pages, we know it'll be naturally aligned.
>
>Your calculation doesn't take into account the size of struct page.
>128M / 64k is indeed 2k, but you forgot to multiply by 64, which takes
>us to 128kB.
You are right. But could there be other combinations, now or in the future?
For example, there are definitions of
#define SECTION_SIZE_BITS 26
#define SECTION_SIZE_BITS 24
Are we sure it won't break something?
--
Wei Yang
Help you, Help me
On Thu, Mar 12, 2020 at 10:50:55PM +0000, Wei Yang wrote:
> On Thu, Mar 12, 2020 at 07:25:35AM -0700, Matthew Wilcox wrote:
> >Yes, I thought about that. I decided it wasn't a problem, as long as
> >the struct page remains aligned, and we now have a guarantee that allocations
> >above 512 bytes in size are aligned. With a 64 byte struct page, as long
>
> Where does this 512-byte condition come from?
Filesystems need to do I/Os from kmalloc addresses and those I/Os need to
be 512 byte aligned.
> >as we're allocating at least 8 pages, we know it'll be naturally aligned.
> >
> >Your calculation doesn't take into account the size of struct page.
> >128M / 64k is indeed 2k, but you forgot to multiply by 64, which takes
> >us to 128kB.
>
> You are right. But could there be other combinations, now or in the future?
>
> For example, there are definitions of
>
> #define SECTION_SIZE_BITS 26
> #define SECTION_SIZE_BITS 24
>
> Are we sure it won't break something?
As I said, once it's at least 512 bytes, it'll be 512 byte aligned. And I
can't see us having sections smaller than 8 pages, can you?
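With a 64-byte struct page, that floor lands exactly on the threshold:

/*
 * 8 pages * 64 bytes per struct page = 512 bytes of memmap,
 * so even the smallest plausible section reaches the 512-byte
 * boundary discussed above.
 */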
On 13.03.20 01:00, Matthew Wilcox wrote:
> [...]
> As I said, once it's at least 512 bytes, it'll be 512 byte aligned. And I
> can't see us having sections smaller than 8 pages, can you?
>
The smallest I am aware of is indeed the special case of 16MB sections on PPC.
(If my math is correct, a 16MB section on PPC with 64k pages needs 16k
of memmap, which is less than a 64k page, and thus quite some memory is
wasted - but maybe my friday-afternoon math is just wrong.)
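For the record, the math checks out, assuming a 64-byte struct page:

/*
 * 16M section / 64K page size = 256 pages per section
 * 256 * 64 bytes per struct page = 16K of memmap, well below a 64K
 * page; the old alloc_pages(..., get_order(16K)) path still consumed
 * a whole 64K page, while kmalloc(16K) can pack several such objects
 * into one slab page.
 */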
--
Thanks,
David / dhildenb
On 3/13/20 1:00 AM, Matthew Wilcox wrote:
> On Thu, Mar 12, 2020 at 10:50:55PM +0000, Wei Yang wrote:
>> On Thu, Mar 12, 2020 at 07:25:35AM -0700, Matthew Wilcox wrote:
>> >Yes, I thought about that. I decided it wasn't a problem, as long as
>> >the struct page remains aligned, and we now have a guarantee that allocations
>> >above 512 bytes in size are aligned. With a 64 byte struct page, as long
>>
>> Where does this 512-byte condition come from?
>
> Filesystems need to do I/Os from kmalloc addresses and those I/Os need to
> be 512 byte aligned.
To clarify, the guarantee exists for every power-of-two size. The I/O use case
was part of the motivation for the guarantee, but there is no 512-byte limit.
That also means there is no guarantee for a non-power-of-two size above (or
below) 512 bytes. Currently this only matters for sizes that fall into the
96-byte or 192-byte caches. With SLOB it can be any size.
So what I'm saying is that the allocations should make sure they are a power
of two, and then they will be aligned. The struct page size of 64 bytes
depends on some debugging options being disabled, right?
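A hypothetical illustration, assuming SLAB or SLUB with the natural-alignment
guarantee in place:

void *a = kmalloc(512, GFP_KERNEL);	/* power of two: 512-byte aligned */
void *b = kmalloc(4096, GFP_KERNEL);	/* power of two: 4096-byte aligned */
void *c = kmalloc(96, GFP_KERNEL);	/* 96-byte cache: only ARCH_KMALLOC_MINALIGN guaranteed */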
On Thu 12-03-20 22:17:49, Baoquan He wrote:
> This change makes populate_section_memmap()/depopulate_section_memmap()
> much simpler.
Not only, and you should make it more explicit: it also tries to allocate
memmaps from the target NUMA node, so this is a functional change. I
would prefer to have that in a separate patch in case we hit some weird
NUMA setups which would choke on memory-less nodes and similar horrors.
> Suggested-by: Michal Hocko <[email protected]>
> Signed-off-by: Baoquan He <[email protected]>
I do not see any reason this shouldn't work. Btw. did you get to test
it?
Feel free to add
Acked-by: Michal Hocko <[email protected]>
to both patches if you go and split.
> [...]
--
Michal Hocko
SUSE Labs
On Thu 12-03-20 14:18:26, Wei Yang wrote:
> On Thu, Mar 12, 2020 at 06:34:16AM -0700, Matthew Wilcox wrote:
> >On Thu, Mar 12, 2020 at 09:08:22PM +0800, Baoquan He wrote:
> >> This change makes populate_section_memmap()/depopulate_section_memmap()
> >> much simpler.
> >> [...]
> >> + return kvmalloc_node(sizeof(struct page) * PAGES_PER_SECTION,
> >> + GFP_KERNEL|__GFP_NOWARN, nid);
> >
> >Use of NOWARN here is inappropriate, because there's no fallback.
>
> Hmm... this replacement is a little tricky.
>
> When you look into kvmalloc_node(), it will do the fallback only if the size is
> bigger than PAGE_SIZE. This means the change here may not be equivalent to the
> old behavior if memmap_size is less than PAGE_SIZE.
I do not understand your concern to be honest. Even if a sub-page memmap
size were possible (I haven't checked), I fail to see why kmalloc would
fail to allocate while vmalloc would have a bigger chance to succeed.
--
Michal Hocko
SUSE Labs
On 12.03.20 15:17, Baoquan He wrote:
> This change makes populate_section_memmap()/depopulate_section_memmap()
> much simpler.
> [...]
> + return kvmalloc_node(array_size(sizeof(struct page),
> + PAGES_PER_SECTION), GFP_KERNEL, nid);
The indentation of the parameters looks wrong/weird. Maybe just calculate
memmap_size outside of the call; that makes it easier to read IMHO.
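Something like this, maybe (an untested sketch, not a posted patch):

struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	size_t memmap_size = array_size(sizeof(struct page),
					PAGES_PER_SECTION);

	return kvmalloc_node(memmap_size, GFP_KERNEL, nid);
}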
Apart from that, looks good to me.
Reviewed-by: David Hildenbrand <[email protected]>
--
Thanks,
David / dhildenb
On Fri, Mar 13, 2020 at 03:57:33PM +0100, Michal Hocko wrote:
>On Thu 12-03-20 14:18:26, Wei Yang wrote:
>> On Thu, Mar 12, 2020 at 06:34:16AM -0700, Matthew Wilcox wrote:
>> >On Thu, Mar 12, 2020 at 09:08:22PM +0800, Baoquan He wrote:
>> >> This change makes populate_section_memmap()/depopulate_section_memmap()
>> >> much simpler.
>> >> [...]
>> >> + return kvmalloc_node(sizeof(struct page) * PAGES_PER_SECTION,
>> >> + GFP_KERNEL|__GFP_NOWARN, nid);
>> >
>> >Use of NOWARN here is inappropriate, because there's no fallback.
>>
>> Hmm... this replacement is a little tricky.
>>
>> When you look into kvmalloc_node(), it will do the fallback only if the size is
>> bigger than PAGE_SIZE. This means the change here may not be equivalent to the
>> old behavior if memmap_size is less than PAGE_SIZE.
>
>I do not understand your concern to be honest. Even if a sub-page memmap
>size were possible (I haven't checked), I fail to see why kmalloc would
>fail to allocate while vmalloc would have a bigger chance to succeed.
>
You are right. My concern is just at kvmalloc_node() level.
After I look deep into the implementation, __vmalloc_area_node(), it will
still allocate memory on PAGE_SIZE base and fill up kernel page table.
That's why kvmalloc_node() stops trying when size is less than PAGE_SIZE.
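Roughly, the core of it does (a simplified sketch, not the exact code):

	for (i = 0; i < area->nr_pages; i++) {
		struct page *page;

		/* one full page per iteration: no sub-page granularity */
		page = alloc_page(gfp_mask);
		if (!page)
			goto fail;
		area->pages[i] = page;
	}
	/* the pages are then mapped into the vmalloc range via the kernel page table */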
Learned one more point. Thanks
>--
>Michal Hocko
>SUSE Labs
--
Wei Yang
Help you, Help me
On 03/13/20 at 03:56pm, Michal Hocko wrote:
> On Thu 12-03-20 22:17:49, Baoquan He wrote:
> > This change makes populate_section_memmap()/depopulate_section_memmap()
> > much simpler.
>
> Not only, and you should make it more explicit: it also tries to allocate
> memmaps from the target NUMA node, so this is a functional change. I
> would prefer to have that in a separate patch in case we hit some weird
> NUMA setups which would choke on memory-less nodes and similar horrors.
Yes, splitting sounds more reasonable; I would love to do that. One
question: I noticed Andrew has picked this into the -mm tree. If I post a
new patchset containing these two small patches, is it convenient to
drop the old one and merge these two instead?
Sorry, I don't know very well how this works in -mm maintenance.
On 03/13/20 at 03:56pm, Michal Hocko wrote:
> On Thu 12-03-20 22:17:49, Baoquan He wrote:
> > [...]
>
> I do not see any reason this shouldn't work. Btw. did you get to test
> it?
Forgot to reply to this comment. Yes, I have tested it before each post.
On Sat 14-03-20 08:53:34, Baoquan He wrote:
> On 03/13/20 at 03:56pm, Michal Hocko wrote:
> > [...]
>
> > Yes, splitting sounds more reasonable; I would love to do that. One
> > question: I noticed Andrew has picked this into the -mm tree. If I post a
> > new patchset containing these two small patches, is it convenient to
> > drop the old one and merge these two instead?
Andrew usually just drops the previous version and replaces it by the
new one. So just post a new version. Thanks!
--
Michal Hocko
SUSE Labs
On 03/14/20 at 01:56pm, Michal Hocko wrote:
> On Sat 14-03-20 08:53:34, Baoquan He wrote:
> > [...]
> >
> > Yes, splitting sounds more reasonable; I would love to do that. One
> > question: I noticed Andrew has picked this into the -mm tree. If I post a
> > new patchset containing these two small patches, is it convenient to
> > drop the old one and merge these two instead?
>
> Andrew usually just drops the previous version and replaces it by the
> new one. So just post a new version. Thanks!
I see, will post a new version, thanks.
On 03/13/20 at 04:04pm, David Hildenbrand wrote:
> On 12.03.20 15:17, Baoquan He wrote:
> > This change makes populate_section_memmap()/depopulate_section_memmap()
> > much simpler.
> > [...]
> > + return kvmalloc_node(array_size(sizeof(struct page),
> > + PAGES_PER_SECTION), GFP_KERNEL, nid);
>
>
> The indentation of the parameters looks wrong/weird. Maybe just calculate
> memmap_size outside of the call; that makes it easier to read IMHO.
I'll fix the indentation issue. Adding a memmap_size variable seems not
really necessary.
>
> Apart from that, looks good to me.
>
> Reviewed-by: David Hildenbrand <[email protected]>
>
> --
> Thanks,
>
> David / dhildenb