PMD sharing for hugetlb mappings has been present for quite some time.
However, specific conditions must be met for mappings to be shared.
One of those conditions is that the mapping must cover all of the
huge pages that can be mapped by a PUD. To facilitate this, the
mapping should be PUD_SIZE aligned. Today, the only way for a user to
get PUD_SIZE alignment is to pass a suitably aligned address to
mmap() or shmat(). If no address is passed, the mapping is only
aligned to the huge page size.
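For illustration, here is a rough user space sketch of what is required
today (untested; the 1GB PUD_SIZE value is the x86_64 one, and huge
pages must already be reserved in the pool):

    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB 0x40000             /* value on x86 */
    #endif

    #define PUD_SZ      (1UL << 30)         /* PUD_SIZE on x86_64 */
    #define MAP_LEN     (2 * PUD_SZ)        /* must be >= PUD_SIZE */

    int main(void)
    {
            /*
             * Probe for a large enough hole, then round the address up
             * to a PUD_SIZE boundary.  (Racy between the munmap and the
             * second mmap; good enough to show the idea.)
             */
            void *probe = mmap(NULL, MAP_LEN + PUD_SZ, PROT_NONE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            unsigned long hint;
            void *addr;

            if (probe == MAP_FAILED)
                    return 1;
            hint = ((unsigned long)probe + PUD_SZ - 1) & ~(PUD_SZ - 1);
            munmap(probe, MAP_LEN + PUD_SZ);

            /* Shared hugetlb mapping placed at the aligned hint. */
            addr = mmap((void *)hint, MAP_LEN, PROT_READ | PROT_WRITE,
                        MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
            if (addr == MAP_FAILED)
                    perror("mmap");
            else
                    printf("hugetlb mapping at %p\n", addr);
            return 0;
    }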
To better utilize huge PMD sharing, attempt to PUD_SIZE align mappings
when the following conditions are met:
- The address passed to mmap() or shmat() is NULL
- The mapping is flagged as shared
- The mapping is at least PUD_SIZE in length
If a PUD_SIZE aligned mapping cannot be created, fall back to a huge
page size aligned mapping.
Currently, only arm64 and x86 support huge PMD sharing. x86 defines
HAVE_ARCH_HUGETLB_UNMAPPED_AREA, so the code changes are made in the
arch-specific get_unmapped_area routines. arm64 uses the
architecture-independent code in fs/hugetlbfs/inode.c.
Mike Kravetz (2):
mm/hugetlbfs: Attempt PUD_SIZE mapping alignment if PMD sharing
enabled
x86/hugetlb: Attempt PUD_SIZE mapping alignment if PMD sharing enabled
arch/x86/mm/hugetlbpage.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
fs/hugetlbfs/inode.c | 29 +++++++++++++++++++--
2 files changed, 88 insertions(+), 5 deletions(-)
--
2.4.3
When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
following conditions are met:
- Address passed to mmap or shmat is NULL
- The mapping is flagged as shared
- The mapping is at least PUD_SIZE in length
If a PUD_SIZE aligned mapping cannot be created, then fall back to a
huge page size mapping.
Signed-off-by: Mike Kravetz <[email protected]>
---
arch/x86/mm/hugetlbpage.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 61 insertions(+), 3 deletions(-)
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 42982b2..4f53af5 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -78,14 +78,39 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
{
struct hstate *h = hstate_file(file);
struct vm_unmapped_area_info info;
+ bool pud_size_align = false;
+ unsigned long ret_addr;
+
+ /*
+ * If PMD sharing is enabled, align to PUD_SIZE to facilitate
+ * sharing. Only attempt alignment if no address was passed in,
+ * flags indicate sharing and size is big enough.
+ */
+ if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
+ !addr && flags & MAP_SHARED && len >= PUD_SIZE)
+ pud_size_align = true;
info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_legacy_base;
info.high_limit = TASK_SIZE;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ if (pud_size_align)
+ info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
+ else
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
- return vm_unmapped_area(&info);
+ ret_addr = vm_unmapped_area(&info);
+
+ /*
+ * If failed with PUD_SIZE alignment, try again with huge page
+ * size alignment.
+ */
+ if ((ret_addr & ~PAGE_MASK) && pud_size_align) {
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ ret_addr = vm_unmapped_area(&info);
+ }
+
+ return ret_addr;
}
static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
@@ -95,16 +120,38 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
struct hstate *h = hstate_file(file);
struct vm_unmapped_area_info info;
unsigned long addr;
+ bool pud_size_align = false;
+
+ /*
+ * If PMD sharing is enabled, align to PUD_SIZE to facilitate
+ * sharing. Only attempt alignment if no address was passed in,
+ * flags indicate sharing and size is big enough.
+ */
+ if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
+ !addr0 && flags & MAP_SHARED && len >= PUD_SIZE)
+ pud_size_align = true;
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = current->mm->mmap_base;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ if (pud_size_align)
+ info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
+ else
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
+ * If failed with PUD_SIZE alignment, try again with huge page
+ * size alignment.
+ */
+ if ((addr & ~PAGE_MASK) && pud_size_align) {
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ addr = vm_unmapped_area(&info);
+ }
+
+ /*
* A failed mmap() very likely causes application failure,
* so fall back to the bottom-up function here. This scenario
* can happen with large stack limits and large mmap()
@@ -115,7 +162,18 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
+ if (pud_size_align)
+ info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
addr = vm_unmapped_area(&info);
+
+ /*
+ * If failed again with PUD_SIZE alignment, finally try with
+ * huge page size alignment.
+ */
+ if (addr & ~PAGE_MASK) {
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ addr = vm_unmapped_area(&info);
+ }
}
return addr;
--
2.4.3
When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
following conditions are met:
- Address passed to mmap or shmat is NULL
- The mapping is flagged as shared
- The mapping is at least PUD_SIZE in length
If a PUD_SIZE aligned mapping cannot be created, then fall back to a
huge page size mapping.
Signed-off-by: Mike Kravetz <[email protected]>
---
fs/hugetlbfs/inode.c | 29 +++++++++++++++++++++++++++--
1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 540ddc9..22b2e38 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -175,6 +175,17 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
struct vm_unmapped_area_info info;
+ bool pud_size_align = false;
+ unsigned long ret_addr;
+
+ /*
+ * If PMD sharing is enabled, align to PUD_SIZE to facilitate
+ * sharing. Only attempt alignment if no address was passed in,
+ * flags indicate sharing and size is big enough.
+ */
+ if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
+ !addr && flags & MAP_SHARED && len >= PUD_SIZE)
+ pud_size_align = true;
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -199,9 +210,23 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
info.length = len;
info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ if (pud_size_align)
+ info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
+ else
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
- return vm_unmapped_area(&info);
+ ret_addr = vm_unmapped_area(&info);
+
+ /*
+ * If failed with PUD_SIZE alignment, try again with huge page
+ * size alignment.
+ */
+ if ((ret_addr & ~PAGE_MASK) && pud_size_align) {
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ ret_addr = vm_unmapped_area(&info);
+ }
+
+ return ret_addr;
}
#endif
--
2.4.3
>
> When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
> following conditions are met:
> - Address passed to mmap or shmat is NULL
> - The mapping is flagged as shared
> - The mapping is at least PUD_SIZE in length
> If a PUD_SIZE aligned mapping cannot be created, then fall back to a
> huge page size mapping.
>
> Signed-off-by: Mike Kravetz <[email protected]>
> ---
> fs/hugetlbfs/inode.c | 29 +++++++++++++++++++++++++++--
> 1 file changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 540ddc9..22b2e38 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -175,6 +175,17 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> struct vm_area_struct *vma;
> struct hstate *h = hstate_file(file);
> struct vm_unmapped_area_info info;
> + bool pud_size_align = false;
> + unsigned long ret_addr;
> +
> + /*
> + * If PMD sharing is enabled, align to PUD_SIZE to facilitate
> + * sharing. Only attempt alignment if no address was passed in,
> + * flags indicate sharing and size is big enough.
> + */
> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> + !addr && flags & MAP_SHARED && len >= PUD_SIZE)
> + pud_size_align = true;
>
> if (len & ~huge_page_mask(h))
> return -EINVAL;
> @@ -199,9 +210,23 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> info.length = len;
> info.low_limit = TASK_UNMAPPED_BASE;
> info.high_limit = TASK_SIZE;
> - info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> + if (pud_size_align)
> + info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
> + else
> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> info.align_offset = 0;
> - return vm_unmapped_area(&info);
> + ret_addr = vm_unmapped_area(&info);
> +
> + /*
> + * If failed with PUD_SIZE alignment, try again with huge page
> + * size alignment.
> + */
Can we avoid going another round of search when the file uses the
PUD page size?
Hillf
> + if ((ret_addr & ~PAGE_MASK) && pud_size_align) {
> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> + ret_addr = vm_unmapped_area(&info);
> + }
> +
> + return ret_addr;
> }
> #endif
>
> --
> 2.4.3
* Mike Kravetz <[email protected]> wrote:
> When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
> following conditions are met:
> - Address passed to mmap or shmat is NULL
> - The mapping is flagged as shared
> - The mapping is at least PUD_SIZE in length
> If a PUD_SIZE aligned mapping cannot be created, then fall back to a
> huge page size mapping.
>
> Signed-off-by: Mike Kravetz <[email protected]>
> ---
> arch/x86/mm/hugetlbpage.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 61 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
> index 42982b2..4f53af5 100644
> --- a/arch/x86/mm/hugetlbpage.c
> +++ b/arch/x86/mm/hugetlbpage.c
> @@ -78,14 +78,39 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
> {
> struct hstate *h = hstate_file(file);
> struct vm_unmapped_area_info info;
> + bool pud_size_align = false;
> + unsigned long ret_addr;
> +
> + /*
> + * If PMD sharing is enabled, align to PUD_SIZE to facilitate
> + * sharing. Only attempt alignment if no address was passed in,
> + * flags indicate sharing and size is big enough.
> + */
> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> + !addr && flags & MAP_SHARED && len >= PUD_SIZE)
> + pud_size_align = true;
>
> info.flags = 0;
> info.length = len;
> info.low_limit = current->mm->mmap_legacy_base;
> info.high_limit = TASK_SIZE;
> - info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> + if (pud_size_align)
> + info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
> + else
> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> info.align_offset = 0;
> - return vm_unmapped_area(&info);
> + ret_addr = vm_unmapped_area(&info);
> +
> + /*
> + * If failed with PUD_SIZE alignment, try again with huge page
> + * size alignment.
> + */
> + if ((ret_addr & ~PAGE_MASK) && pud_size_align) {
> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> + ret_addr = vm_unmapped_area(&info);
> + }
So AFAICS 'ret_addr' is either page aligned, or is an error code. Wouldn't it be a
lot easier to read to say:
if ((long)ret_addr > 0 && pud_size_align) {
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
ret_addr = vm_unmapped_area(&info);
}
return ret_addr;
to make it clear that it's about error handling, not some alignment
requirement/restriction?
> /*
> + * If failed with PUD_SIZE alignment, try again with huge page
> + * size alignment.
> + */
> + if ((addr & ~PAGE_MASK) && pud_size_align) {
> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> + addr = vm_unmapped_area(&info);
> + }
Ditto.
> addr = vm_unmapped_area(&info);
> +
> + /*
> + * If failed again with PUD_SIZE alignment, finally try with
> + * huge page size alignment.
> + */
> + if (addr & ~PAGE_MASK) {
> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> + addr = vm_unmapped_area(&info);
Ditto.
Thanks,
Ingo
On 03/28/2016 08:50 PM, Hillf Danton wrote:
>>
>> When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
>> following conditions are met:
>> - Address passed to mmap or shmat is NULL
>> - The mapping is flagged as shared
>> - The mapping is at least PUD_SIZE in length
>> If a PUD_SIZE aligned mapping cannot be created, then fall back to a
>> huge page size mapping.
>>
>> Signed-off-by: Mike Kravetz <[email protected]>
>> ---
>> fs/hugetlbfs/inode.c | 29 +++++++++++++++++++++++++++--
>> 1 file changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>> index 540ddc9..22b2e38 100644
>> --- a/fs/hugetlbfs/inode.c
>> +++ b/fs/hugetlbfs/inode.c
>> @@ -175,6 +175,17 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>> struct vm_area_struct *vma;
>> struct hstate *h = hstate_file(file);
>> struct vm_unmapped_area_info info;
>> + bool pud_size_align = false;
>> + unsigned long ret_addr;
>> +
>> + /*
>> + * If PMD sharing is enabled, align to PUD_SIZE to facilitate
>> + * sharing. Only attempt alignment if no address was passed in,
>> + * flags indicate sharing and size is big enough.
>> + */
>> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
>> + !addr && flags & MAP_SHARED && len >= PUD_SIZE)
>> + pud_size_align = true;
>>
>> if (len & ~huge_page_mask(h))
>> return -EINVAL;
>> @@ -199,9 +210,23 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>> info.length = len;
>> info.low_limit = TASK_UNMAPPED_BASE;
>> info.high_limit = TASK_SIZE;
>> - info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>> + if (pud_size_align)
>> + info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
>> + else
>> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>> info.align_offset = 0;
>> - return vm_unmapped_area(&info);
>> + ret_addr = vm_unmapped_area(&info);
>> +
>> + /*
>> + * If failed with PUD_SIZE alignment, try again with huge page
>> + * size alignment.
>> + */
>
> Can we avoid going another round of search when the file uses the
> PUD page size?
Yes, that brings up a good point.
Since we only do PMD sharing with PMD_SIZE huge pages, that should be
part of the check as to whether we try PUD_SIZE alignment. The initial
check should be expanded as follows:
if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && !addr &&
flags & MAP_SHARED && huge_page_size(h) == PMD_SIZE && len >= PUD_SIZE)
pud_size_align = true;
With that additional check, pud_size_align remains false for files
using PUD_SIZE huge pages, and we do not retry.
--
Mike Kravetz
>
> Hillf
>> + if ((ret_addr & ~PAGE_MASK) && pud_size_align) {
>> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>> + ret_addr = vm_unmapped_area(&info);
>> + }
>> +
>> + return ret_addr;
>> }
>> #endif
>>
>> --
>> 2.4.3
>
On 03/29/2016 01:35 AM, Ingo Molnar wrote:
>
> * Mike Kravetz <[email protected]> wrote:
>
>> When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
>> following conditions are met:
>> - Address passed to mmap or shmat is NULL
>> - The mapping is flagged as shared
>> - The mapping is at least PUD_SIZE in length
>> If a PUD_SIZE aligned mapping cannot be created, then fall back to a
>> huge page size mapping.
>>
>> Signed-off-by: Mike Kravetz <[email protected]>
>> ---
>> arch/x86/mm/hugetlbpage.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
>> 1 file changed, 61 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
>> index 42982b2..4f53af5 100644
>> --- a/arch/x86/mm/hugetlbpage.c
>> +++ b/arch/x86/mm/hugetlbpage.c
>> @@ -78,14 +78,39 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
>> {
>> struct hstate *h = hstate_file(file);
>> struct vm_unmapped_area_info info;
>> + bool pud_size_align = false;
>> + unsigned long ret_addr;
>> +
>> + /*
>> + * If PMD sharing is enabled, align to PUD_SIZE to facilitate
>> + * sharing. Only attempt alignment if no address was passed in,
>> + * flags indicate sharing and size is big enough.
>> + */
>> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
>> + !addr && flags & MAP_SHARED && len >= PUD_SIZE)
>> + pud_size_align = true;
>>
>> info.flags = 0;
>> info.length = len;
>> info.low_limit = current->mm->mmap_legacy_base;
>> info.high_limit = TASK_SIZE;
>> - info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>> + if (pud_size_align)
>> + info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
>> + else
>> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>> info.align_offset = 0;
>> - return vm_unmapped_area(&info);
>> + ret_addr = vm_unmapped_area(&info);
>> +
>> + /*
>> + * If failed with PUD_SIZE alignment, try again with huge page
>> + * size alignment.
>> + */
>> + if ((ret_addr & ~PAGE_MASK) && pud_size_align) {
>> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>> + ret_addr = vm_unmapped_area(&info);
>> + }
>
> So AFAICS 'ret_addr' is either page aligned, or is an error code. Wouldn't it be a
> lot easier to read to say:
>
> if ((long)ret_addr > 0 && pud_size_align) {
> info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> ret_addr = vm_unmapped_area(&info);
> }
>
> return ret_addr;
>
> to make it clear that it's about error handling, not some alignment
> requirement/restriction?
Yes, I agree that is easier to read. However, it assumes that process
virtual addresses can never evaluate to a negative long value. This may
be the case for x86_64 today. But, there are other architectures where
this is not the case. I know this is x86 specific code, but might it be
possible that x86 virtual addresses could be negative longs in the future?
It appears that all callers of vm_unmapped_area() are using the page aligned
check to determine error. I would prefer to do the same, and can add
comments to make that more clear.
Thanks,
--
Mike Kravetz
>
>> /*
>> + * If failed with PUD_SIZE alignment, try again with huge page
>> + * size alignment.
>> + */
>> + if ((addr & ~PAGE_MASK) && pud_size_align) {
>> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>> + addr = vm_unmapped_area(&info);
>> + }
>
> Ditto.
>
>> addr = vm_unmapped_area(&info);
>> +
>> + /*
>> + * If failed again with PUD_SIZE alignment, finally try with
>> + * huge page size alignment.
>> + */
>> + if (addr & ~PAGE_MASK) {
>> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>> + addr = vm_unmapped_area(&info);
>
> Ditto.
>
> Thanks,
>
> Ingo
>
On Mon, Mar 28, 2016 at 06:12:49PM -0700, Mike Kravetz wrote:
> When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
> following conditions are met:
> - Address passed to mmap or shmat is NULL
> - The mapping is flagged as shared
> - The mapping is at least PUD_SIZE in length
> If a PUD_SIZE aligned mapping cannot be created, then fall back to a
> huge page size mapping.
It would be kinder if the patch description included why this change is
being made. Even a simple "to facilitate pmd sharing" would be helpful
for someone reading "git log".
>
> Signed-off-by: Mike Kravetz <[email protected]>
> ---
> fs/hugetlbfs/inode.c | 29 +++++++++++++++++++++++++++--
> 1 file changed, 27 insertions(+), 2 deletions(-)
>
> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
> index 540ddc9..22b2e38 100644
> --- a/fs/hugetlbfs/inode.c
> +++ b/fs/hugetlbfs/inode.c
> @@ -175,6 +175,17 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
> struct vm_area_struct *vma;
> struct hstate *h = hstate_file(file);
> struct vm_unmapped_area_info info;
> + bool pud_size_align = false;
> + unsigned long ret_addr;
> +
> + /*
> + * If PMD sharing is enabled, align to PUD_SIZE to facilitate
> + * sharing. Only attempt alignment if no address was passed in,
> + * flags indicate sharing and size is big enough.
> + */
> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> + !addr && flags & MAP_SHARED && len >= PUD_SIZE)
> + pud_size_align = true;
This code will be duplicated in the next patch, so how about moving
this check into a separate helper routine?
Thanks,
Naoya Horiguchi
On Tue, Mar 29, 2016 at 10:05:31AM -0700, Mike Kravetz wrote:
> On 03/29/2016 01:35 AM, Ingo Molnar wrote:
> >
> > * Mike Kravetz <[email protected]> wrote:
> >
> >> When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
> >> following conditions are met:
> >> - Address passed to mmap or shmat is NULL
> >> - The mapping is flagged as shared
> >> - The mapping is at least PUD_SIZE in length
> >> If a PUD_SIZE aligned mapping cannot be created, then fall back to a
> >> huge page size mapping.
> >>
> >> Signed-off-by: Mike Kravetz <[email protected]>
> >> ---
> >> arch/x86/mm/hugetlbpage.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
> >> 1 file changed, 61 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
> >> index 42982b2..4f53af5 100644
> >> --- a/arch/x86/mm/hugetlbpage.c
> >> +++ b/arch/x86/mm/hugetlbpage.c
> >> @@ -78,14 +78,39 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
> >> {
> >> struct hstate *h = hstate_file(file);
> >> struct vm_unmapped_area_info info;
> >> + bool pud_size_align = false;
> >> + unsigned long ret_addr;
> >> +
> >> + /*
> >> + * If PMD sharing is enabled, align to PUD_SIZE to facilitate
> >> + * sharing. Only attempt alignment if no address was passed in,
> >> + * flags indicate sharing and size is big enough.
> >> + */
> >> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> >> + !addr && flags & MAP_SHARED && len >= PUD_SIZE)
> >> + pud_size_align = true;
> >>
> >> info.flags = 0;
> >> info.length = len;
> >> info.low_limit = current->mm->mmap_legacy_base;
> >> info.high_limit = TASK_SIZE;
> >> - info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> >> + if (pud_size_align)
> >> + info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
> >> + else
> >> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> >> info.align_offset = 0;
> >> - return vm_unmapped_area(&info);
> >> + ret_addr = vm_unmapped_area(&info);
> >> +
> >> + /*
> >> + * If failed with PUD_SIZE alignment, try again with huge page
> >> + * size alignment.
> >> + */
> >> + if ((ret_addr & ~PAGE_MASK) && pud_size_align) {
> >> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> >> + ret_addr = vm_unmapped_area(&info);
> >> + }
> >
> > So AFAICS 'ret_addr' is either page aligned, or is an error code. Wouldn't it be a
> > lot easier to read to say:
> >
> > if ((long)ret_addr > 0 && pud_size_align) {
> > info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> > ret_addr = vm_unmapped_area(&info);
> > }
> >
> > return ret_addr;
> >
> > to make it clear that it's about error handling, not some alignment
> > requirement/restriction?
>
> Yes, I agree that is easier to read. However, it assumes that process
> virtual addresses can never evaluate to a negative long value. This may
> be the case for x86_64 today. But, there are other architectures where
> this is not the case. I know this is x86 specific code, but might it be
> possible that x86 virtual addresses could be negative longs in the future?
>
> It appears that all callers of vm_unmapped_area() are using the page aligned
> check to determine error. I would prefer to do the same, and can add
> comments to make that more clear.
IS_ERR_VALUE() might be helpful?
* Naoya Horiguchi <[email protected]> wrote:
> On Tue, Mar 29, 2016 at 10:05:31AM -0700, Mike Kravetz wrote:
> > On 03/29/2016 01:35 AM, Ingo Molnar wrote:
> > >
> > > * Mike Kravetz <[email protected]> wrote:
> > >
> > >> When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
> > >> following conditions are met:
> > >> - Address passed to mmap or shmat is NULL
> > >> - The mapping is flagged as shared
> > >> - The mapping is at least PUD_SIZE in length
> > >> If a PUD_SIZE aligned mapping cannot be created, then fall back to a
> > >> huge page size mapping.
> > >>
> > >> Signed-off-by: Mike Kravetz <[email protected]>
> > >> ---
> > >> arch/x86/mm/hugetlbpage.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
> > >> 1 file changed, 61 insertions(+), 3 deletions(-)
> > >>
> > >> diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
> > >> index 42982b2..4f53af5 100644
> > >> --- a/arch/x86/mm/hugetlbpage.c
> > >> +++ b/arch/x86/mm/hugetlbpage.c
> > >> @@ -78,14 +78,39 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
> > >> {
> > >> struct hstate *h = hstate_file(file);
> > >> struct vm_unmapped_area_info info;
> > >> + bool pud_size_align = false;
> > >> + unsigned long ret_addr;
> > >> +
> > >> + /*
> > >> + * If PMD sharing is enabled, align to PUD_SIZE to facilitate
> > >> + * sharing. Only attempt alignment if no address was passed in,
> > >> + * flags indicate sharing and size is big enough.
> > >> + */
> > >> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> > >> + !addr && flags & MAP_SHARED && len >= PUD_SIZE)
> > >> + pud_size_align = true;
> > >>
> > >> info.flags = 0;
> > >> info.length = len;
> > >> info.low_limit = current->mm->mmap_legacy_base;
> > >> info.high_limit = TASK_SIZE;
> > >> - info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> > >> + if (pud_size_align)
> > >> + info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
> > >> + else
> > >> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> > >> info.align_offset = 0;
> > >> - return vm_unmapped_area(&info);
> > >> + ret_addr = vm_unmapped_area(&info);
> > >> +
> > >> + /*
> > >> + * If failed with PUD_SIZE alignment, try again with huge page
> > >> + * size alignment.
> > >> + */
> > >> + if ((ret_addr & ~PAGE_MASK) && pud_size_align) {
> > >> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> > >> + ret_addr = vm_unmapped_area(&info);
> > >> + }
> > >
> > > So AFAICS 'ret_addr' is either page aligned, or is an error code. Wouldn't it be a
> > > lot easier to read to say:
> > >
> > > if ((long)ret_addr > 0 && pud_size_align) {
> > > info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> > > ret_addr = vm_unmapped_area(&info);
> > > }
> > >
> > > return ret_addr;
> > >
> > > to make it clear that it's about error handling, not some alignment
> > > requirement/restriction?
> >
> > Yes, I agree that is easier to read. However, it assumes that process
> > virtual addresses can never evaluate to a negative long value. This may
> > be the case for x86_64 today. But, there are other architectures where
> > this is not the case. I know this is x86 specific code, but might it be
> > possible that x86 virtual addresses could be negative longs in the future?
> >
> > It appears that all callers of vm_unmapped_area() are using the page aligned
> > check to determine error. I would prefer to do the same, and can add
> > comments to make that more clear.
>
> IS_ERR_VALUE() might be helpful?
Yes, please use IS_ERR_VALUE(); using PAGE_MASK is way too obfuscated.
Thanks,
Ingo
On 03/30/2016 07:26 PM, Naoya Horiguchi wrote:
> On Tue, Mar 29, 2016 at 10:05:31AM -0700, Mike Kravetz wrote:
>> On 03/29/2016 01:35 AM, Ingo Molnar wrote:
>>>
>>> * Mike Kravetz <[email protected]> wrote:
>>>
>>>> When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
>>>> following conditions are met:
>>>> - Address passed to mmap or shmat is NULL
>>>> - The mapping is flagged as shared
>>>> - The mapping is at least PUD_SIZE in length
>>>> If a PUD_SIZE aligned mapping cannot be created, then fall back to a
>>>> huge page size mapping.
>>>>
>>>> Signed-off-by: Mike Kravetz <[email protected]>
>>>> ---
>>>> arch/x86/mm/hugetlbpage.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---
>>>> 1 file changed, 61 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
>>>> index 42982b2..4f53af5 100644
>>>> --- a/arch/x86/mm/hugetlbpage.c
>>>> +++ b/arch/x86/mm/hugetlbpage.c
>>>> @@ -78,14 +78,39 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
>>>> {
>>>> struct hstate *h = hstate_file(file);
>>>> struct vm_unmapped_area_info info;
>>>> + bool pud_size_align = false;
>>>> + unsigned long ret_addr;
>>>> +
>>>> + /*
>>>> + * If PMD sharing is enabled, align to PUD_SIZE to facilitate
>>>> + * sharing. Only attempt alignment if no address was passed in,
>>>> + * flags indicate sharing and size is big enough.
>>>> + */
>>>> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
>>>> + !addr && flags & MAP_SHARED && len >= PUD_SIZE)
>>>> + pud_size_align = true;
>>>>
>>>> info.flags = 0;
>>>> info.length = len;
>>>> info.low_limit = current->mm->mmap_legacy_base;
>>>> info.high_limit = TASK_SIZE;
>>>> - info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>>>> + if (pud_size_align)
>>>> + info.align_mask = PAGE_MASK & (PUD_SIZE - 1);
>>>> + else
>>>> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>>>> info.align_offset = 0;
>>>> - return vm_unmapped_area(&info);
>>>> + ret_addr = vm_unmapped_area(&info);
>>>> +
>>>> + /*
>>>> + * If failed with PUD_SIZE alignment, try again with huge page
>>>> + * size alignment.
>>>> + */
>>>> + if ((ret_addr & ~PAGE_MASK) && pud_size_align) {
>>>> + info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>>>> + ret_addr = vm_unmapped_area(&info);
>>>> + }
>>>
>>> So AFAICS 'ret_addr' is either page aligned, or is an error code. Wouldn't it be a
>>> lot easier to read to say:
>>>
>>> if ((long)ret_addr > 0 && pud_size_align) {
>>> info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>>> ret_addr = vm_unmapped_area(&info);
>>> }
>>>
>>> return ret_addr;
>>>
>>> to make it clear that it's about error handling, not some alignment
>>> requirement/restriction?
>>
>> Yes, I agree that is easier to read. However, it assumes that process
>> virtual addresses can never evaluate to a negative long value. This may
>> be the case for x86_64 today. But, there are other architectures where
>> this is not the case. I know this is x86 specific code, but might it be
>> possible that x86 virtual addresses could be negative longs in the future?
>>
>> It appears that all callers of vm_unmapped_area() are using the page aligned
>> check to determine error. I would prefer to do the same, and can add
>> comments to make that more clear.
>
> IS_ERR_VALUE() might be helpful?
>
Thanks Naoya, I'll change all this to use IS_ERR_VALUE().
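Something along these lines is what I have in mind (untested sketch of
just the retry in the bottomup path; IS_ERR_VALUE() comes from
include/linux/err.h):

	ret_addr = vm_unmapped_area(&info);

	/*
	 * vm_unmapped_area() returns either a page aligned address or
	 * a negative errno value.  If the PUD_SIZE aligned attempt
	 * failed, retry with only huge page size alignment.
	 */
	if (IS_ERR_VALUE(ret_addr) && pud_size_align) {
		info.align_mask = PAGE_MASK & ~huge_page_mask(h);
		ret_addr = vm_unmapped_area(&info);
	}

	return ret_addr;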
--
Mike Kravetz
On 03/30/2016 07:18 PM, Naoya Horiguchi wrote:
> On Mon, Mar 28, 2016 at 06:12:49PM -0700, Mike Kravetz wrote:
>> When creating a hugetlb mapping, attempt PUD_SIZE alignment if the
>> following conditions are met:
>> - Address passed to mmap or shmat is NULL
>> - The mapping is flagged as shared
>> - The mapping is at least PUD_SIZE in length
>> If a PUD_SIZE aligned mapping cannot be created, then fall back to a
>> huge page size mapping.
>
> It would be kinder if the patch description included why this change is
> being made. Even a simple "to facilitate pmd sharing" would be helpful
> for someone reading "git log".
Ok, will do.
>
>>
>> Signed-off-by: Mike Kravetz <[email protected]>
>> ---
>> fs/hugetlbfs/inode.c | 29 +++++++++++++++++++++++++++--
>> 1 file changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
>> index 540ddc9..22b2e38 100644
>> --- a/fs/hugetlbfs/inode.c
>> +++ b/fs/hugetlbfs/inode.c
>> @@ -175,6 +175,17 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
>> struct vm_area_struct *vma;
>> struct hstate *h = hstate_file(file);
>> struct vm_unmapped_area_info info;
>> + bool pud_size_align = false;
>> + unsigned long ret_addr;
>> +
>> + /*
>> + * If PMD sharing is enabled, align to PUD_SIZE to facilitate
>> + * sharing. Only attempt alignment if no address was passed in,
>> + * flags indicate sharing and size is big enough.
>> + */
>> + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
>> + !addr && flags & MAP_SHARED && len >= PUD_SIZE)
>> + pud_size_align = true;
>
> This code will be duplicated in the next patch, so how about moving
> this check into a separate helper routine?
Good suggestion, thanks.
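Something like the following is what I have in mind for the helper
(untested sketch; the name and where it lives are tentative), folding
in the PMD_SIZE check discussed with Hillf:

	/*
	 * Tentative helper: should this hugetlb mapping request attempt
	 * PUD_SIZE alignment to facilitate PMD sharing?
	 */
	static inline bool hugetlb_mmap_try_pud_align(struct hstate *h,
						      unsigned long addr,
						      unsigned long len,
						      unsigned long flags)
	{
		if (!IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE))
			return false;

		/* Only attempt alignment if no address hint was passed in. */
		if (addr)
			return false;

		/* PMD sharing only applies to shared, PMD_SIZE mappings. */
		if (!(flags & MAP_SHARED) || huge_page_size(h) != PMD_SIZE)
			return false;

		return len >= PUD_SIZE;
	}

Each caller would then just do
pud_size_align = hugetlb_mmap_try_pud_align(h, addr, len, flags);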
--
Mike Kravetz
>
> Thanks,
> Naoya Horiguchi
>