2022-06-24 17:57:24

by James Houghton

Subject: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

After high-granularity mapping, page table entries for HugeTLB pages can
be of any size/type. (For example, we can have a 1G page mapped with a
mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
PTE after we have done a page table walk.

Without this, we'd have to pass around the "size" of the PTE everywhere.
We effectively did this before; it could be fetched from the hstate,
which we pass around pretty much everywhere.

This commit includes definitions for some basic helper functions that
are used later. These helper functions wrap existing PTE
inspection/modification functions, where the correct version is picked
depending on if the HugeTLB PTE is actually "huge" or not. (Previously,
all HugeTLB PTEs were "huge").

For example, hugetlb_ptep_get wraps huge_ptep_get and ptep_get, where
ptep_get is used when the HugeTLB PTE is PAGE_SIZE, and huge_ptep_get is
used in all other cases.
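
As a purely illustrative sketch of the intended usage (hugetlb_walk_to()
and handle_unmapped() below are made-up names, not functions added by this
series), a caller that has filled in a hugetlb_pte no longer needs to
carry the mapping size around:

static int example_user(struct mm_struct *mm, unsigned long address)
{
	struct hugetlb_pte hpte;

	hugetlb_pte_init(&hpte);
	if (hugetlb_walk_to(mm, &hpte, address))	/* hypothetical walk helper */
		return -ENOMEM;

	/* hugetlb_pte_none() reads via ptep_get() or huge_ptep_get() as needed. */
	if (hugetlb_pte_none(&hpte))
		return handle_unmapped(mm,		/* hypothetical handler */
				       address & hugetlb_pte_mask(&hpte),
				       hugetlb_pte_size(&hpte));
	return 0;
}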

Signed-off-by: James Houghton <[email protected]>
---
include/linux/hugetlb.h | 84 +++++++++++++++++++++++++++++++++++++++++
mm/hugetlb.c | 57 ++++++++++++++++++++++++++++
2 files changed, 141 insertions(+)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5fe1db46d8c9..1d4ec9dfdebf 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -46,6 +46,68 @@ enum {
__NR_USED_SUBPAGE,
};

+struct hugetlb_pte {
+ pte_t *ptep;
+ unsigned int shift;
+};
+
+static inline
+void hugetlb_pte_init(struct hugetlb_pte *hpte)
+{
+ hpte->ptep = NULL;
+}
+
+static inline
+void hugetlb_pte_populate(struct hugetlb_pte *hpte, pte_t *ptep,
+ unsigned int shift)
+{
+ BUG_ON(!ptep);
+ hpte->ptep = ptep;
+ hpte->shift = shift;
+}
+
+static inline
+unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
+{
+ BUG_ON(!hpte->ptep);
+ return 1UL << hpte->shift;
+}
+
+static inline
+unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
+{
+ BUG_ON(!hpte->ptep);
+ return ~(hugetlb_pte_size(hpte) - 1);
+}
+
+static inline
+unsigned int hugetlb_pte_shift(const struct hugetlb_pte *hpte)
+{
+ BUG_ON(!hpte->ptep);
+ return hpte->shift;
+}
+
+static inline
+bool hugetlb_pte_huge(const struct hugetlb_pte *hpte)
+{
+ return !IS_ENABLED(CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING) ||
+ hugetlb_pte_shift(hpte) > PAGE_SHIFT;
+}
+
+static inline
+void hugetlb_pte_copy(struct hugetlb_pte *dest, const struct hugetlb_pte *src)
+{
+ dest->ptep = src->ptep;
+ dest->shift = src->shift;
+}
+
+bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte);
+bool hugetlb_pte_none(const struct hugetlb_pte *hpte);
+bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte);
+pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte);
+void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
+ unsigned long address);
+
struct hugepage_subpool {
spinlock_t lock;
long count;
@@ -1130,6 +1192,28 @@ static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
return ptl;
}

+static inline
+spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
+{
+
+ BUG_ON(!hpte->ptep);
+ // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
+ // the regular page table lock.
+ if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
+ return huge_pte_lockptr(hugetlb_pte_shift(hpte),
+ mm, hpte->ptep);
+ return &mm->page_table_lock;
+}
+
+static inline
+spinlock_t *hugetlb_pte_lock(struct mm_struct *mm, struct hugetlb_pte *hpte)
+{
+ spinlock_t *ptl = hugetlb_pte_lockptr(mm, hpte);
+
+ spin_lock(ptl);
+ return ptl;
+}
+
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
extern void __init hugetlb_cma_check(void);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d6d0d4c03def..1a1434e29740 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1120,6 +1120,63 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
return false;
}

+bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte)
+{
+ pgd_t pgd;
+ p4d_t p4d;
+ pud_t pud;
+ pmd_t pmd;
+
+ BUG_ON(!hpte->ptep);
+ if (hugetlb_pte_size(hpte) >= PGDIR_SIZE) {
+ pgd = *(pgd_t *)hpte->ptep;
+ return pgd_present(pgd) && pgd_leaf(pgd);
+ } else if (hugetlb_pte_size(hpte) >= P4D_SIZE) {
+ p4d = *(p4d_t *)hpte->ptep;
+ return p4d_present(p4d) && p4d_leaf(p4d);
+ } else if (hugetlb_pte_size(hpte) >= PUD_SIZE) {
+ pud = *(pud_t *)hpte->ptep;
+ return pud_present(pud) && pud_leaf(pud);
+ } else if (hugetlb_pte_size(hpte) >= PMD_SIZE) {
+ pmd = *(pmd_t *)hpte->ptep;
+ return pmd_present(pmd) && pmd_leaf(pmd);
+ } else if (hugetlb_pte_size(hpte) >= PAGE_SIZE)
+ return pte_present(*hpte->ptep);
+ BUG();
+}
+
+bool hugetlb_pte_none(const struct hugetlb_pte *hpte)
+{
+ if (hugetlb_pte_huge(hpte))
+ return huge_pte_none(huge_ptep_get(hpte->ptep));
+ return pte_none(ptep_get(hpte->ptep));
+}
+
+bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte)
+{
+ if (hugetlb_pte_huge(hpte))
+ return huge_pte_none_mostly(huge_ptep_get(hpte->ptep));
+ return pte_none_mostly(ptep_get(hpte->ptep));
+}
+
+pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte)
+{
+ if (hugetlb_pte_huge(hpte))
+ return huge_ptep_get(hpte->ptep);
+ return ptep_get(hpte->ptep);
+}
+
+void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
+ unsigned long address)
+{
+ BUG_ON(!hpte->ptep);
+ unsigned long sz = hugetlb_pte_size(hpte);
+
+ if (sz > PAGE_SIZE)
+ return huge_pte_clear(mm, address, hpte->ptep, sz);
+ return pte_clear(mm, address, hpte->ptep);
+}
+
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
--
2.37.0.rc0.161.g10f37bed90-goog


2022-06-27 13:23:43

by manish.mishra

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries


On 24/06/22 11:06 pm, James Houghton wrote:
> After high-granularity mapping, page table entries for HugeTLB pages can
> be of any size/type. (For example, we can have a 1G page mapped with a
> mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> PTE after we have done a page table walk.
>
> Without this, we'd have to pass around the "size" of the PTE everywhere.
> We effectively did this before; it could be fetched from the hstate,
> which we pass around pretty much everywhere.
>
> This commit includes definitions for some basic helper functions that
> are used later. These helper functions wrap existing PTE
> inspection/modification functions, where the correct version is picked
> depending on if the HugeTLB PTE is actually "huge" or not. (Previously,
> all HugeTLB PTEs were "huge").
>
> For example, hugetlb_ptep_get wraps huge_ptep_get and ptep_get, where
> ptep_get is used when the HugeTLB PTE is PAGE_SIZE, and huge_ptep_get is
> used in all other cases.
>
> Signed-off-by: James Houghton <[email protected]>
> ---
> include/linux/hugetlb.h | 84 +++++++++++++++++++++++++++++++++++++++++
> mm/hugetlb.c | 57 ++++++++++++++++++++++++++++
> 2 files changed, 141 insertions(+)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 5fe1db46d8c9..1d4ec9dfdebf 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -46,6 +46,68 @@ enum {
> __NR_USED_SUBPAGE,
> };
>
> +struct hugetlb_pte {
> + pte_t *ptep;
> + unsigned int shift;
> +};
> +
> +static inline
> +void hugetlb_pte_init(struct hugetlb_pte *hpte)
> +{
> + hpte->ptep = NULL;
I agree it does not matter, but would setting hpte->shift = 0 too be better?
> +}
> +
> +static inline
> +void hugetlb_pte_populate(struct hugetlb_pte *hpte, pte_t *ptep,
> + unsigned int shift)
> +{
> + BUG_ON(!ptep);
> + hpte->ptep = ptep;
> + hpte->shift = shift;
> +}
> +
> +static inline
> +unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
> +{
> + BUG_ON(!hpte->ptep);
> + return 1UL << hpte->shift;
> +}
> +
> +static inline
> +unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
> +{
> + BUG_ON(!hpte->ptep);
> + return ~(hugetlb_pte_size(hpte) - 1);
> +}
> +
> +static inline
> +unsigned int hugetlb_pte_shift(const struct hugetlb_pte *hpte)
> +{
> + BUG_ON(!hpte->ptep);
> + return hpte->shift;
> +}
> +
> +static inline
> +bool hugetlb_pte_huge(const struct hugetlb_pte *hpte)
> +{
> + return !IS_ENABLED(CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING) ||
> + hugetlb_pte_shift(hpte) > PAGE_SHIFT;
> +}
> +
> +static inline
> +void hugetlb_pte_copy(struct hugetlb_pte *dest, const struct hugetlb_pte *src)
> +{
> + dest->ptep = src->ptep;
> + dest->shift = src->shift;
> +}
> +
> +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte);
> +bool hugetlb_pte_none(const struct hugetlb_pte *hpte);
> +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte);
> +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte);
> +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> + unsigned long address);
> +
> struct hugepage_subpool {
> spinlock_t lock;
> long count;
> @@ -1130,6 +1192,28 @@ static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
> return ptl;
> }
>
> +static inline
> +spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
> +{
> +
> + BUG_ON(!hpte->ptep);
> + // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
> + // the regular page table lock.
> + if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
> + return huge_pte_lockptr(hugetlb_pte_shift(hpte),
> + mm, hpte->ptep);
> + return &mm->page_table_lock;
> +}
> +
> +static inline
> +spinlock_t *hugetlb_pte_lock(struct mm_struct *mm, struct hugetlb_pte *hpte)
> +{
> + spinlock_t *ptl = hugetlb_pte_lockptr(mm, hpte);
> +
> + spin_lock(ptl);
> + return ptl;
> +}
> +
> #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
> extern void __init hugetlb_cma_reserve(int order);
> extern void __init hugetlb_cma_check(void);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d6d0d4c03def..1a1434e29740 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1120,6 +1120,63 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
> return false;
> }
>
> +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte)
> +{
> + pgd_t pgd;
> + p4d_t p4d;
> + pud_t pud;
> + pmd_t pmd;
> +
> + BUG_ON(!hpte->ptep);
> + if (hugetlb_pte_size(hpte) >= PGDIR_SIZE) {
> + pgd = *(pgd_t *)hpte->ptep;

Sorry, I did not understand why these conditions check
hugetlb_pte_size(hpte) >= PGDIR_SIZE, i.e. why a >= check and not just
an == check?

> + return pgd_present(pgd) && pgd_leaf(pgd);
> + } else if (hugetlb_pte_size(hpte) >= P4D_SIZE) {
> + p4d = *(p4d_t *)hpte->ptep;
> + return p4d_present(p4d) && p4d_leaf(p4d);
> + } else if (hugetlb_pte_size(hpte) >= PUD_SIZE) {
> + pud = *(pud_t *)hpte->ptep;
> + return pud_present(pud) && pud_leaf(pud);
> + } else if (hugetlb_pte_size(hpte) >= PMD_SIZE) {
> + pmd = *(pmd_t *)hpte->ptep;
> + return pmd_present(pmd) && pmd_leaf(pmd);
> + } else if (hugetlb_pte_size(hpte) >= PAGE_SIZE)
> + return pte_present(*hpte->ptep);
> + BUG();
> +}
> +
> +bool hugetlb_pte_none(const struct hugetlb_pte *hpte)
> +{
> + if (hugetlb_pte_huge(hpte))
> + return huge_pte_none(huge_ptep_get(hpte->ptep));
> + return pte_none(ptep_get(hpte->ptep));
> +}
> +
> +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte)
> +{
> + if (hugetlb_pte_huge(hpte))
> + return huge_pte_none_mostly(huge_ptep_get(hpte->ptep));
> + return pte_none_mostly(ptep_get(hpte->ptep));
> +}
> +
> +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte)
> +{
> + if (hugetlb_pte_huge(hpte))
> + return huge_ptep_get(hpte->ptep);
> + return ptep_get(hpte->ptep);
> +}
> +
> +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> + unsigned long address)
> +{
> + BUG_ON(!hpte->ptep);
> + unsigned long sz = hugetlb_pte_size(hpte);
> +
> + if (sz > PAGE_SIZE)
> + return huge_pte_clear(mm, address, hpte->ptep, sz);

Just for consistency, how about something like the above, i.e.:

if (hugetlb_pte_huge(hpte))
	return huge_pte_clear(mm, address, hpte->ptep, sz);

> + return pte_clear(mm, address, hpte->ptep);
> +}
> +
> static void enqueue_huge_page(struct hstate *h, struct page *page)
> {
> int nid = page_to_nid(page);

2022-06-28 20:53:17

by Mina Almasry

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On Fri, Jun 24, 2022 at 10:37 AM James Houghton <[email protected]> wrote:
>
> After high-granularity mapping, page table entries for HugeTLB pages can
> be of any size/type. (For example, we can have a 1G page mapped with a
> mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> PTE after we have done a page table walk.
>
> Without this, we'd have to pass around the "size" of the PTE everywhere.
> We effectively did this before; it could be fetched from the hstate,
> which we pass around pretty much everywhere.
>
> This commit includes definitions for some basic helper functions that
> are used later. These helper functions wrap existing PTE
> inspection/modification functions, where the correct version is picked
> depending on if the HugeTLB PTE is actually "huge" or not. (Previously,
> all HugeTLB PTEs were "huge").
>
> For example, hugetlb_ptep_get wraps huge_ptep_get and ptep_get, where
> ptep_get is used when the HugeTLB PTE is PAGE_SIZE, and huge_ptep_get is
> used in all other cases.
>
> Signed-off-by: James Houghton <[email protected]>
> ---
> include/linux/hugetlb.h | 84 +++++++++++++++++++++++++++++++++++++++++
> mm/hugetlb.c | 57 ++++++++++++++++++++++++++++
> 2 files changed, 141 insertions(+)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 5fe1db46d8c9..1d4ec9dfdebf 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -46,6 +46,68 @@ enum {
> __NR_USED_SUBPAGE,
> };
>
> +struct hugetlb_pte {
> + pte_t *ptep;
> + unsigned int shift;
> +};
> +
> +static inline
> +void hugetlb_pte_init(struct hugetlb_pte *hpte)
> +{
> + hpte->ptep = NULL;

shift = 0; ?

> +}
> +
> +static inline
> +void hugetlb_pte_populate(struct hugetlb_pte *hpte, pte_t *ptep,
> + unsigned int shift)
> +{
> + BUG_ON(!ptep);
> + hpte->ptep = ptep;
> + hpte->shift = shift;
> +}
> +
> +static inline
> +unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
> +{
> + BUG_ON(!hpte->ptep);
> + return 1UL << hpte->shift;
> +}
> +

This helper is quite redundant in my opinion.

> +static inline
> +unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
> +{
> + BUG_ON(!hpte->ptep);
> + return ~(hugetlb_pte_size(hpte) - 1);
> +}
> +
> +static inline
> +unsigned int hugetlb_pte_shift(const struct hugetlb_pte *hpte)
> +{
> + BUG_ON(!hpte->ptep);
> + return hpte->shift;
> +}
> +

This one jumps out as quite redundant too.

> +static inline
> +bool hugetlb_pte_huge(const struct hugetlb_pte *hpte)
> +{
> + return !IS_ENABLED(CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING) ||
> + hugetlb_pte_shift(hpte) > PAGE_SHIFT;
> +}
> +

I'm guessing the !IS_ENABLED() check is because only the HGM code
would store a non-huge pte in a hugetlb_pte struct. I think it's a bit
fragile because anyone can add code in the future that uses
hugetlb_pte in unexpected ways, but I will concede that it is correct
as written.

> +static inline
> +void hugetlb_pte_copy(struct hugetlb_pte *dest, const struct hugetlb_pte *src)
> +{
> + dest->ptep = src->ptep;
> + dest->shift = src->shift;
> +}
> +
> +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte);
> +bool hugetlb_pte_none(const struct hugetlb_pte *hpte);
> +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte);
> +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte);
> +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> + unsigned long address);
> +
> struct hugepage_subpool {
> spinlock_t lock;
> long count;
> @@ -1130,6 +1192,28 @@ static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
> return ptl;
> }
>
> +static inline

Maybe for organization, move all the static functions you're adding
above the hugetlb_pte_* declarations you're adding?

> +spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
> +{
> +
> + BUG_ON(!hpte->ptep);
> + // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
> + // the regular page table lock.

Does checkpatch.pl not complain about // style comments? I think those
are not allowed, no?

> + if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
> + return huge_pte_lockptr(hugetlb_pte_shift(hpte),
> + mm, hpte->ptep);
> + return &mm->page_table_lock;
> +}
> +
> +static inline
> +spinlock_t *hugetlb_pte_lock(struct mm_struct *mm, struct hugetlb_pte *hpte)
> +{
> + spinlock_t *ptl = hugetlb_pte_lockptr(mm, hpte);
> +
> + spin_lock(ptl);
> + return ptl;
> +}
> +
> #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
> extern void __init hugetlb_cma_reserve(int order);
> extern void __init hugetlb_cma_check(void);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d6d0d4c03def..1a1434e29740 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1120,6 +1120,63 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
> return false;
> }
>
> +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte)
> +{
> + pgd_t pgd;
> + p4d_t p4d;
> + pud_t pud;
> + pmd_t pmd;
> +
> + BUG_ON(!hpte->ptep);
> + if (hugetlb_pte_size(hpte) >= PGDIR_SIZE) {
> + pgd = *(pgd_t *)hpte->ptep;
> + return pgd_present(pgd) && pgd_leaf(pgd);
> + } else if (hugetlb_pte_size(hpte) >= P4D_SIZE) {
> + p4d = *(p4d_t *)hpte->ptep;
> + return p4d_present(p4d) && p4d_leaf(p4d);
> + } else if (hugetlb_pte_size(hpte) >= PUD_SIZE) {
> + pud = *(pud_t *)hpte->ptep;
> + return pud_present(pud) && pud_leaf(pud);
> + } else if (hugetlb_pte_size(hpte) >= PMD_SIZE) {
> + pmd = *(pmd_t *)hpte->ptep;
> + return pmd_present(pmd) && pmd_leaf(pmd);
> + } else if (hugetlb_pte_size(hpte) >= PAGE_SIZE)
> + return pte_present(*hpte->ptep);

The use of >= is a bit curious to me. Shouldn't these be ==?

Also it probably doesn't matter, but I was thinking you could use *_SHIFTs
instead of *_SIZE so you don't have to calculate the size 5 times in
this routine, or calculate hugetlb_pte_size() once for a bit less code
duplication and re-use?

> + BUG();
> +}
> +
> +bool hugetlb_pte_none(const struct hugetlb_pte *hpte)
> +{
> + if (hugetlb_pte_huge(hpte))
> + return huge_pte_none(huge_ptep_get(hpte->ptep));
> + return pte_none(ptep_get(hpte->ptep));
> +}
> +
> +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte)
> +{
> + if (hugetlb_pte_huge(hpte))
> + return huge_pte_none_mostly(huge_ptep_get(hpte->ptep));
> + return pte_none_mostly(ptep_get(hpte->ptep));
> +}
> +
> +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte)
> +{
> + if (hugetlb_pte_huge(hpte))
> + return huge_ptep_get(hpte->ptep);
> + return ptep_get(hpte->ptep);
> +}
> +
> +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> + unsigned long address)
> +{
> + BUG_ON(!hpte->ptep);
> + unsigned long sz = hugetlb_pte_size(hpte);
> +
> + if (sz > PAGE_SIZE)
> + return huge_pte_clear(mm, address, hpte->ptep, sz);
> + return pte_clear(mm, address, hpte->ptep);
> +}
> +
> static void enqueue_huge_page(struct hstate *h, struct page *page)
> {
> int nid = page_to_nid(page);
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>

2022-06-28 20:59:37

by Mike Kravetz

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On 06/24/22 17:36, James Houghton wrote:
> After high-granularity mapping, page table entries for HugeTLB pages can
> be of any size/type. (For example, we can have a 1G page mapped with a
> mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> PTE after we have done a page table walk.
>
> Without this, we'd have to pass around the "size" of the PTE everywhere.
> We effectively did this before; it could be fetched from the hstate,
> which we pass around pretty much everywhere.
>
> This commit includes definitions for some basic helper functions that
> are used later. These helper functions wrap existing PTE
> inspection/modification functions, where the correct version is picked
> depending on if the HugeTLB PTE is actually "huge" or not. (Previously,
> all HugeTLB PTEs were "huge").
>
> For example, hugetlb_ptep_get wraps huge_ptep_get and ptep_get, where
> ptep_get is used when the HugeTLB PTE is PAGE_SIZE, and huge_ptep_get is
> used in all other cases.
>
> Signed-off-by: James Houghton <[email protected]>
> ---
> include/linux/hugetlb.h | 84 +++++++++++++++++++++++++++++++++++++++++
> mm/hugetlb.c | 57 ++++++++++++++++++++++++++++
> 2 files changed, 141 insertions(+)

There is nothing 'wrong' with this patch, but it does make me wonder.
After introducing hugetlb_pte, is all code dealing with hugetlb mappings
going to be using hugetlb_ptes? It would be quite confusing if there is
a mix of hugetlb_ptes and non-hugetlb_ptes. This will be revealed later
in the series, but a comment about future direction would be helpful
here.
--
Mike Kravetz

>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 5fe1db46d8c9..1d4ec9dfdebf 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -46,6 +46,68 @@ enum {
> __NR_USED_SUBPAGE,
> };
>
> +struct hugetlb_pte {
> + pte_t *ptep;
> + unsigned int shift;
> +};
> +
> +static inline
> +void hugetlb_pte_init(struct hugetlb_pte *hpte)
> +{
> + hpte->ptep = NULL;
> +}
> +
> +static inline
> +void hugetlb_pte_populate(struct hugetlb_pte *hpte, pte_t *ptep,
> + unsigned int shift)
> +{
> + BUG_ON(!ptep);
> + hpte->ptep = ptep;
> + hpte->shift = shift;
> +}
> +
> +static inline
> +unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
> +{
> + BUG_ON(!hpte->ptep);
> + return 1UL << hpte->shift;
> +}
> +
> +static inline
> +unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
> +{
> + BUG_ON(!hpte->ptep);
> + return ~(hugetlb_pte_size(hpte) - 1);
> +}
> +
> +static inline
> +unsigned int hugetlb_pte_shift(const struct hugetlb_pte *hpte)
> +{
> + BUG_ON(!hpte->ptep);
> + return hpte->shift;
> +}
> +
> +static inline
> +bool hugetlb_pte_huge(const struct hugetlb_pte *hpte)
> +{
> + return !IS_ENABLED(CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING) ||
> + hugetlb_pte_shift(hpte) > PAGE_SHIFT;
> +}
> +
> +static inline
> +void hugetlb_pte_copy(struct hugetlb_pte *dest, const struct hugetlb_pte *src)
> +{
> + dest->ptep = src->ptep;
> + dest->shift = src->shift;
> +}
> +
> +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte);
> +bool hugetlb_pte_none(const struct hugetlb_pte *hpte);
> +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte);
> +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte);
> +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> + unsigned long address);
> +
> struct hugepage_subpool {
> spinlock_t lock;
> long count;
> @@ -1130,6 +1192,28 @@ static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
> return ptl;
> }
>
> +static inline
> +spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
> +{
> +
> + BUG_ON(!hpte->ptep);
> + // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
> + // the regular page table lock.
> + if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
> + return huge_pte_lockptr(hugetlb_pte_shift(hpte),
> + mm, hpte->ptep);
> + return &mm->page_table_lock;
> +}
> +
> +static inline
> +spinlock_t *hugetlb_pte_lock(struct mm_struct *mm, struct hugetlb_pte *hpte)
> +{
> + spinlock_t *ptl = hugetlb_pte_lockptr(mm, hpte);
> +
> + spin_lock(ptl);
> + return ptl;
> +}
> +
> #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
> extern void __init hugetlb_cma_reserve(int order);
> extern void __init hugetlb_cma_check(void);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index d6d0d4c03def..1a1434e29740 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1120,6 +1120,63 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
> return false;
> }
>
> +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte)
> +{
> + pgd_t pgd;
> + p4d_t p4d;
> + pud_t pud;
> + pmd_t pmd;
> +
> + BUG_ON(!hpte->ptep);
> + if (hugetlb_pte_size(hpte) >= PGDIR_SIZE) {
> + pgd = *(pgd_t *)hpte->ptep;
> + return pgd_present(pgd) && pgd_leaf(pgd);
> + } else if (hugetlb_pte_size(hpte) >= P4D_SIZE) {
> + p4d = *(p4d_t *)hpte->ptep;
> + return p4d_present(p4d) && p4d_leaf(p4d);
> + } else if (hugetlb_pte_size(hpte) >= PUD_SIZE) {
> + pud = *(pud_t *)hpte->ptep;
> + return pud_present(pud) && pud_leaf(pud);
> + } else if (hugetlb_pte_size(hpte) >= PMD_SIZE) {
> + pmd = *(pmd_t *)hpte->ptep;
> + return pmd_present(pmd) && pmd_leaf(pmd);
> + } else if (hugetlb_pte_size(hpte) >= PAGE_SIZE)
> + return pte_present(*hpte->ptep);
> + BUG();
> +}
> +
> +bool hugetlb_pte_none(const struct hugetlb_pte *hpte)
> +{
> + if (hugetlb_pte_huge(hpte))
> + return huge_pte_none(huge_ptep_get(hpte->ptep));
> + return pte_none(ptep_get(hpte->ptep));
> +}
> +
> +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte)
> +{
> + if (hugetlb_pte_huge(hpte))
> + return huge_pte_none_mostly(huge_ptep_get(hpte->ptep));
> + return pte_none_mostly(ptep_get(hpte->ptep));
> +}
> +
> +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte)
> +{
> + if (hugetlb_pte_huge(hpte))
> + return huge_ptep_get(hpte->ptep);
> + return ptep_get(hpte->ptep);
> +}
> +
> +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> + unsigned long address)
> +{
> + BUG_ON(!hpte->ptep);
> + unsigned long sz = hugetlb_pte_size(hpte);
> +
> + if (sz > PAGE_SIZE)
> + return huge_pte_clear(mm, address, hpte->ptep, sz);
> + return pte_clear(mm, address, hpte->ptep);
> +}
> +
> static void enqueue_huge_page(struct hstate *h, struct page *page)
> {
> int nid = page_to_nid(page);
> --
> 2.37.0.rc0.161.g10f37bed90-goog
>

2022-06-29 16:43:50

by James Houghton

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On Tue, Jun 28, 2022 at 1:45 PM Mike Kravetz <[email protected]> wrote:
>
> On 06/24/22 17:36, James Houghton wrote:
> > After high-granularity mapping, page table entries for HugeTLB pages can
> > be of any size/type. (For example, we can have a 1G page mapped with a
> > mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> > PTE after we have done a page table walk.
> >
> > Without this, we'd have to pass around the "size" of the PTE everywhere.
> > We effectively did this before; it could be fetched from the hstate,
> > which we pass around pretty much everywhere.
> >
> > This commit includes definitions for some basic helper functions that
> > are used later. These helper functions wrap existing PTE
> > inspection/modification functions, where the correct version is picked
> > depending on if the HugeTLB PTE is actually "huge" or not. (Previously,
> > all HugeTLB PTEs were "huge").
> >
> > For example, hugetlb_ptep_get wraps huge_ptep_get and ptep_get, where
> > ptep_get is used when the HugeTLB PTE is PAGE_SIZE, and huge_ptep_get is
> > used in all other cases.
> >
> > Signed-off-by: James Houghton <[email protected]>
> > ---
> > include/linux/hugetlb.h | 84 +++++++++++++++++++++++++++++++++++++++++
> > mm/hugetlb.c | 57 ++++++++++++++++++++++++++++
> > 2 files changed, 141 insertions(+)
>
> There is nothing 'wrong' with this patch, but it does make me wonder.
> After introducing hugetlb_pte, is all code dealing with hugetlb mappings
> going to be using hugetlb_ptes? It would be quite confusing if there is
> a mix of hugetlb_ptes and non-hugetlb_ptes. This will be revealed later
> in the series, but a comment about future direction would be helpful
> here.

That is indeed the direction I am trying to go -- I'll make sure to
comment on this in this patch. I am planning to replace all other
non-hugetlb_pte uses with hugetlb_pte in the next version of this
series (I see it as necessary to get HGM merged).

> --
> Mike Kravetz
>
> >
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 5fe1db46d8c9..1d4ec9dfdebf 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -46,6 +46,68 @@ enum {
> > __NR_USED_SUBPAGE,
> > };
> >
> > +struct hugetlb_pte {
> > + pte_t *ptep;
> > + unsigned int shift;
> > +};
> > +
> > +static inline
> > +void hugetlb_pte_init(struct hugetlb_pte *hpte)
> > +{
> > + hpte->ptep = NULL;
> > +}
> > +
> > +static inline
> > +void hugetlb_pte_populate(struct hugetlb_pte *hpte, pte_t *ptep,
> > + unsigned int shift)
> > +{
> > + BUG_ON(!ptep);
> > + hpte->ptep = ptep;
> > + hpte->shift = shift;
> > +}
> > +
> > +static inline
> > +unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + return 1UL << hpte->shift;
> > +}
> > +
> > +static inline
> > +unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + return ~(hugetlb_pte_size(hpte) - 1);
> > +}
> > +
> > +static inline
> > +unsigned int hugetlb_pte_shift(const struct hugetlb_pte *hpte)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + return hpte->shift;
> > +}
> > +
> > +static inline
> > +bool hugetlb_pte_huge(const struct hugetlb_pte *hpte)
> > +{
> > + return !IS_ENABLED(CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING) ||
> > + hugetlb_pte_shift(hpte) > PAGE_SHIFT;
> > +}
> > +
> > +static inline
> > +void hugetlb_pte_copy(struct hugetlb_pte *dest, const struct hugetlb_pte *src)
> > +{
> > + dest->ptep = src->ptep;
> > + dest->shift = src->shift;
> > +}
> > +
> > +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte);
> > +bool hugetlb_pte_none(const struct hugetlb_pte *hpte);
> > +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte);
> > +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte);
> > +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> > + unsigned long address);
> > +
> > struct hugepage_subpool {
> > spinlock_t lock;
> > long count;
> > @@ -1130,6 +1192,28 @@ static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
> > return ptl;
> > }
> >
> > +static inline
> > +spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
> > +{
> > +
> > + BUG_ON(!hpte->ptep);
> > + // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
> > + // the regular page table lock.
> > + if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
> > + return huge_pte_lockptr(hugetlb_pte_shift(hpte),
> > + mm, hpte->ptep);
> > + return &mm->page_table_lock;
> > +}
> > +
> > +static inline
> > +spinlock_t *hugetlb_pte_lock(struct mm_struct *mm, struct hugetlb_pte *hpte)
> > +{
> > + spinlock_t *ptl = hugetlb_pte_lockptr(mm, hpte);
> > +
> > + spin_lock(ptl);
> > + return ptl;
> > +}
> > +
> > #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
> > extern void __init hugetlb_cma_reserve(int order);
> > extern void __init hugetlb_cma_check(void);
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index d6d0d4c03def..1a1434e29740 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1120,6 +1120,63 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
> > return false;
> > }
> >
> > +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte)
> > +{
> > + pgd_t pgd;
> > + p4d_t p4d;
> > + pud_t pud;
> > + pmd_t pmd;
> > +
> > + BUG_ON(!hpte->ptep);
> > + if (hugetlb_pte_size(hpte) >= PGDIR_SIZE) {
> > + pgd = *(pgd_t *)hpte->ptep;
> > + return pgd_present(pgd) && pgd_leaf(pgd);
> > + } else if (hugetlb_pte_size(hpte) >= P4D_SIZE) {
> > + p4d = *(p4d_t *)hpte->ptep;
> > + return p4d_present(p4d) && p4d_leaf(p4d);
> > + } else if (hugetlb_pte_size(hpte) >= PUD_SIZE) {
> > + pud = *(pud_t *)hpte->ptep;
> > + return pud_present(pud) && pud_leaf(pud);
> > + } else if (hugetlb_pte_size(hpte) >= PMD_SIZE) {
> > + pmd = *(pmd_t *)hpte->ptep;
> > + return pmd_present(pmd) && pmd_leaf(pmd);
> > + } else if (hugetlb_pte_size(hpte) >= PAGE_SIZE)
> > + return pte_present(*hpte->ptep);
> > + BUG();
> > +}
> > +
> > +bool hugetlb_pte_none(const struct hugetlb_pte *hpte)
> > +{
> > + if (hugetlb_pte_huge(hpte))
> > + return huge_pte_none(huge_ptep_get(hpte->ptep));
> > + return pte_none(ptep_get(hpte->ptep));
> > +}
> > +
> > +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte)
> > +{
> > + if (hugetlb_pte_huge(hpte))
> > + return huge_pte_none_mostly(huge_ptep_get(hpte->ptep));
> > + return pte_none_mostly(ptep_get(hpte->ptep));
> > +}
> > +
> > +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte)
> > +{
> > + if (hugetlb_pte_huge(hpte))
> > + return huge_ptep_get(hpte->ptep);
> > + return ptep_get(hpte->ptep);
> > +}
> > +
> > +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> > + unsigned long address)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + unsigned long sz = hugetlb_pte_size(hpte);
> > +
> > + if (sz > PAGE_SIZE)
> > + return huge_pte_clear(mm, address, hpte->ptep, sz);
> > + return pte_clear(mm, address, hpte->ptep);
> > +}
> > +
> > static void enqueue_huge_page(struct hstate *h, struct page *page)
> > {
> > int nid = page_to_nid(page);
> > --
> > 2.37.0.rc0.161.g10f37bed90-goog
> >

2022-06-29 16:45:47

by James Houghton

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On Tue, Jun 28, 2022 at 1:25 PM Mina Almasry <[email protected]> wrote:
>
> On Fri, Jun 24, 2022 at 10:37 AM James Houghton <[email protected]> wrote:
> >
> > After high-granularity mapping, page table entries for HugeTLB pages can
> > be of any size/type. (For example, we can have a 1G page mapped with a
> > mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> > PTE after we have done a page table walk.
> >
> > Without this, we'd have to pass around the "size" of the PTE everywhere.
> > We effectively did this before; it could be fetched from the hstate,
> > which we pass around pretty much everywhere.
> >
> > This commit includes definitions for some basic helper functions that
> > are used later. These helper functions wrap existing PTE
> > inspection/modification functions, where the correct version is picked
> > depending on if the HugeTLB PTE is actually "huge" or not. (Previously,
> > all HugeTLB PTEs were "huge").
> >
> > For example, hugetlb_ptep_get wraps huge_ptep_get and ptep_get, where
> > ptep_get is used when the HugeTLB PTE is PAGE_SIZE, and huge_ptep_get is
> > used in all other cases.
> >
> > Signed-off-by: James Houghton <[email protected]>
> > ---
> > include/linux/hugetlb.h | 84 +++++++++++++++++++++++++++++++++++++++++
> > mm/hugetlb.c | 57 ++++++++++++++++++++++++++++
> > 2 files changed, 141 insertions(+)
> >
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 5fe1db46d8c9..1d4ec9dfdebf 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -46,6 +46,68 @@ enum {
> > __NR_USED_SUBPAGE,
> > };
> >
> > +struct hugetlb_pte {
> > + pte_t *ptep;
> > + unsigned int shift;
> > +};
> > +
> > +static inline
> > +void hugetlb_pte_init(struct hugetlb_pte *hpte)
> > +{
> > + hpte->ptep = NULL;
>
> shift = 0; ?

I don't think this is necessary (but, admittedly, it is quite harmless
to add). ptep = NULL means that the hugetlb_pte isn't valid, and shift
could be anything. Originally I had a separate `bool valid`, but
ptep=NULL was exactly the same as valid=false.
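
A trivial helper along these lines would make that convention explicit if
we ever want it (just an illustration, not something in this series):

static inline bool hugetlb_pte_valid(const struct hugetlb_pte *hpte)
{
	/* shift is meaningless while ptep is NULL. */
	return hpte->ptep != NULL;
}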

>
> > +}
> > +
> > +static inline
> > +void hugetlb_pte_populate(struct hugetlb_pte *hpte, pte_t *ptep,
> > + unsigned int shift)
> > +{
> > + BUG_ON(!ptep);
> > + hpte->ptep = ptep;
> > + hpte->shift = shift;
> > +}
> > +
> > +static inline
> > +unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + return 1UL << hpte->shift;
> > +}
> > +
>
> This helper is quite redundant in my opinion.

Putting 1UL << hugetlb_pte_shift(hpte) everywhere is kind of annoying. :)

>
> > +static inline
> > +unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + return ~(hugetlb_pte_size(hpte) - 1);
> > +}
> > +
> > +static inline
> > +unsigned int hugetlb_pte_shift(const struct hugetlb_pte *hpte)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + return hpte->shift;
> > +}
> > +
>
> This one jumps out as quite redundant too.

To make sure we aren't using an invalid hugetlb_pte, I want to remove
all places where I directly access hpte->shift -- really they should
all go through hugetlb_pte_shift.

>
> > +static inline
> > +bool hugetlb_pte_huge(const struct hugetlb_pte *hpte)
> > +{
> > + return !IS_ENABLED(CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING) ||
> > + hugetlb_pte_shift(hpte) > PAGE_SHIFT;
> > +}
> > +
>
> I'm guessing the !IS_ENABLED() check is because only the HGM code
> would store a non-huge pte in a hugetlb_pte struct. I think it's a bit
> fragile because anyone can add code in the future that uses
> hugetlb_pte in unexpected ways, but I will concede that it is correct
> as written.

I added this so that, if HGM isn't enabled, the compiler would have an
easier time optimizing things. I don't really have strong feelings
about keeping/removing it.
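
To spell out the intended effect: with
CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING=n, hugetlb_pte_huge() becomes a
compile-time constant true, so a wrapper like

pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte)
{
	if (hugetlb_pte_huge(hpte))	/* constant-folds to true without HGM */
		return huge_ptep_get(hpte->ptep);
	return ptep_get(hpte->ptep);	/* dead code the compiler can drop */
}

should reduce to a plain huge_ptep_get() call.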

>
> > +static inline
> > +void hugetlb_pte_copy(struct hugetlb_pte *dest, const struct hugetlb_pte *src)
> > +{
> > + dest->ptep = src->ptep;
> > + dest->shift = src->shift;
> > +}
> > +
> > +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte);
> > +bool hugetlb_pte_none(const struct hugetlb_pte *hpte);
> > +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte);
> > +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte);
> > +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> > + unsigned long address);
> > +
> > struct hugepage_subpool {
> > spinlock_t lock;
> > long count;
> > @@ -1130,6 +1192,28 @@ static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
> > return ptl;
> > }
> >
> > +static inline
>
> Maybe for organization, move all the static functions you're adding
> above the hugetlb_pte_* declarations you're adding?

Will do.

>
> > +spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
> > +{
> > +
> > + BUG_ON(!hpte->ptep);
> > + // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
> > + // the regular page table lock.
>
> Does checkpatch.pl not complain about // style comments? I think those
> are not allowed, no?

It didn't :( I thought I went through and removed them all -- I guess
I missed some.

>
> > + if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
> > + return huge_pte_lockptr(hugetlb_pte_shift(hpte),
> > + mm, hpte->ptep);
> > + return &mm->page_table_lock;
> > +}
> > +
> > +static inline
> > +spinlock_t *hugetlb_pte_lock(struct mm_struct *mm, struct hugetlb_pte *hpte)
> > +{
> > + spinlock_t *ptl = hugetlb_pte_lockptr(mm, hpte);
> > +
> > + spin_lock(ptl);
> > + return ptl;
> > +}
> > +
> > #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
> > extern void __init hugetlb_cma_reserve(int order);
> > extern void __init hugetlb_cma_check(void);
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index d6d0d4c03def..1a1434e29740 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1120,6 +1120,63 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
> > return false;
> > }
> >
> > +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte)
> > +{
> > + pgd_t pgd;
> > + p4d_t p4d;
> > + pud_t pud;
> > + pmd_t pmd;
> > +
> > + BUG_ON(!hpte->ptep);
> > + if (hugetlb_pte_size(hpte) >= PGDIR_SIZE) {
> > + pgd = *(pgd_t *)hpte->ptep;
> > + return pgd_present(pgd) && pgd_leaf(pgd);
> > + } else if (hugetlb_pte_size(hpte) >= P4D_SIZE) {
> > + p4d = *(p4d_t *)hpte->ptep;
> > + return p4d_present(p4d) && p4d_leaf(p4d);
> > + } else if (hugetlb_pte_size(hpte) >= PUD_SIZE) {
> > + pud = *(pud_t *)hpte->ptep;
> > + return pud_present(pud) && pud_leaf(pud);
> > + } else if (hugetlb_pte_size(hpte) >= PMD_SIZE) {
> > + pmd = *(pmd_t *)hpte->ptep;
> > + return pmd_present(pmd) && pmd_leaf(pmd);
> > + } else if (hugetlb_pte_size(hpte) >= PAGE_SIZE)
> > + return pte_present(*hpte->ptep);
>
> The use of >= is a bit curious to me. Shouldn't these be ==?

These (except PGDIR_SIZE) should be >=. This is because some
architectures support multiple huge PTE sizes at the same page table
level. For example, on arm64, you can have 2M PMDs, and you can also
have 32M PMDs[1].

[1]: https://www.kernel.org/doc/html/latest/arm64/hugetlbpage.html
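
To make the >= concrete with a worked example (assuming a 4K base page):
a 32M contiguous-PMD mapping gives hugetlb_pte_size() == 32M, which is
>= PMD_SIZE (2M) but < PUD_SIZE (1G), so the >= check correctly lands in
the pmd_present()/pmd_leaf() branch. A strict == against PMD_SIZE would
fall all the way through the chain and hit the final BUG().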

>
> Also probably doesn't matter but I was thinking to use *_SHIFTs
> instead of *_SIZE so you don't have to calculate the size 5 times in
> this routine, or calculate hugetlb_pte_size() once for some less code
> duplication and re-use?

I'll change this to use the shift, and I'll move the computation so
it's only done once (it is probably helpful for readability too). (I
imagine the compiler only actually computes the size once here.)
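
A rough sketch of what that rework could look like (just a sketch, not the
actual next version; it also folds in the "== PGDIR_SIZE" change agreed
elsewhere in this thread):

bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte)
{
	unsigned long sz = hugetlb_pte_size(hpte);	/* computed once */

	if (sz == PGDIR_SIZE) {
		pgd_t pgd = *(pgd_t *)hpte->ptep;

		return pgd_present(pgd) && pgd_leaf(pgd);
	} else if (sz >= P4D_SIZE) {
		p4d_t p4d = *(p4d_t *)hpte->ptep;

		return p4d_present(p4d) && p4d_leaf(p4d);
	} else if (sz >= PUD_SIZE) {
		pud_t pud = *(pud_t *)hpte->ptep;

		return pud_present(pud) && pud_leaf(pud);
	} else if (sz >= PMD_SIZE) {
		pmd_t pmd = *(pmd_t *)hpte->ptep;

		return pmd_present(pmd) && pmd_leaf(pmd);
	} else if (sz >= PAGE_SIZE)
		return pte_present(*hpte->ptep);
	BUG();
}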

>
> > + BUG();
> > +}
> > +
> > +bool hugetlb_pte_none(const struct hugetlb_pte *hpte)
> > +{
> > + if (hugetlb_pte_huge(hpte))
> > + return huge_pte_none(huge_ptep_get(hpte->ptep));
> > + return pte_none(ptep_get(hpte->ptep));
> > +}
> > +
> > +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte)
> > +{
> > + if (hugetlb_pte_huge(hpte))
> > + return huge_pte_none_mostly(huge_ptep_get(hpte->ptep));
> > + return pte_none_mostly(ptep_get(hpte->ptep));
> > +}
> > +
> > +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte)
> > +{
> > + if (hugetlb_pte_huge(hpte))
> > + return huge_ptep_get(hpte->ptep);
> > + return ptep_get(hpte->ptep);
> > +}
> > +
> > +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> > + unsigned long address)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + unsigned long sz = hugetlb_pte_size(hpte);
> > +
> > + if (sz > PAGE_SIZE)
> > + return huge_pte_clear(mm, address, hpte->ptep, sz);
> > + return pte_clear(mm, address, hpte->ptep);
> > +}
> > +
> > static void enqueue_huge_page(struct hstate *h, struct page *page)
> > {
> > int nid = page_to_nid(page);
> > --
> > 2.37.0.rc0.161.g10f37bed90-goog
> >

2022-06-29 16:45:56

by James Houghton

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On Mon, Jun 27, 2022 at 5:47 AM manish.mishra <[email protected]> wrote:
>
>
> On 24/06/22 11:06 pm, James Houghton wrote:
> > After high-granularity mapping, page table entries for HugeTLB pages can
> > be of any size/type. (For example, we can have a 1G page mapped with a
> > mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> > PTE after we have done a page table walk.
> >
> > Without this, we'd have to pass around the "size" of the PTE everywhere.
> > We effectively did this before; it could be fetched from the hstate,
> > which we pass around pretty much everywhere.
> >
> > This commit includes definitions for some basic helper functions that
> > are used later. These helper functions wrap existing PTE
> > inspection/modification functions, where the correct version is picked
> > depending on if the HugeTLB PTE is actually "huge" or not. (Previously,
> > all HugeTLB PTEs were "huge").
> >
> > For example, hugetlb_ptep_get wraps huge_ptep_get and ptep_get, where
> > ptep_get is used when the HugeTLB PTE is PAGE_SIZE, and huge_ptep_get is
> > used in all other cases.
> >
> > Signed-off-by: James Houghton <[email protected]>
> > ---
> > include/linux/hugetlb.h | 84 +++++++++++++++++++++++++++++++++++++++++
> > mm/hugetlb.c | 57 ++++++++++++++++++++++++++++
> > 2 files changed, 141 insertions(+)
> >
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index 5fe1db46d8c9..1d4ec9dfdebf 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -46,6 +46,68 @@ enum {
> > __NR_USED_SUBPAGE,
> > };
> >
> > +struct hugetlb_pte {
> > + pte_t *ptep;
> > + unsigned int shift;
> > +};
> > +
> > +static inline
> > +void hugetlb_pte_init(struct hugetlb_pte *hpte)
> > +{
> > + hpte->ptep = NULL;
> I agree it does not matter, but would setting hpte->shift = 0 too be better?
> > +}
> > +
> > +static inline
> > +void hugetlb_pte_populate(struct hugetlb_pte *hpte, pte_t *ptep,
> > + unsigned int shift)
> > +{
> > + BUG_ON(!ptep);
> > + hpte->ptep = ptep;
> > + hpte->shift = shift;
> > +}
> > +
> > +static inline
> > +unsigned long hugetlb_pte_size(const struct hugetlb_pte *hpte)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + return 1UL << hpte->shift;
> > +}
> > +
> > +static inline
> > +unsigned long hugetlb_pte_mask(const struct hugetlb_pte *hpte)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + return ~(hugetlb_pte_size(hpte) - 1);
> > +}
> > +
> > +static inline
> > +unsigned int hugetlb_pte_shift(const struct hugetlb_pte *hpte)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + return hpte->shift;
> > +}
> > +
> > +static inline
> > +bool hugetlb_pte_huge(const struct hugetlb_pte *hpte)
> > +{
> > + return !IS_ENABLED(CONFIG_HUGETLB_HIGH_GRANULARITY_MAPPING) ||
> > + hugetlb_pte_shift(hpte) > PAGE_SHIFT;
> > +}
> > +
> > +static inline
> > +void hugetlb_pte_copy(struct hugetlb_pte *dest, const struct hugetlb_pte *src)
> > +{
> > + dest->ptep = src->ptep;
> > + dest->shift = src->shift;
> > +}
> > +
> > +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte);
> > +bool hugetlb_pte_none(const struct hugetlb_pte *hpte);
> > +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte);
> > +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte);
> > +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> > + unsigned long address);
> > +
> > struct hugepage_subpool {
> > spinlock_t lock;
> > long count;
> > @@ -1130,6 +1192,28 @@ static inline spinlock_t *huge_pte_lock_shift(unsigned int shift,
> > return ptl;
> > }
> >
> > +static inline
> > +spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
> > +{
> > +
> > + BUG_ON(!hpte->ptep);
> > + // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
> > + // the regular page table lock.
> > + if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
> > + return huge_pte_lockptr(hugetlb_pte_shift(hpte),
> > + mm, hpte->ptep);
> > + return &mm->page_table_lock;
> > +}
> > +
> > +static inline
> > +spinlock_t *hugetlb_pte_lock(struct mm_struct *mm, struct hugetlb_pte *hpte)
> > +{
> > + spinlock_t *ptl = hugetlb_pte_lockptr(mm, hpte);
> > +
> > + spin_lock(ptl);
> > + return ptl;
> > +}
> > +
> > #if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
> > extern void __init hugetlb_cma_reserve(int order);
> > extern void __init hugetlb_cma_check(void);
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index d6d0d4c03def..1a1434e29740 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1120,6 +1120,63 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
> > return false;
> > }
> >
> > +bool hugetlb_pte_present_leaf(const struct hugetlb_pte *hpte)
> > +{
> > + pgd_t pgd;
> > + p4d_t p4d;
> > + pud_t pud;
> > + pmd_t pmd;
> > +
> > + BUG_ON(!hpte->ptep);
> > + if (hugetlb_pte_size(hpte) >= PGDIR_SIZE) {
> > + pgd = *(pgd_t *)hpte->ptep;
>
> Sorry, I did not understand why these conditions check
> hugetlb_pte_size(hpte) >= PGDIR_SIZE, i.e. why a >= check and not just
> an == check?

I did >= PGDIR_SIZE just because it was consistent with the rest of
the sizes, but, indeed, > PGDIR_SIZE makes little sense, so I'll
replace it with ==.

>
> > + return pgd_present(pgd) && pgd_leaf(pgd);
> > + } else if (hugetlb_pte_size(hpte) >= P4D_SIZE) {
> > + p4d = *(p4d_t *)hpte->ptep;
> > + return p4d_present(p4d) && p4d_leaf(p4d);
> > + } else if (hugetlb_pte_size(hpte) >= PUD_SIZE) {
> > + pud = *(pud_t *)hpte->ptep;
> > + return pud_present(pud) && pud_leaf(pud);
> > + } else if (hugetlb_pte_size(hpte) >= PMD_SIZE) {
> > + pmd = *(pmd_t *)hpte->ptep;
> > + return pmd_present(pmd) && pmd_leaf(pmd);
> > + } else if (hugetlb_pte_size(hpte) >= PAGE_SIZE)
> > + return pte_present(*hpte->ptep);
> > + BUG();
> > +}
> > +
> > +bool hugetlb_pte_none(const struct hugetlb_pte *hpte)
> > +{
> > + if (hugetlb_pte_huge(hpte))
> > + return huge_pte_none(huge_ptep_get(hpte->ptep));
> > + return pte_none(ptep_get(hpte->ptep));
> > +}
> > +
> > +bool hugetlb_pte_none_mostly(const struct hugetlb_pte *hpte)
> > +{
> > + if (hugetlb_pte_huge(hpte))
> > + return huge_pte_none_mostly(huge_ptep_get(hpte->ptep));
> > + return pte_none_mostly(ptep_get(hpte->ptep));
> > +}
> > +
> > +pte_t hugetlb_ptep_get(const struct hugetlb_pte *hpte)
> > +{
> > + if (hugetlb_pte_huge(hpte))
> > + return huge_ptep_get(hpte->ptep);
> > + return ptep_get(hpte->ptep);
> > +}
> > +
> > +void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
> > + unsigned long address)
> > +{
> > + BUG_ON(!hpte->ptep);
> > + unsigned long sz = hugetlb_pte_size(hpte);
> > +
> > + if (sz > PAGE_SIZE)
> > + return huge_pte_clear(mm, address, hpte->ptep, sz);
>
> Just for consistency, how about something like the above, i.e.:
>
> if (hugetlb_pte_huge(hpte))
> 	return huge_pte_clear(mm, address, hpte->ptep, sz);

Will do, yes. (I added hugetlb_pte_huge quite late, and I guess I
missed updating this spot. :))
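
i.e. presumably something like this (sketch only, not the actual respin):

void hugetlb_pte_clear(struct mm_struct *mm, const struct hugetlb_pte *hpte,
		       unsigned long address)
{
	BUG_ON(!hpte->ptep);

	if (hugetlb_pte_huge(hpte))
		return huge_pte_clear(mm, address, hpte->ptep,
				      hugetlb_pte_size(hpte));
	return pte_clear(mm, address, hpte->ptep);
}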

>
> > + return pte_clear(mm, address, hpte->ptep);
> > +}
> > +
> > static void enqueue_huge_page(struct hstate *h, struct page *page)
> > {
> > int nid = page_to_nid(page);

2022-07-12 00:05:49

by Mike Kravetz

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On 06/24/22 17:36, James Houghton wrote:
> After high-granularity mapping, page table entries for HugeTLB pages can
> be of any size/type. (For example, we can have a 1G page mapped with a
> mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> PTE after we have done a page table walk.

This has been rolling around in my head.

Will this first use case (live migration) actually make use of this
'mixed mapping' model where hugetlb pages could be mapped at the PUD,
PMD and PTE level all within the same vma? I only understand the use
case from a high level. But, it seems that we would only want
to migrate PTE (or PMD) sized pages and not necessarily a mix.

The only reason I ask is because the code might be much simpler if all
mappings within a vma were of the same size. Of course, the
performance/latency of converting a large mapping may be prohibitively
expensive.

Looking to the future when supporting memory error handling/page poisoning
it seems like we would certainly want multiple size mappings.

Just a thought.
--
Mike Kravetz

2022-07-12 09:50:01

by Dr. David Alan Gilbert

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

* Mike Kravetz ([email protected]) wrote:
> On 06/24/22 17:36, James Houghton wrote:
> > After high-granularity mapping, page table entries for HugeTLB pages can
> > be of any size/type. (For example, we can have a 1G page mapped with a
> > mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> > PTE after we have done a page table walk.
>
> This has been rolling around in my head.
>
> Will this first use case (live migration) actually make use of this
> 'mixed mapping' model where hugetlb pages could be mapped at the PUD,
> PMD and PTE level all within the same vma? I only understand the use
> case from a high level. But, it seems that we would only want
> to migrate PTE (or PMD) sized pages and not necessarily a mix.

I suspect we would pick one size and use that size for all transfers
when in postcopy; not sure if there are any side cases though.

> The only reason I ask is because the code might be much simpler if all
> mappings within a vma were of the same size. Of course, the
> performance/latency of converting a large mapping may be prohibitively
> expensive.

Imagine we're migrating a few TB VM, backed by 1GB hugepages, I'm guessing it
would be nice to clean up the PTE/PMDs for split 1GB pages as they're
completed rather than having thousands of them for the whole VM.
(I'm not sure if that is already doable)

Dave

> Looking to the future when supporting memory error handling/page poisoning
> it seems like we would certainly want multiple size mappings.
>
> Just a thought.
> --
> Mike Kravetz
>
--
Dr. David Alan Gilbert / [email protected] / Manchester, UK

2022-07-12 18:04:43

by Mike Kravetz

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On 07/12/22 10:42, Dr. David Alan Gilbert wrote:
> * Mike Kravetz ([email protected]) wrote:
> > On 06/24/22 17:36, James Houghton wrote:
> > > After high-granularity mapping, page table entries for HugeTLB pages can
> > > be of any size/type. (For example, we can have a 1G page mapped with a
> > > mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> > > PTE after we have done a page table walk.
> >
> > This has been rolling around in my head.
> >
> > Will this first use case (live migration) actually make use of this
> > 'mixed mapping' model where hugetlb pages could be mapped at the PUD,
> > PMD and PTE level all within the same vma? I only understand the use
> > case from a high level. But, it seems that we would only want
> > to migrate PTE (or PMD) sized pages and not necessarily a mix.
>
> I suspect we would pick one size and use that size for all transfers
> when in postcopy; not sure if there are any side cases though.
>
> > The only reason I ask is because the code might be much simpler if all
> > mappings within a vma were of the same size. Of course, the
> > performance/latency of converting a large mapping may be prohibitively
> > expensive.
>
> Imagine we're migrating a few TB VM, backed by 1GB hugepages, I'm guessing it
> would be nice to clean up the PTE/PMDs for split 1GB pages as they're
> completed rather than having thousands of them for the whole VM.
> (I'm not sure if that is already doable)

Seems that would be doable by calling MADV_COLLAPSE for 1GB pages as
they are completed.

Thanks for the information on postcopy.
--
Mike Kravetz

2022-07-15 16:45:25

by Peter Xu

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On Tue, Jul 12, 2022 at 10:42:17AM +0100, Dr. David Alan Gilbert wrote:
> * Mike Kravetz ([email protected]) wrote:
> > On 06/24/22 17:36, James Houghton wrote:
> > > After high-granularity mapping, page table entries for HugeTLB pages can
> > > be of any size/type. (For example, we can have a 1G page mapped with a
> > > mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> > > PTE after we have done a page table walk.
> >
> > This has been rolling around in my head.
> >
> > Will this first use case (live migration) actually make use of this
> > 'mixed mapping' model where hugetlb pages could be mapped at the PUD,
> > PMD and PTE level all within the same vma? I only understand the use
> > case from a high level. But, it seems that we would only want
> > to migrate PTE (or PMD) sized pages and not necessarily a mix.
>
> I suspect we would pick one size and use that size for all transfers
> when in postcopy; not sure if there are any side cases though.

Yes, I'm also curious whether the series can be much simplified if we have
a static way to do sub-page mappings, e.g., when sub-page mapping enabled
we always map to PAGE_SIZE only; if not we keep the old hpage size mappings
only.

> > Looking to the future when supporting memory error handling/page poisoning
> > it seems like we would certainly want multiple size mappings.

If we treat page poisoning as very rare events anyway, IMHO it'll even be
acceptable if we always split 1G pages into 4K ones but only rule out the
real poisoned 4K phys page. After all IIUC the major goal is for reducing
poisoned memory footprint.

It'll definitely be nicer if we can keep 511 2M pages and 511 4K pages in
that case so the 511 2M pages perform slightly better, but that'd be
something extra to me. It can always be built on top of a simpler
version of sub-page mapping which is only PAGE_SIZE based.
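
(Spelling out the arithmetic: a 1G page is 512 2M pieces; if a single 4K
page is poisoned, the ideal mapping keeps the other 511 2M pieces mapped
at 2M and maps the affected 2M piece as 511 good 4K pages plus the one
poisoned page, so only 4K of memory is lost rather than the whole 1G.)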

Thanks,

--
Peter Xu

2022-07-15 22:29:07

by Axel Rasmussen

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On Fri, Jul 15, 2022 at 9:35 AM Peter Xu <[email protected]> wrote:
>
> On Tue, Jul 12, 2022 at 10:42:17AM +0100, Dr. David Alan Gilbert wrote:
> > * Mike Kravetz ([email protected]) wrote:
> > > On 06/24/22 17:36, James Houghton wrote:
> > > > After high-granularity mapping, page table entries for HugeTLB pages can
> > > > be of any size/type. (For example, we can have a 1G page mapped with a
> > > > mix of PMDs and PTEs.) This struct is to help keep track of a HugeTLB
> > > > PTE after we have done a page table walk.
> > >
> > > This has been rolling around in my head.
> > >
> > > Will this first use case (live migration) actually make use of this
> > > 'mixed mapping' model where hugetlb pages could be mapped at the PUD,
> > > PMD and PTE level all within the same vma? I only understand the use
> > > case from a high level. But, it seems that we would only want
> > > to migrate PTE (or PMD) sized pages and not necessarily a mix.
> >
> > I suspect we would pick one size and use that size for all transfers
> > when in postcopy; not sure if there are any side cases though.

Sorry for chiming in late. At least from my perspective being able to
do multiple sizes is a nice-to-have optimization.

As talked about above, imagine a guest VM backed by 1G hugetlb pages.
We're going along doing demand paging at 4K; because we want each
request to complete as quickly as possible, we want very small
granularity.

Guest access in terms of "physical" memory address is basically
random. So, actually filling in all 262k 4K PTEs making up a
contiguous 1G region might take quite some time. Once we've completed
any of the various 2M contiguous regions, it would be nice to go ahead
and collapse those right away. The benefit is, the guest will see some
performance benefit from the 2M page already, without having to wait
for the full 1G page to complete. Once we do complete a 1G page, it
would be nice to collapse that one level further. If we do this, the
whole guest memory will be a mix of 1G, 2M, and 4K.

>
> Yes, I'm also curious whether the series can be much simplified if we have
> a static way to do sub-page mappings, e.g., when sub-page mapping enabled
> we always map to PAGE_SIZE only; if not we keep the old hpage size mappings
> only.
>
> > > Looking to the future when supporting memory error handling/page poisoning
> > > it seems like we would certainly want multiple size mappings.
>
> If we treat page poisoning as very rare events anyway, IMHO it'll even be
> acceptable if we always split 1G pages into 4K ones but only rule out the
> real poisoned 4K phys page. After all IIUC the major goal is for reducing
> poisoned memory footprint.
>
> It'll be definitely nicer if we can keep 511 2M pages and 511 4K pages in
> that case so the 511 2M pages performs slightly better, but it'll be
> something extra to me. It can always be something worked upon a simpler
> version of sub-page mapping which is only PAGE_SIZE based.
>
> Thanks,
>
> --
> Peter Xu
>

2022-07-15 23:13:50

by Peter Xu

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On Fri, Jul 15, 2022 at 02:52:27PM -0700, Axel Rasmussen wrote:
> Guest access in terms of "physical" memory address is basically
> random. So, actually filling in all 262k 4K PTEs making up a
> contiguous 1G region might take quite some time. Once we've completed
> any of the various 2M contiguous regions, it would be nice to go ahead
> and collapse those right away. The benefit is, the guest will see some
> performance benefit from the 2G page already, without having to wait
> for the full 1G page to complete. Once we do complete a 1G page, it
> would be nice to collapse that one level further. If we do this, the
> whole guest memory will be a mix of 1G, 2M, and 4K.

Just to mention that we've got quite a few other things that drag perf down
much more than the TLB cost of smaller page sizes during any VM migration process.

For example, when we split & wr-protect pages during the starting phase of
migration on the src host, it's not a 10% or 20% drop but something much more
drastic. In the postcopy case it happens on the dest instead, but it's still
part of the whole migration process and probably visible to the guest too. If
the guest wants, it can simply start writing some pages continuously and it'll
see obvious slowdowns at any point during migration, I bet.

It'll always be nice to have multi-level sub-mappings, and I fully agree.
IMHO it's a matter of whether keeping it 4k-only would greatly simplify the
work, especially the rework of the hugetlb sub-page-aware pgtable ops.

Thanks,

--
Peter Xu

2022-09-08 19:10:34

by Peter Xu

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

James,

On Fri, Jun 24, 2022 at 05:36:37PM +0000, James Houghton wrote:
> +static inline
> +spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
> +{
> +
> + BUG_ON(!hpte->ptep);
> + // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
> + // the regular page table lock.
> + if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
> + return huge_pte_lockptr(hugetlb_pte_shift(hpte),
> + mm, hpte->ptep);
> + return &mm->page_table_lock;
> +}

Today when I re-read part of this thread, I found that I'm not sure whether
this is safe. IIUC taking different locks depending on the state of pte
may lead to issues.

For example, could below race happen where two threads can be taking
different locks even if stumbled over the same pmd entry?

thread 1 thread 2
-------- --------

hugetlb_pte_lockptr (for pmd level)
pte_none()==true,
take pmd lock
pmd_alloc()
hugetlb_pte_lockptr (for pmd level)
pte is pgtable entry (so !none, !present_leaf)
take page_table_lock
(can run concurrently with thread 1...)
pte_alloc()
...

--
Peter Xu

2022-09-08 19:14:43

by James Houghton

Subject: Re: [RFC PATCH 07/26] hugetlb: add hugetlb_pte to track HugeTLB page table entries

On Thu, Sep 8, 2022 at 10:38 AM Peter Xu <[email protected]> wrote:
>
> James,
>
> On Fri, Jun 24, 2022 at 05:36:37PM +0000, James Houghton wrote:
> > +static inline
> > +spinlock_t *hugetlb_pte_lockptr(struct mm_struct *mm, struct hugetlb_pte *hpte)
> > +{
> > +
> > + BUG_ON(!hpte->ptep);
> > + // Only use huge_pte_lockptr if we are at leaf-level. Otherwise use
> > + // the regular page table lock.
> > + if (hugetlb_pte_none(hpte) || hugetlb_pte_present_leaf(hpte))
> > + return huge_pte_lockptr(hugetlb_pte_shift(hpte),
> > + mm, hpte->ptep);
> > + return &mm->page_table_lock;
> > +}
>
> Today when I re-read part of this thread, I found that I'm not sure whether
> this is safe. IIUC taking different locks depending on the state of pte
> may lead to issues.
>
> For example, could below race happen where two threads can be taking
> different locks even if stumbled over the same pmd entry?
>
> thread 1 thread 2
> -------- --------
>
> hugetlb_pte_lockptr (for pmd level)
> pte_none()==true,
> take pmd lock
> pmd_alloc()
> hugetlb_pte_lockptr (for pmd level)
> pte is pgtable entry (so !none, !present_leaf)
> take page_table_lock
> (can run concurrently with thread 1...)
> pte_alloc()
> ...

Thanks for pointing out this race. Yes, it is wrong to change which
lock we take depending on the value of the PTE, as we would need to
lock the PTE first to correctly make the decision. This has already
been fixed in the next version of this series :). That is, we choose
which lock to grab based on the PTE's page table level.

- James

>
> --
> Peter Xu
>