2023-11-23 06:57:48

by Xu Lu

Subject: [RFC PATCH V1 01/11] mm: Fix misused APIs on huge pte

There are some code paths that read the value of a huge pte via the
normal pte API ptep_get() instead of the huge pte API huge_ptep_get().
This commit corrects these misused APIs.
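
For example, on architectures that back one huge mapping with several
contiguous hardware PTEs, a huge_ptep_get() implementation roughly has
to fold the A/D bits of every constituent entry into the returned
value, which a plain ptep_get() of a single entry cannot do. A rough
sketch of that idea (NR_CONT_PTES is an illustrative placeholder, not
an existing symbol):

static inline pte_t sketch_huge_ptep_get(pte_t *ptep)
{
	pte_t pte = ptep_get(ptep);	/* entry 0 */
	int i;

	/* Fold A/D bits from the remaining contiguous entries. */
	for (i = 1; i < NR_CONT_PTES; i++) {
		pte_t sub = ptep_get(ptep + i);

		if (pte_dirty(sub))
			pte = pte_mkdirty(pte);
		if (pte_young(sub))
			pte = pte_mkyoung(pte);
	}
	return pte;
}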

Signed-off-by: Xu Lu <[email protected]>
---
arch/riscv/mm/hugetlbpage.c | 2 +-
fs/proc/task_mmu.c | 2 +-
include/asm-generic/hugetlb.h | 7 +++++++
mm/hugetlb.c | 2 +-
mm/migrate.c | 5 ++++-
mm/mprotect.c | 2 +-
mm/rmap.c | 10 ++++++++--
mm/vmalloc.c | 3 ++-
8 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index b52f0210481f..d7cf8e2d3c5b 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -74,7 +74,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,

out:
if (pte) {
- pte_t pteval = ptep_get_lockless(pte);
+ pte_t pteval = huge_ptep_get_lockless(pte);

WARN_ON_ONCE(pte_present(pteval) && !pte_huge(pteval));
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ef2eb12906da..0fe9d23aa062 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -726,7 +726,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
struct page *page = NULL;
- pte_t ptent = ptep_get(pte);
+ pte_t ptent = huge_ptep_get(pte);

if (pte_present(ptent)) {
page = vm_normal_page(vma, addr, ptent);
diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
index 6dcf4d576970..52c299db971a 100644
--- a/include/asm-generic/hugetlb.h
+++ b/include/asm-generic/hugetlb.h
@@ -150,6 +150,13 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
}
#endif

+#ifndef __HAVE_ARCH_HUGE_PTEP_GET_LOCKLESS
+static inline pte_t huge_ptep_get_lockless(pte_t *ptep)
+{
+ return huge_ptep_get(ptep);
+}
+#endif
+
#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
static inline bool gigantic_page_runtime_supported(void)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1169ef2f2176..9f773eb95b3b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7406,7 +7406,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
}

if (pte) {
- pte_t pteval = ptep_get_lockless(pte);
+ pte_t pteval = huge_ptep_get_lockless(pte);

BUG_ON(pte_present(pteval) && !pte_huge(pteval));
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 35a88334bb3c..d0daf58e486e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -210,7 +210,10 @@ static bool remove_migration_pte(struct folio *folio,

folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
- old_pte = ptep_get(pvmw.pte);
+ if (folio_test_hugetlb(folio))
+ old_pte = huge_ptep_get(pvmw.pte);
+ else
+ old_pte = ptep_get(pvmw.pte);
if (pte_swp_soft_dirty(old_pte))
pte = pte_mksoft_dirty(pte);

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 81991102f785..b9129c03f451 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -555,7 +555,7 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long next,
struct mm_walk *walk)
{
- return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
+ return pfn_modify_allowed(pte_pfn(huge_ptep_get(pte)),
*(pgprot_t *)(walk->private)) ?
0 : -EACCES;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 7a27a2b41802..d93c6dabbdf4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1577,7 +1577,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
break;
}

- pfn = pte_pfn(ptep_get(pvmw.pte));
+ if (folio_test_hugetlb(folio))
+ pfn = pte_pfn(huge_ptep_get(pvmw.pte));
+ else
+ pfn = pte_pfn(ptep_get(pvmw.pte));
subpage = folio_page(folio, pfn - folio_pfn(folio));
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
@@ -1931,7 +1934,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);

- pfn = pte_pfn(ptep_get(pvmw.pte));
+ if (folio_test_hugetlb(folio))
+ pfn = pte_pfn(huge_ptep_get(pvmw.pte));
+ else
+ pfn = pte_pfn(ptep_get(pvmw.pte));

if (folio_is_zone_device(folio)) {
/*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..1a451b82a7ac 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -103,7 +103,6 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
do {
- BUG_ON(!pte_none(ptep_get(pte)));

#ifdef CONFIG_HUGETLB_PAGE
size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
@@ -111,11 +110,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t entry = pfn_pte(pfn, prot);

entry = arch_make_huge_pte(entry, ilog2(size), 0);
+ BUG_ON(!pte_none(huge_ptep_get(pte)));
set_huge_pte_at(&init_mm, addr, pte, entry, size);
pfn += PFN_DOWN(size);
continue;
}
#endif
+ BUG_ON(!pte_none(ptep_get(pte)));
set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
pfn++;
} while (pte += PFN_DOWN(size), addr += size, addr != end);
--
2.20.1


2023-12-31 16:40:01

by Alexandre Ghiti

Subject: Re: [RFC PATCH V1 01/11] mm: Fix misused APIs on huge pte

Hi Xu,

On 23/11/2023 07:56, Xu Lu wrote:
> There are some code paths that read the value of a huge pte via the
> normal pte API ptep_get() instead of the huge pte API huge_ptep_get().
> This commit corrects these misused APIs.
>
> Signed-off-by: Xu Lu <[email protected]>
> ---
> arch/riscv/mm/hugetlbpage.c | 2 +-
> fs/proc/task_mmu.c | 2 +-
> include/asm-generic/hugetlb.h | 7 +++++++
> mm/hugetlb.c | 2 +-
> mm/migrate.c | 5 ++++-
> mm/mprotect.c | 2 +-
> mm/rmap.c | 10 ++++++++--
> mm/vmalloc.c | 3 ++-
> 8 files changed, 25 insertions(+), 8 deletions(-)
>
> diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
> index b52f0210481f..d7cf8e2d3c5b 100644
> --- a/arch/riscv/mm/hugetlbpage.c
> +++ b/arch/riscv/mm/hugetlbpage.c
> @@ -74,7 +74,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
>
> out:
> if (pte) {
> - pte_t pteval = ptep_get_lockless(pte);
> + pte_t pteval = huge_ptep_get_lockless(pte);
>
> WARN_ON_ONCE(pte_present(pteval) && !pte_huge(pteval));
> }


Looking only at riscv for this patch, this change does not look
necessary, as the pte value is only used to check whether the pte is
huge or not; it does not use the A/D bits.

Thanks,

Alex


> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index ef2eb12906da..0fe9d23aa062 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -726,7 +726,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
> struct mem_size_stats *mss = walk->private;
> struct vm_area_struct *vma = walk->vma;
> struct page *page = NULL;
> - pte_t ptent = ptep_get(pte);
> + pte_t ptent = huge_ptep_get(pte);
>
> if (pte_present(ptent)) {
> page = vm_normal_page(vma, addr, ptent);
> diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
> index 6dcf4d576970..52c299db971a 100644
> --- a/include/asm-generic/hugetlb.h
> +++ b/include/asm-generic/hugetlb.h
> @@ -150,6 +150,13 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
> }
> #endif
>
> +#ifndef __HAVE_ARCH_HUGE_PTEP_GET_LOCKLESS
> +static inline pte_t huge_ptep_get_lockless(pte_t *ptep)
> +{
> + return huge_ptep_get(ptep);
> +}
> +#endif
> +
> #ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
> static inline bool gigantic_page_runtime_supported(void)
> {
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 1169ef2f2176..9f773eb95b3b 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -7406,7 +7406,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> }
>
> if (pte) {
> - pte_t pteval = ptep_get_lockless(pte);
> + pte_t pteval = huge_ptep_get_lockless(pte);
>
> BUG_ON(pte_present(pteval) && !pte_huge(pteval));
> }
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 35a88334bb3c..d0daf58e486e 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -210,7 +210,10 @@ static bool remove_migration_pte(struct folio *folio,
>
> folio_get(folio);
> pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
> - old_pte = ptep_get(pvmw.pte);
> + if (folio_test_hugetlb(folio))
> + old_pte = huge_ptep_get(pvmw.pte);
> + else
> + old_pte = ptep_get(pvmw.pte);
> if (pte_swp_soft_dirty(old_pte))
> pte = pte_mksoft_dirty(pte);
>
> diff --git a/mm/mprotect.c b/mm/mprotect.c
> index 81991102f785..b9129c03f451 100644
> --- a/mm/mprotect.c
> +++ b/mm/mprotect.c
> @@ -555,7 +555,7 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
> unsigned long addr, unsigned long next,
> struct mm_walk *walk)
> {
> - return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
> + return pfn_modify_allowed(pte_pfn(huge_ptep_get(pte)),
> *(pgprot_t *)(walk->private)) ?
> 0 : -EACCES;
> }
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 7a27a2b41802..d93c6dabbdf4 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1577,7 +1577,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> break;
> }
>
> - pfn = pte_pfn(ptep_get(pvmw.pte));
> + if (folio_test_hugetlb(folio))
> + pfn = pte_pfn(huge_ptep_get(pvmw.pte));
> + else
> + pfn = pte_pfn(ptep_get(pvmw.pte));
> subpage = folio_page(folio, pfn - folio_pfn(folio));
> address = pvmw.address;
> anon_exclusive = folio_test_anon(folio) &&
> @@ -1931,7 +1934,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
> /* Unexpected PMD-mapped THP? */
> VM_BUG_ON_FOLIO(!pvmw.pte, folio);
>
> - pfn = pte_pfn(ptep_get(pvmw.pte));
> + if (folio_test_hugetlb(folio))
> + pfn = pte_pfn(huge_ptep_get(pvmw.pte));
> + else
> + pfn = pte_pfn(ptep_get(pvmw.pte));
>
> if (folio_is_zone_device(folio)) {
> /*
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d12a17fc0c17..1a451b82a7ac 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -103,7 +103,6 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> if (!pte)
> return -ENOMEM;
> do {
> - BUG_ON(!pte_none(ptep_get(pte)));
>
> #ifdef CONFIG_HUGETLB_PAGE
> size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
> @@ -111,11 +110,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> pte_t entry = pfn_pte(pfn, prot);
>
> entry = arch_make_huge_pte(entry, ilog2(size), 0);
> + BUG_ON(!pte_none(huge_ptep_get(pte)));
> set_huge_pte_at(&init_mm, addr, pte, entry, size);
> pfn += PFN_DOWN(size);
> continue;
> }
> #endif
> + BUG_ON(!pte_none(ptep_get(pte)));
> set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
> pfn++;
> } while (pte += PFN_DOWN(size), addr += size, addr != end);

2024-01-04 08:00:15

by Xu Lu

Subject: Re: [External] Re: [RFC PATCH V1 01/11] mm: Fix misused APIs on huge pte

Hi Alexandre,

Thanks for your feedback!

On Mon, Jan 1, 2024 at 12:40 AM Alexandre Ghiti <[email protected]> wrote:
>
> Hi Xu,
>
> On 23/11/2023 07:56, Xu Lu wrote:
> > There are some code paths that read the value of a huge pte via the
> > normal pte API ptep_get() instead of the huge pte API huge_ptep_get().
> > This commit corrects these misused APIs.
> >
> > Signed-off-by: Xu Lu <[email protected]>
> > ---
> > arch/riscv/mm/hugetlbpage.c | 2 +-
> > fs/proc/task_mmu.c | 2 +-
> > include/asm-generic/hugetlb.h | 7 +++++++
> > mm/hugetlb.c | 2 +-
> > mm/migrate.c | 5 ++++-
> > mm/mprotect.c | 2 +-
> > mm/rmap.c | 10 ++++++++--
> > mm/vmalloc.c | 3 ++-
> > 8 files changed, 25 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
> > index b52f0210481f..d7cf8e2d3c5b 100644
> > --- a/arch/riscv/mm/hugetlbpage.c
> > +++ b/arch/riscv/mm/hugetlbpage.c
> > @@ -74,7 +74,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
> >
> > out:
> > if (pte) {
> > - pte_t pteval = ptep_get_lockless(pte);
> > + pte_t pteval = huge_ptep_get_lockless(pte);
> >
> > WARN_ON_ONCE(pte_present(pteval) && !pte_huge(pteval));
> > }
>
>
> Looking only at riscv for this patch, this change does not look
> necessary, as the pte value is only used to check whether the pte is
> huge or not; it does not use the A/D bits.
>
> Thanks,
>
> Alex
>

Your observation is very accurate. The pte value here does not use the A/D bits.

Actually, the modification here is meant to match the pte type with
the pte accessor functions. In this patch version,
huge_ptep_get_lockless() and ptep_get_lockless() do not traverse the
array of hardware PTEs inside pte_t to collect the A/D bits. Instead,
to ensure atomicity, they just read the value of the first pte element
and use it to construct the whole pte_t struct, so they may lose A/D
bits. In our opinion, anyone who wants to check the A/D bits to decide
on subsequent page flushing or page swapping should take the page
table lock and use huge_ptep_get()/ptep_get() instead.
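
To make that concrete, a minimal sketch of the lockless read semantics
described above (illustrative only, not the actual implementation in
this series; it assumes a layout where pte_t wraps an array of
hardware PTEs and the logical pte is rebuilt from entry 0 alone, which
is why A/D bits set on the other entries can be missed):

static inline pte_t sketch_huge_ptep_get_lockless(pte_t *ptep)
{
	/* Atomic read of entry 0; the other entries are not examined. */
	unsigned long first = READ_ONCE(*(unsigned long *)ptep);

	return __pte(first);
}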

Regards,

Xu

>
> > diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> > index ef2eb12906da..0fe9d23aa062 100644
> > --- a/fs/proc/task_mmu.c
> > +++ b/fs/proc/task_mmu.c
> > @@ -726,7 +726,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
> > struct mem_size_stats *mss = walk->private;
> > struct vm_area_struct *vma = walk->vma;
> > struct page *page = NULL;
> > - pte_t ptent = ptep_get(pte);
> > + pte_t ptent = huge_ptep_get(pte);
> >
> > if (pte_present(ptent)) {
> > page = vm_normal_page(vma, addr, ptent);
> > diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h
> > index 6dcf4d576970..52c299db971a 100644
> > --- a/include/asm-generic/hugetlb.h
> > +++ b/include/asm-generic/hugetlb.h
> > @@ -150,6 +150,13 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
> > }
> > #endif
> >
> > +#ifndef __HAVE_ARCH_HUGE_PTEP_GET_LOCKLESS
> > +static inline pte_t huge_ptep_get_lockless(pte_t *ptep)
> > +{
> > + return huge_ptep_get(ptep);
> > +}
> > +#endif
> > +
> > #ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
> > static inline bool gigantic_page_runtime_supported(void)
> > {
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 1169ef2f2176..9f773eb95b3b 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -7406,7 +7406,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> > }
> >
> > if (pte) {
> > - pte_t pteval = ptep_get_lockless(pte);
> > + pte_t pteval = huge_ptep_get_lockless(pte);
> >
> > BUG_ON(pte_present(pteval) && !pte_huge(pteval));
> > }
> > diff --git a/mm/migrate.c b/mm/migrate.c
> > index 35a88334bb3c..d0daf58e486e 100644
> > --- a/mm/migrate.c
> > +++ b/mm/migrate.c
> > @@ -210,7 +210,10 @@ static bool remove_migration_pte(struct folio *folio,
> >
> > folio_get(folio);
> > pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
> > - old_pte = ptep_get(pvmw.pte);
> > + if (folio_test_hugetlb(folio))
> > + old_pte = huge_ptep_get(pvmw.pte);
> > + else
> > + old_pte = ptep_get(pvmw.pte);
> > if (pte_swp_soft_dirty(old_pte))
> > pte = pte_mksoft_dirty(pte);
> >
> > diff --git a/mm/mprotect.c b/mm/mprotect.c
> > index 81991102f785..b9129c03f451 100644
> > --- a/mm/mprotect.c
> > +++ b/mm/mprotect.c
> > @@ -555,7 +555,7 @@ static int prot_none_hugetlb_entry(pte_t *pte, unsigned long hmask,
> > unsigned long addr, unsigned long next,
> > struct mm_walk *walk)
> > {
> > - return pfn_modify_allowed(pte_pfn(ptep_get(pte)),
> > + return pfn_modify_allowed(pte_pfn(huge_ptep_get(pte)),
> > *(pgprot_t *)(walk->private)) ?
> > 0 : -EACCES;
> > }
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index 7a27a2b41802..d93c6dabbdf4 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -1577,7 +1577,10 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
> > break;
> > }
> >
> > - pfn = pte_pfn(ptep_get(pvmw.pte));
> > + if (folio_test_hugetlb(folio))
> > + pfn = pte_pfn(huge_ptep_get(pvmw.pte));
> > + else
> > + pfn = pte_pfn(ptep_get(pvmw.pte));
> > subpage = folio_page(folio, pfn - folio_pfn(folio));
> > address = pvmw.address;
> > anon_exclusive = folio_test_anon(folio) &&
> > @@ -1931,7 +1934,10 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
> > /* Unexpected PMD-mapped THP? */
> > VM_BUG_ON_FOLIO(!pvmw.pte, folio);
> >
> > - pfn = pte_pfn(ptep_get(pvmw.pte));
> > + if (folio_test_hugetlb(folio))
> > + pfn = pte_pfn(huge_ptep_get(pvmw.pte));
> > + else
> > + pfn = pte_pfn(ptep_get(pvmw.pte));
> >
> > if (folio_is_zone_device(folio)) {
> > /*
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index d12a17fc0c17..1a451b82a7ac 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -103,7 +103,6 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> > if (!pte)
> > return -ENOMEM;
> > do {
> > - BUG_ON(!pte_none(ptep_get(pte)));
> >
> > #ifdef CONFIG_HUGETLB_PAGE
> > size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
> > @@ -111,11 +110,13 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
> > pte_t entry = pfn_pte(pfn, prot);
> >
> > entry = arch_make_huge_pte(entry, ilog2(size), 0);
> > + BUG_ON(!pte_none(huge_ptep_get(pte)));
> > set_huge_pte_at(&init_mm, addr, pte, entry, size);
> > pfn += PFN_DOWN(size);
> > continue;
> > }
> > #endif
> > + BUG_ON(!pte_none(ptep_get(pte)));
> > set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
> > pfn++;
> > } while (pte += PFN_DOWN(size), addr += size, addr != end);