This patchset intends to merge the contiguous ptes hugetlbfs implementation
of arm64 and riscv.
Both arm64 and riscv support the use of contiguous ptes to map pages that
are larger than the base page size; this feature is called contpte on arm64
and svnapot on riscv.
The riscv implementation differs from the arm64's in that the LSBs of the
pfn of a svnapot pte are used to store the size of the mapping, allowing
for future sizes to be added (for now only 64KB is supported). That's an
issue for the core mm code which expects to find the *real* pfn a pte points
to. Patch 1 fixes that by always returning svnapot ptes with the real pfn
and restoring the size of the mapping when it is written to a page table.
The following patches are just merges of the 2 different implementations
that currently exist in arm64 and riscv, which are very similar. This paves
the way for reusing the recent contpte THP work by Ryan [1], to avoid
reimplementing the same in riscv.
This patchset was tested by running the libhugetlbfs testsuite with 64KB
and 2MB pages on both architectures (on a 4KB base page size arm64 kernel).
[1] https://lore.kernel.org/linux-arm-kernel/[email protected]/
Changes in v2:
- Rebase on top of 6.9-rc3
Alexandre Ghiti (9):
riscv: Restore the pfn in a NAPOT pte when manipulated by core mm code
riscv: Safely remove huge_pte_offset() when manipulating NAPOT ptes
mm: Use common huge_ptep_get() function for riscv/arm64
mm: Use common set_huge_pte_at() function for riscv/arm64
mm: Use common huge_pte_clear() function for riscv/arm64
mm: Use common huge_ptep_get_and_clear() function for riscv/arm64
mm: Use common huge_ptep_set_access_flags() function for riscv/arm64
mm: Use common huge_ptep_set_wrprotect() function for riscv/arm64
mm: Use common huge_ptep_clear_flush() function for riscv/arm64
arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/pgtable.h | 56 +++++-
arch/arm64/mm/hugetlbpage.c | 291 +---------------------------
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/hugetlb.h | 2 +-
arch/riscv/include/asm/pgtable-64.h | 11 ++
arch/riscv/include/asm/pgtable.h | 153 +++++++++++++--
arch/riscv/mm/hugetlbpage.c | 227 ----------------------
arch/riscv/mm/pgtable.c | 6 +-
mm/Kconfig | 3 +
mm/Makefile | 1 +
mm/contpte.c | 272 ++++++++++++++++++++++++++
12 files changed, 480 insertions(+), 544 deletions(-)
create mode 100644 mm/contpte.c
--
2.39.2
Both architectures have the same implementation so move it to generic code.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/hugetlbpage.c | 12 ------------
arch/riscv/include/asm/pgtable.h | 5 +++--
arch/riscv/mm/hugetlbpage.c | 19 -------------------
mm/contpte.c | 14 ++++++++++++++
4 files changed, 17 insertions(+), 33 deletions(-)
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index b8353b0a273c..cf44837369be 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -277,18 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned long sz)
-{
- int i, ncontig;
- size_t pgsize;
-
- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
-
- for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- __pte_clear(mm, addr, ptep);
-}
-
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 5d1d3a6c7c44..0847a7fb8661 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -644,8 +644,8 @@ static inline void __set_ptes(struct mm_struct *mm, unsigned long addr,
#define set_contptes(mm, addr, ptep, pte, nr, pgsize) \
__set_ptes(mm, addr, ptep, pte, nr)
-static inline void pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
+static inline void __pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
__set_pte_at(mm, ptep, __pte(0));
}
@@ -700,6 +700,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define set_ptes __set_ptes
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define ptep_get_and_clear __ptep_get_and_clear
+#define pte_clear __pte_clear
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index d8f07aef758b..437b1df059eb 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -254,25 +254,6 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
}
-void huge_pte_clear(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long sz)
-{
- size_t pgsize;
- pte_t pte = ptep_get(ptep);
- int i, pte_num;
-
- if (!pte_napot(pte)) {
- pte_clear(mm, addr, ptep);
- return;
- }
-
- pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
- for (i = 0; i < pte_num; i++, addr += pgsize, ptep++)
- pte_clear(mm, addr, ptep);
-}
-
static bool is_napot_size(unsigned long size)
{
unsigned long order;
diff --git a/mm/contpte.c b/mm/contpte.c
index 2320ee23478a..22e0de197bd3 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -12,6 +12,7 @@
* - __ptep_get()
* - __set_ptes()
* - __ptep_get_and_clear()
+ * - __pte_clear()
* - pte_cont()
* - arch_contpte_get_num_contig()
*/
@@ -20,6 +21,7 @@
* This file implements the following contpte aware API:
* - huge_ptep_get()
* - set_huge_pte_at()
+ * - huge_pte_clear()
*/
pte_t huge_ptep_get(pte_t *ptep)
@@ -102,3 +104,15 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
}
+
+void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned long sz)
+{
+ int i, ncontig;
+ size_t pgsize;
+
+ ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+
+ for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
+ __pte_clear(mm, addr, ptep);
+}
--
2.39.2
After some adjustments, both architectures have the same implementation
so move it to the generic code.
Note that the get_clear_contig() function is duplicated in the generic and
the arm64 code because it is still used by some arm64 functions that
will, in the next commits, be moved to the generic code. Once all of them
have been moved, the arm64 version will be removed.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 14 +++++++++-
arch/arm64/mm/hugetlbpage.c | 19 ++-----------
arch/riscv/include/asm/pgtable.h | 4 ++-
arch/riscv/mm/hugetlbpage.c | 21 ++------------
mm/contpte.c | 48 ++++++++++++++++++++++++++++++--
5 files changed, 66 insertions(+), 40 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 2e0415fd5083..7c2938cb70b9 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1644,11 +1644,23 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
#endif /* CONFIG_ARM64_CONTPTE */
-static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+int find_num_contig(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, size_t *pgsize);
+
+static inline int arch_contpte_get_num_contig(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, unsigned long size,
size_t *pgsize)
{
int contig_ptes = 0;
+ /*
+ * If the size is not passed, we need to go through the page table to
+ * find out the number of contiguous ptes.
+ */
+ if (size == 0)
+ return find_num_contig(mm, addr, ptep, pgsize);
+
*pgsize = size;
switch (size) {
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index cf44837369be..5ace4bf7ce35 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -93,8 +93,8 @@ int pud_huge(pud_t pud)
#endif
}
-static int find_num_contig(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, size_t *pgsize)
+int find_num_contig(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, size_t *pgsize)
{
pgd_t *pgdp = pgd_offset(mm, addr);
p4d_t *p4dp;
@@ -277,21 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- int ncontig;
- size_t pgsize;
- pte_t orig_pte = __ptep_get(ptep);
-
- if (!pte_cont(orig_pte))
- return __ptep_get_and_clear(mm, addr, ptep);
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
-
- return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
-}
-
/*
* huge_ptep_set_access_flags will update access flags (dirty, accesssed)
* and write permission.
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 0847a7fb8661..d976113a370d 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -532,7 +532,9 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT
#ifdef CONFIG_RISCV_ISA_SVNAPOT
-static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+static inline int arch_contpte_get_num_contig(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, unsigned long size,
size_t *pgsize)
{
unsigned long hugepage_shift;
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 437b1df059eb..a757e0b2f090 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -187,7 +187,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
if (!pte_napot(pte))
return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
- pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, &pgsize);
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
@@ -202,21 +202,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return true;
}
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep)
-{
- pte_t orig_pte = ptep_get(ptep);
- int pte_num;
-
- if (!pte_napot(orig_pte))
- return ptep_get_and_clear(mm, addr, ptep);
-
- pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
-
- return get_clear_contig(mm, addr, ptep, pte_num);
-}
-
void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr,
pte_t *ptep)
@@ -231,7 +216,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
return;
}
- pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ pte_num = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
orig_pte = pte_wrprotect(orig_pte);
@@ -249,7 +234,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
if (!pte_napot(pte))
return ptep_clear_flush(vma, addr, ptep);
- pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
+ pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, NULL);
return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
}
diff --git a/mm/contpte.c b/mm/contpte.c
index 22e0de197bd3..68eb1634b922 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -22,6 +22,7 @@
* - huge_ptep_get()
* - set_huge_pte_at()
* - huge_pte_clear()
+ * - huge_ptep_get_and_clear()
*/
pte_t huge_ptep_get(pte_t *ptep)
@@ -33,7 +34,7 @@ pte_t huge_ptep_get(pte_t *ptep)
if (!pte_present(orig_pte) || !pte_cont(orig_pte))
return orig_pte;
- ncontig = arch_contpte_get_num_contig(ptep,
+ ncontig = arch_contpte_get_num_contig(NULL, 0, ptep,
page_size(pte_page(orig_pte)),
&pgsize);
@@ -87,7 +88,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
int i;
int ncontig;
- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, sz, &pgsize);
if (!pte_present(pte)) {
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
@@ -111,8 +112,49 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
int i, ncontig;
size_t pgsize;
- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, sz, &pgsize);
for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
__pte_clear(mm, addr, ptep);
}
+
+static pte_t get_clear_contig(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep,
+ unsigned long pgsize,
+ unsigned long ncontig)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+ unsigned long i;
+
+ for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
+ pte_t pte = __ptep_get_and_clear(mm, addr, ptep);
+
+ /*
+ * If HW_AFDBM (arm64) or svadu (riscv) is enabled, then the HW
+ * could turn on the dirty or accessed bit for any page in the
+ * set, so check them all.
+ */
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+ return orig_pte;
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ int ncontig;
+ size_t pgsize;
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (!pte_cont(orig_pte))
+ return __ptep_get_and_clear(mm, addr, ptep);
+
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
+
+ return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
+}
--
2.39.2
I should not have unplugged this outlet... Sorry, please ignore this series;
I'm sending a RESEND right away.
On 08/05/2024 13:18, Alexandre Ghiti wrote:
> This patchset intends to merge the contiguous ptes hugetlbfs implementation
> of arm64 and riscv.
>
> Both arm64 and riscv support the use of contiguous ptes to map pages that
> are larger than the default page table size, respectively called contpte
> and svnapot.
>
> The riscv implementation differs from the arm64's in that the LSBs of the
> pfn of a svnapot pte are used to store the size of the mapping, allowing
> for future sizes to be added (for now only 64KB is supported). That's an
> issue for the core mm code which expects to find the *real* pfn a pte points
> to. Patch 1 fixes that by always returning svnapot ptes with the real pfn
> and restores the size of the mapping when it is written to a page table.
>
> The following patches are just merges of the 2 different implementations
> that currently exist in arm64 and riscv which are very similar. It paves
> the way to the reuse of the recent contpte THP work by Ryan [1] to avoid
> reimplementing the same in riscv.
>
> This patchset was tested by running the libhugetlbfs testsuite with 64KB
> and 2MB pages on both architectures (on a 4KB base page size arm64 kernel).
>
> [1] https://lore.kernel.org/linux-arm-kernel/[email protected]/
>
> Changes in v2:
> - Rebase on top of 6.9-rc3
>
> Alexandre Ghiti (9):
> riscv: Restore the pfn in a NAPOT pte when manipulated by core mm code
> riscv: Safely remove huge_pte_offset() when manipulating NAPOT ptes
> mm: Use common huge_ptep_get() function for riscv/arm64
> mm: Use common set_huge_pte_at() function for riscv/arm64
> mm: Use common huge_pte_clear() function for riscv/arm64
> mm: Use common huge_ptep_get_and_clear() function for riscv/arm64
> mm: Use common huge_ptep_set_access_flags() function for riscv/arm64
> mm: Use common huge_ptep_set_wrprotect() function for riscv/arm64
> mm: Use common huge_ptep_clear_flush() function for riscv/arm64
>
> arch/arm64/Kconfig | 1 +
> arch/arm64/include/asm/pgtable.h | 56 +++++-
> arch/arm64/mm/hugetlbpage.c | 291 +---------------------------
> arch/riscv/Kconfig | 1 +
> arch/riscv/include/asm/hugetlb.h | 2 +-
> arch/riscv/include/asm/pgtable-64.h | 11 ++
> arch/riscv/include/asm/pgtable.h | 153 +++++++++++++--
> arch/riscv/mm/hugetlbpage.c | 227 ----------------------
> arch/riscv/mm/pgtable.c | 6 +-
> mm/Kconfig | 3 +
> mm/Makefile | 1 +
> mm/contpte.c | 272 ++++++++++++++++++++++++++
> 12 files changed, 480 insertions(+), 544 deletions(-)
> create mode 100644 mm/contpte.c
>
On Wed, 8 May 2024 13:18:20 +0200 Alexandre Ghiti <[email protected]> wrote:
> This patchset intends to merge the contiguous ptes hugetlbfs implementation
> of arm64 and riscv.
>
> ...
>
> arch/arm64/Kconfig | 1 +
> arch/arm64/include/asm/pgtable.h | 56 +++++-
> arch/arm64/mm/hugetlbpage.c | 291 +---------------------------
> arch/riscv/Kconfig | 1 +
> arch/riscv/include/asm/hugetlb.h | 2 +-
> arch/riscv/include/asm/pgtable-64.h | 11 ++
> arch/riscv/include/asm/pgtable.h | 153 +++++++++++++--
> arch/riscv/mm/hugetlbpage.c | 227 ----------------------
> arch/riscv/mm/pgtable.c | 6 +-
> mm/Kconfig | 3 +
> mm/Makefile | 1 +
> mm/contpte.c | 272 ++++++++++++++++++++++++++
> 12 files changed, 480 insertions(+), 544 deletions(-)
> create mode 100644 mm/contpte.c
Hits three subsystems, so I guess mm.git is the place. I'll await
reviewer/tester input and let's look at getting this into mm.git after
6.10-rc1?
Hi Andrew,
On 08/05/2024 18:27, Andrew Morton wrote:
> On Wed, 8 May 2024 13:18:20 +0200 Alexandre Ghiti <[email protected]> wrote:
>
>> This patchset intends to merge the contiguous ptes hugetlbfs implementation
>> of arm64 and riscv.
>>
>> ...
>>
>> arch/arm64/Kconfig | 1 +
>> arch/arm64/include/asm/pgtable.h | 56 +++++-
>> arch/arm64/mm/hugetlbpage.c | 291 +---------------------------
>> arch/riscv/Kconfig | 1 +
>> arch/riscv/include/asm/hugetlb.h | 2 +-
>> arch/riscv/include/asm/pgtable-64.h | 11 ++
>> arch/riscv/include/asm/pgtable.h | 153 +++++++++++++--
>> arch/riscv/mm/hugetlbpage.c | 227 ----------------------
>> arch/riscv/mm/pgtable.c | 6 +-
>> mm/Kconfig | 3 +
>> mm/Makefile | 1 +
>> mm/contpte.c | 272 ++++++++++++++++++++++++++
>> 12 files changed, 480 insertions(+), 544 deletions(-)
>> create mode 100644 mm/contpte.c
> Hits three subsystems, so I guess mm.git is the place. I'll await
> reviewer/tester input and let's look at getting this into mm.git after
> 6.10-rc1?
Sure, fine by me :)
Thanks,
Alex
>
> _______________________________________________
> linux-riscv mailing list
> [email protected]
> http://lists.infradead.org/mailman/listinfo/linux-riscv