This patchset merges the contiguous pte hugetlbfs implementations of
arm64 and riscv.
Both arm64 and riscv can use contiguous ptes to map regions larger than
the base page size; those features are respectively called contpte and
svnapot.
The riscv implementation differs from arm64's in that the LSBs of the
pfn of an svnapot pte are used to store the size of the mapping, which
allows future sizes to be added (for now, only 64KB is supported). That's
an issue for the core mm code, which expects to find the *real* pfn a pte
points to. Patch 1 fixes that by always returning svnapot ptes with the
real pfn, and by restoring the size of the mapping when the pte is
written to a page table.
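To make the encoding concrete, here is a minimal userspace sketch of the
scheme (the helpers and constants are illustrative, not the kernel's; only
the bit pattern matches the svnapot encoding manipulated in patch 1):

#include <stdint.h>
#include <stdio.h>

/*
 * For a NAPOT mapping of 2^order base pages, the low 'order' bits of the
 * pfn are replaced with the pattern 0b10...0 (bit order-1 set, lower bits
 * cleared). The order, and hence the mapping size, can be recovered from
 * the position of the lowest set bit; the real pfn of the first page is
 * only recoverable because NAPOT mappings are naturally aligned.
 */
static uint64_t napot_encode(uint64_t pfn, unsigned int order)
{
	uint64_t mask = (1ULL << order) - 1;

	return (pfn & ~mask) | (1ULL << (order - 1));
}

static unsigned int napot_order(uint64_t encoded_pfn)
{
	/* Position of the lowest set bit, plus one. */
	return __builtin_ctzll(encoded_pfn) + 1;
}

int main(void)
{
	uint64_t pfn = 0x1230;			/* 64KB aligned: low 4 bits clear */
	uint64_t enc = napot_encode(pfn, 4);	/* order 4 == 16 * 4KB == 64KB */

	printf("encoded pfn: %#llx, order: %u\n",
	       (unsigned long long)enc, napot_order(enc));
	return 0;
}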
The following patches merge the two very similar implementations that
currently exist in arm64 and riscv. This paves the way for reusing Ryan's
recent contpte THP work [1] instead of reimplementing the same thing in
riscv.
This patchset was tested by running the libhugetlbfs testsuite with 64KB
and 2MB pages on both architectures (on a 4KB base page size arm64 kernel).
[1] https://lore.kernel.org/linux-arm-kernel/[email protected]/
Changes in v2:
- Rebase on top of 6.9-rc3
Alexandre Ghiti (9):
riscv: Restore the pfn in a NAPOT pte when manipulated by core mm code
riscv: Safely remove huge_pte_offset() when manipulating NAPOT ptes
mm: Use common huge_ptep_get() function for riscv/arm64
mm: Use common set_huge_pte_at() function for riscv/arm64
mm: Use common huge_pte_clear() function for riscv/arm64
mm: Use common huge_ptep_get_and_clear() function for riscv/arm64
mm: Use common huge_ptep_set_access_flags() function for riscv/arm64
mm: Use common huge_ptep_set_wrprotect() function for riscv/arm64
mm: Use common huge_ptep_clear_flush() function for riscv/arm64
arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/pgtable.h | 56 +++++-
arch/arm64/mm/hugetlbpage.c | 291 +---------------------------
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/hugetlb.h | 2 +-
arch/riscv/include/asm/pgtable-64.h | 11 ++
arch/riscv/include/asm/pgtable.h | 153 +++++++++++++--
arch/riscv/mm/hugetlbpage.c | 227 ----------------------
arch/riscv/mm/pgtable.c | 6 +-
mm/Kconfig | 3 +
mm/Makefile | 1 +
mm/contpte.c | 272 ++++++++++++++++++++++++++
12 files changed, 480 insertions(+), 544 deletions(-)
create mode 100644 mm/contpte.c
--
2.39.2
The core mm code expects to be able to extract the pfn from a pte. NAPOT
mappings work differently since their ptes actually point to the first pfn
of the mapping, the other bits being used to encode the size of the
mapping.
So modify ptep_get() so that it returns a pte value that contains the
*real* pfn (which then differs from what the HW expects) and, right
before storing the ptes to the page table, reset the pfn LSBs to encode
the size of the mapping.
And make sure that all NAPOT mappings are set using set_ptes().
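As a worked example (a sketch assuming an order-4, i.e. 64KB, NAPOT
mapping whose real first pfn is 0x1230; flag bits other than N are
omitted):
- in the page table, each of the 16 hardware ptes carries the encoded pfn
  0x1238 (low 4 bits replaced by 0b1000) and the N bit;
- ptep_get() on the i-th entry of the group masks out the low 4 pfn bits
  and adds the entry's offset within the group, returning the real pfn
  0x1230 + i, as the core mm code expects;
- set_ptes() sees the N bit, computes order = ilog2(nr) = 4 and re-applies
  pte_mknapot(), so 0x1238 is what reaches the page table again.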
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/riscv/include/asm/pgtable-64.h | 11 +++
arch/riscv/include/asm/pgtable.h | 105 ++++++++++++++++++++++++++--
arch/riscv/mm/hugetlbpage.c | 38 +++++-----
3 files changed, 128 insertions(+), 26 deletions(-)
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 221a5c1ee287..9fe076fc503e 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -106,6 +106,17 @@ enum napot_cont_order {
#define napot_cont_mask(order) (~(napot_cont_size(order) - 1UL))
#define napot_pte_num(order) BIT(order)
+static inline bool is_napot_order(unsigned int order)
+{
+ unsigned int napot_order;
+
+ for_each_napot_order(napot_order)
+ if (order == napot_order)
+ return true;
+
+ return false;
+}
+
#ifdef CONFIG_RISCV_ISA_SVNAPOT
#define HUGE_MAX_HSTATE (2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
#else
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 9f8ea0e33eb1..268c828f5152 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -297,6 +297,8 @@ static inline unsigned long pte_napot(pte_t pte)
return pte_val(pte) & _PAGE_NAPOT;
}
+#define pte_valid_napot(pte) (pte_present(pte) && pte_napot(pte))
+
static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
{
int pos = order - 1 + _PAGE_PFN_SHIFT;
@@ -306,6 +308,12 @@ static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
}
+/* pte at entry must *not* encode the mapping size in the pfn LSBs. */
+static inline pte_t pte_clear_napot(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_NAPOT);
+}
+
#else
static __always_inline bool has_svnapot(void) { return false; }
@@ -315,17 +323,14 @@ static inline unsigned long pte_napot(pte_t pte)
return 0;
}
+#define pte_valid_napot(pte) false
+
#endif /* CONFIG_RISCV_ISA_SVNAPOT */
/* Yields the page frame number (PFN) of a page table entry */
static inline unsigned long pte_pfn(pte_t pte)
{
- unsigned long res = __page_val_to_pfn(pte_val(pte));
-
- if (has_svnapot() && pte_napot(pte))
- res = res & (res - 1UL);
-
- return res;
+ return __page_val_to_pfn(pte_val(pte));
}
#define pte_page(x) pfn_to_page(pte_pfn(x))
@@ -525,9 +530,91 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+ size_t *pgsize)
+{
+ pte_t __pte;
+
+ /* We must read the raw value of the pte to get the size of the mapping */
+ __pte = READ_ONCE(*ptep);
+
+ if (pgsize) {
+ if (size >= PGDIR_SIZE)
+ *pgsize = PGDIR_SIZE;
+ else if (size >= P4D_SIZE)
+ *pgsize = P4D_SIZE;
+ else if (size >= PUD_SIZE)
+ *pgsize = PUD_SIZE;
+ else if (size >= PMD_SIZE)
+ *pgsize = PMD_SIZE;
+ else
+ *pgsize = PAGE_SIZE;
+ }
+
+ /* Make sure __pte is not a swap entry */
+ if (pte_valid_napot(__pte))
+ return napot_pte_num(napot_cont_order(__pte));
+
+ return 1;
+}
+#endif
+
+static inline pte_t ptep_get(pte_t *ptep)
+{
+ pte_t pte = READ_ONCE(*ptep);
+
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+ /*
+ * The pte we load has the N bit set and the size of the mapping in
+ * the pfn LSBs: keep the N bit and replace the mapping size with
+ * the *real* pfn since the core mm code expects to find it there.
+ * The mapping size will be reset just before being written to the
+ * page table in set_ptes().
+ */
+ if (unlikely(pte_valid_napot(pte))) {
+ unsigned int order = napot_cont_order(pte);
+ int pos = order - 1 + _PAGE_PFN_SHIFT;
+ unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
+ pte_t *orig_ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * napot_pte_num(order));
+
+ pte = __pte((pte_val(pte) & napot_mask) + ((ptep - orig_ptep) << _PAGE_PFN_SHIFT));
+ }
+#endif
+
+ return pte;
+}
+#define ptep_get ptep_get
+
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval, unsigned int nr)
{
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+ if (unlikely(pte_valid_napot(pteval))) {
+ unsigned int order = ilog2(nr);
+
+ if (!is_napot_order(order)) {
+ /*
+ * Something's weird, we are given a NAPOT pte but the
+ * size of the mapping is not a known NAPOT mapping
+ * size, so clear the NAPOT bit and map this without
+ * NAPOT support: core mm only manipulates pte with the
+ * real pfn so we know the pte is valid without the N
+ * bit.
+ */
+ pr_err("Incorrect NAPOT mapping, resetting.\n");
+ pteval = pte_clear_napot(pteval);
+ } else {
+ /*
+ * NAPOT ptes that arrive here only have the N bit set
+ * and their pfn does not contain the mapping size, so
+ * set that here.
+ */
+ pteval = pte_mknapot(pteval, order);
+ }
+ }
+#endif
+
page_table_check_ptes_set(mm, ptep, pteval, nr);
for (;;) {
@@ -535,6 +622,12 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
if (--nr == 0)
break;
ptep++;
+
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+ if (unlikely(pte_valid_napot(pteval)))
+ continue;
+#endif
+
pte_val(pteval) += 1 << _PAGE_PFN_SHIFT;
}
}
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 5ef2a6891158..fe8067ee71b4 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -256,8 +256,7 @@ void set_huge_pte_at(struct mm_struct *mm,
clear_flush(mm, addr, ptep, pgsize, pte_num);
- for (i = 0; i < pte_num; i++, ptep++, addr += pgsize)
- set_pte_at(mm, addr, ptep, pte);
+ set_ptes(mm, addr, ptep, pte, pte_num);
}
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
@@ -267,16 +266,16 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
int dirty)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long order;
+ size_t pgsize;
pte_t orig_pte;
- int i, pte_num;
+ int pte_num;
if (!pte_napot(pte))
return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
- order = napot_cont_order(pte);
- pte_num = napot_pte_num(order);
- ptep = huge_pte_offset(mm, addr, napot_cont_size(order));
+ pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
+
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
if (pte_dirty(orig_pte))
@@ -285,8 +284,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
if (pte_young(orig_pte))
pte = pte_mkyoung(pte);
- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
- set_pte_at(mm, addr, ptep, pte);
+ set_ptes(mm, addr, ptep, pte, pte_num);
return true;
}
@@ -301,7 +299,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
if (!pte_napot(orig_pte))
return ptep_get_and_clear(mm, addr, ptep);
- pte_num = napot_pte_num(napot_cont_order(orig_pte));
+ pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
return get_clear_contig(mm, addr, ptep, pte_num);
}
@@ -311,24 +309,23 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
pte_t *ptep)
{
pte_t pte = ptep_get(ptep);
- unsigned long order;
+ size_t pgsize;
pte_t orig_pte;
- int i, pte_num;
+ int pte_num;
if (!pte_napot(pte)) {
ptep_set_wrprotect(mm, addr, ptep);
return;
}
- order = napot_cont_order(pte);
- pte_num = napot_pte_num(order);
- ptep = huge_pte_offset(mm, addr, napot_cont_size(order));
+ pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
+
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
orig_pte = pte_wrprotect(orig_pte);
- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
- set_pte_at(mm, addr, ptep, orig_pte);
+ set_ptes(mm, addr, ptep, orig_pte, pte_num);
}
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
@@ -341,7 +338,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
if (!pte_napot(pte))
return ptep_clear_flush(vma, addr, ptep);
- pte_num = napot_pte_num(napot_cont_order(pte));
+ pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
}
@@ -351,6 +348,7 @@ void huge_pte_clear(struct mm_struct *mm,
pte_t *ptep,
unsigned long sz)
{
+ size_t pgsize;
pte_t pte = ptep_get(ptep);
int i, pte_num;
@@ -359,8 +357,8 @@ void huge_pte_clear(struct mm_struct *mm,
return;
}
- pte_num = napot_pte_num(napot_cont_order(pte));
- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+ pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ for (i = 0; i < pte_num; i++, addr += pgsize, ptep++)
pte_clear(mm, addr, ptep);
}
--
2.39.2
The pte_t pointer is expected to point to the first entry of the NAPOT
mapping, so there is no need to use huge_pte_offset(), similar to what is
done on arm64.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/riscv/mm/hugetlbpage.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index fe8067ee71b4..f042f5c8bdb7 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -274,7 +274,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
- ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
@@ -319,10 +318,8 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
}
pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
- ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
-
orig_pte = pte_wrprotect(orig_pte);
set_ptes(mm, addr, ptep, orig_pte, pte_num);
--
2.39.2
For that, we need to introduce:
- a new config: ARCH_HAS_CONTPTE,
- a new arch-specific function which returns the number of contiguous
PTEs in a mapping and its base page size,
- a pte_cont() helper, only introduced for riscv since we keep the arm64
naming (contpte), which is more explicit than riscv's (napot); the
resulting arch contract is sketched right below this list.
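Concretely, the contract the generic code relies on looks as follows (a
sketch distilled from the hunks below; the prototypes live in each arch's
pgtable.h, this is not a new API on top of them):

/* Per-arch hooks consumed by mm/contpte.c. */

/*
 * Return the number of contiguous ptes backing a mapping of 'size' bytes
 * and, through 'pgsize', the base page size each of those ptes maps.
 */
static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
					      size_t *pgsize);

/* Read a single pte without any contpte awareness. */
static inline pte_t __ptep_get(pte_t *ptep);

/* Tell whether a pte belongs to a contiguous mapping; on riscv: */
#define pte_cont	pte_napot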
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/pgtable.h | 30 +++++++++++++++++
arch/arm64/mm/hugetlbpage.c | 55 ++------------------------------
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/hugetlb.h | 2 +-
arch/riscv/include/asm/pgtable.h | 6 ++--
arch/riscv/mm/hugetlbpage.c | 24 --------------
mm/Kconfig | 3 ++
mm/Makefile | 1 +
mm/contpte.c | 45 ++++++++++++++++++++++++++
10 files changed, 88 insertions(+), 80 deletions(-)
create mode 100644 mm/contpte.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b11c98b3e84..ac2f6d906cc3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -20,6 +20,7 @@ config ARM64
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_CACHE_LINE_SIZE
+ select ARCH_HAS_CONTPTE
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index afdd56d26ad7..e30149a128f2 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1638,6 +1638,36 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
#endif /* CONFIG_ARM64_CONTPTE */
+static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+ size_t *pgsize)
+{
+ int contig_ptes = 0;
+
+ *pgsize = size;
+
+ switch (size) {
+#ifndef __PAGETABLE_PMD_FOLDED
+ case PUD_SIZE:
+ if (pud_sect_supported())
+ contig_ptes = 1;
+ break;
+#endif
+ case PMD_SIZE:
+ contig_ptes = 1;
+ break;
+ case CONT_PMD_SIZE:
+ *pgsize = PMD_SIZE;
+ contig_ptes = CONT_PMDS;
+ break;
+ case CONT_PTE_SIZE:
+ *pgsize = PAGE_SIZE;
+ contig_ptes = CONT_PTES;
+ break;
+ }
+
+ return contig_ptes;
+}
+
#endif /* !__ASSEMBLY__ */
#endif /* __ASM_PGTABLE_H */
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 0f0e10bb0a95..9e9c80ec6e74 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -112,57 +112,6 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr,
return CONT_PTES;
}
-static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
-{
- int contig_ptes = 0;
-
- *pgsize = size;
-
- switch (size) {
-#ifndef __PAGETABLE_PMD_FOLDED
- case PUD_SIZE:
- if (pud_sect_supported())
- contig_ptes = 1;
- break;
-#endif
- case PMD_SIZE:
- contig_ptes = 1;
- break;
- case CONT_PMD_SIZE:
- *pgsize = PMD_SIZE;
- contig_ptes = CONT_PMDS;
- break;
- case CONT_PTE_SIZE:
- *pgsize = PAGE_SIZE;
- contig_ptes = CONT_PTES;
- break;
- }
-
- return contig_ptes;
-}
-
-pte_t huge_ptep_get(pte_t *ptep)
-{
- int ncontig, i;
- size_t pgsize;
- pte_t orig_pte = __ptep_get(ptep);
-
- if (!pte_present(orig_pte) || !pte_cont(orig_pte))
- return orig_pte;
-
- ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize);
- for (i = 0; i < ncontig; i++, ptep++) {
- pte_t pte = __ptep_get(ptep);
-
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
- }
- return orig_pte;
-}
-
/*
* Changing some bits of contiguous entries requires us to follow a
* Break-Before-Make approach, breaking the whole contiguous set
@@ -243,7 +192,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
unsigned long pfn, dpfn;
pgprot_t hugeprot;
- ncontig = num_contig_ptes(sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
if (!pte_present(pte)) {
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
@@ -390,7 +339,7 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
int i, ncontig;
size_t pgsize;
- ncontig = num_contig_ptes(sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
__pte_clear(mm, addr, ptep);
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 8a0f403432e8..38d93cf44456 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -19,6 +19,7 @@ config RISCV
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_BINFMT_FLAT
+ select ARCH_HAS_CONTPTE if RISCV_ISA_SVNAPOT
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index 22deb7a2a6ec..f195f611722b 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -49,7 +49,7 @@ pte_t huge_ptep_get(pte_t *ptep);
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
#define arch_make_huge_pte arch_make_huge_pte
-#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
+#endif /* CONFIG_RISCV_ISA_SVNAPOT */
#include <asm-generic/hugetlb.h>
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 268c828f5152..66061002ff36 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -296,6 +296,7 @@ static inline unsigned long pte_napot(pte_t pte)
{
return pte_val(pte) & _PAGE_NAPOT;
}
+#define pte_cont pte_napot
#define pte_valid_napot(pte) (pte_present(pte) && pte_napot(pte))
@@ -560,7 +561,7 @@ static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
}
#endif
-static inline pte_t ptep_get(pte_t *ptep)
+static inline pte_t __ptep_get(pte_t *ptep)
{
pte_t pte = READ_ONCE(*ptep);
@@ -584,7 +585,6 @@ static inline pte_t ptep_get(pte_t *ptep)
return pte;
}
-#define ptep_get ptep_get
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval, unsigned int nr)
@@ -686,6 +686,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
return ptep_test_and_clear_young(vma, address, ptep);
}
+#define ptep_get __ptep_get
+
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
{
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index f042f5c8bdb7..be129f4f1503 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -3,30 +3,6 @@
#include <linux/err.h>
#ifdef CONFIG_RISCV_ISA_SVNAPOT
-pte_t huge_ptep_get(pte_t *ptep)
-{
- unsigned long pte_num;
- int i;
- pte_t orig_pte = ptep_get(ptep);
-
- if (!pte_present(orig_pte) || !pte_napot(orig_pte))
- return orig_pte;
-
- pte_num = napot_pte_num(napot_cont_order(orig_pte));
-
- for (i = 0; i < pte_num; i++, ptep++) {
- pte_t pte = ptep_get(ptep);
-
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
- }
-
- return orig_pte;
-}
-
pte_t *huge_pte_alloc(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long addr,
diff --git a/mm/Kconfig b/mm/Kconfig
index b1448aa81e15..c325003d6552 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -981,6 +981,9 @@ config ARCH_HAS_CPU_CACHE_ALIASING
config ARCH_HAS_CACHE_LINE_SIZE
bool
+config ARCH_HAS_CONTPTE
+ bool
+
config ARCH_HAS_CURRENT_STACK_POINTER
bool
help
diff --git a/mm/Makefile b/mm/Makefile
index 4abb40b911ec..605ead58403b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
+obj-$(CONFIG_ARCH_HAS_CONTPTE) += contpte.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
ifdef CONFIG_SWAP
diff --git a/mm/contpte.c b/mm/contpte.c
new file mode 100644
index 000000000000..e8574051d0b9
--- /dev/null
+++ b/mm/contpte.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Rivos Inc.
+ */
+
+#include <linux/mm.h>
+#include <linux/pgtable.h>
+#include <linux/hugetlb.h>
+
+/*
+ * Any arch that wants to use that needs to define:
+ * - __ptep_get()
+ * - pte_cont()
+ * - arch_contpte_get_num_contig()
+ */
+
+/*
+ * This file implements the following contpte aware API:
+ * - huge_ptep_get()
+ */
+
+pte_t huge_ptep_get(pte_t *ptep)
+{
+ int ncontig, i;
+ size_t pgsize;
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (!pte_present(orig_pte) || !pte_cont(orig_pte))
+ return orig_pte;
+
+ ncontig = arch_contpte_get_num_contig(ptep,
+ page_size(pte_page(orig_pte)),
+ &pgsize);
+
+ for (i = 0; i < ncontig; i++, ptep++) {
+ pte_t pte = __ptep_get(ptep);
+
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+ return orig_pte;
+}
--
2.39.2
After some adjustments, both architectures have the same implementation,
so move it to generic code.
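The per-arch glue this relies on is small (a sketch distilled from the
hunks below):

/* arm64: ___set_ptes() advances the pfn by pgsize/PAGE_SIZE per entry. */
#define set_contptes	___set_ptes

/*
 * riscv: __set_ptes() re-derives the NAPOT order from nr itself, so the
 * pgsize argument is accepted but unused.
 */
#define set_contptes(mm, addr, ptep, pte, nr, pgsize)	\
	__set_ptes(mm, addr, ptep, pte, nr)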
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 14 +++++---
arch/arm64/mm/hugetlbpage.c | 56 -----------------------------
arch/riscv/include/asm/pgtable.h | 39 +++++++++++++-------
arch/riscv/mm/hugetlbpage.c | 62 --------------------------------
mm/contpte.c | 59 ++++++++++++++++++++++++++++++
5 files changed, 95 insertions(+), 135 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e30149a128f2..2e0415fd5083 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -372,9 +372,10 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte));
}
-static inline void __set_ptes(struct mm_struct *mm,
- unsigned long __always_unused addr,
- pte_t *ptep, pte_t pte, unsigned int nr)
+static inline void ___set_ptes(struct mm_struct *mm,
+ unsigned long __always_unused addr,
+ pte_t *ptep, pte_t pte, unsigned int nr,
+ size_t pgsize)
{
page_table_check_ptes_set(mm, ptep, pte, nr);
__sync_cache_and_tags(pte, nr);
@@ -385,10 +386,15 @@ static inline void __set_ptes(struct mm_struct *mm,
if (--nr == 0)
break;
ptep++;
- pte = pte_advance_pfn(pte, 1);
+ pte = pte_advance_pfn(pte, pgsize >> PAGE_SHIFT);
}
}
+#define __set_ptes(mm, addr, ptep, pte, nr) \
+ ___set_ptes(mm, addr, ptep, pte, nr, PAGE_SIZE)
+
+#define set_contptes ___set_ptes
+
/*
* Huge pte definitions.
*/
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 9e9c80ec6e74..b8353b0a273c 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -159,62 +159,6 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm,
return orig_pte;
}
-/*
- * Changing some bits of contiguous entries requires us to follow a
- * Break-Before-Make approach, breaking the whole contiguous set
- * before we can change any entries. See ARM DDI 0487A.k_iss10775,
- * "Misprogramming of the Contiguous bit", page D4-1762.
- *
- * This helper performs the break step for use cases where the
- * original pte is not needed.
- */
-static void clear_flush(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pgsize,
- unsigned long ncontig)
-{
- struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
- unsigned long i, saddr = addr;
-
- for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- __ptep_get_and_clear(mm, addr, ptep);
-
- flush_tlb_range(&vma, saddr, addr);
-}
-
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte, unsigned long sz)
-{
- size_t pgsize;
- int i;
- int ncontig;
- unsigned long pfn, dpfn;
- pgprot_t hugeprot;
-
- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
-
- if (!pte_present(pte)) {
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
- __set_ptes(mm, addr, ptep, pte, 1);
- return;
- }
-
- if (!pte_cont(pte)) {
- __set_ptes(mm, addr, ptep, pte, 1);
- return;
- }
-
- pfn = pte_pfn(pte);
- dpfn = pgsize >> PAGE_SHIFT;
- hugeprot = pte_pgprot(pte);
-
- clear_flush(mm, addr, ptep, pgsize, ncontig);
-
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
-}
-
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 66061002ff36..5d1d3a6c7c44 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -535,29 +535,39 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
size_t *pgsize)
{
+ unsigned long hugepage_shift;
pte_t __pte;
/* We must read the raw value of the pte to get the size of the mapping */
__pte = READ_ONCE(*ptep);
- if (pgsize) {
- if (size >= PGDIR_SIZE)
+ if (size >= PGDIR_SIZE) {
+ if (pgsize)
*pgsize = PGDIR_SIZE;
- else if (size >= P4D_SIZE)
+ hugepage_shift = PGDIR_SHIFT;
+ } else if (size >= P4D_SIZE) {
+ if (pgsize)
*pgsize = P4D_SIZE;
- else if (size >= PUD_SIZE)
+ hugepage_shift = P4D_SHIFT;
+ } else if (size >= PUD_SIZE) {
+ if (pgsize)
*pgsize = PUD_SIZE;
- else if (size >= PMD_SIZE)
+ hugepage_shift = PUD_SHIFT;
+ } else if (size >= PMD_SIZE) {
+ if (pgsize)
*pgsize = PMD_SIZE;
- else
+ hugepage_shift = PMD_SHIFT;
+ } else {
+ if (pgsize)
*pgsize = PAGE_SIZE;
+ hugepage_shift = PAGE_SHIFT;
}
/* Make sure __pte is not a swap entry */
if (pte_valid_napot(__pte))
return napot_pte_num(napot_cont_order(__pte));
- return 1;
+ return size >> hugepage_shift;
}
#endif
@@ -586,8 +596,8 @@ static inline pte_t __ptep_get(pte_t *ptep)
return pte;
}
-static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval, unsigned int nr)
+static inline void __set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval, unsigned int nr)
{
#ifdef CONFIG_RISCV_ISA_SVNAPOT
if (unlikely(pte_valid_napot(pteval))) {
@@ -631,7 +641,8 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_val(pteval) += 1 << _PAGE_PFN_SHIFT;
}
}
-#define set_ptes set_ptes
+#define set_contptes(mm, addr, ptep, pte, nr, pgsize) \
+ __set_ptes(mm, addr, ptep, pte, nr)
static inline void pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
@@ -646,9 +657,8 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addre
extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep);
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
+static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
{
pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0));
@@ -687,6 +697,9 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
}
#define ptep_get __ptep_get
+#define set_ptes __set_ptes
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+#define ptep_get_and_clear __ptep_get_and_clear
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index be129f4f1503..d8f07aef758b 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -173,68 +173,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-static void clear_flush(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pgsize,
- unsigned long ncontig)
-{
- struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
- unsigned long i, saddr = addr;
-
- for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- ptep_get_and_clear(mm, addr, ptep);
-
- flush_tlb_range(&vma, saddr, addr);
-}
-
-/*
- * When dealing with NAPOT mappings, the privileged specification indicates that
- * "if an update needs to be made, the OS generally should first mark all of the
- * PTEs invalid, then issue SFENCE.VMA instruction(s) covering all 4 KiB regions
- * within the range, [...] then update the PTE(s), as described in Section
- * 4.2.1.". That's the equivalent of the Break-Before-Make approach used by
- * arm64.
- */
-void set_huge_pte_at(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- pte_t pte,
- unsigned long sz)
-{
- unsigned long hugepage_shift, pgsize;
- int i, pte_num;
-
- if (sz >= PGDIR_SIZE)
- hugepage_shift = PGDIR_SHIFT;
- else if (sz >= P4D_SIZE)
- hugepage_shift = P4D_SHIFT;
- else if (sz >= PUD_SIZE)
- hugepage_shift = PUD_SHIFT;
- else if (sz >= PMD_SIZE)
- hugepage_shift = PMD_SHIFT;
- else
- hugepage_shift = PAGE_SHIFT;
-
- pte_num = sz >> hugepage_shift;
- pgsize = 1 << hugepage_shift;
-
- if (!pte_present(pte)) {
- for (i = 0; i < pte_num; i++, ptep++, addr += pgsize)
- set_ptes(mm, addr, ptep, pte, 1);
- return;
- }
-
- if (!pte_napot(pte)) {
- set_ptes(mm, addr, ptep, pte, 1);
- return;
- }
-
- clear_flush(mm, addr, ptep, pgsize, pte_num);
-
- set_ptes(mm, addr, ptep, pte, pte_num);
-}
-
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep,
diff --git a/mm/contpte.c b/mm/contpte.c
index e8574051d0b9..2320ee23478a 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -10,6 +10,8 @@
/*
* Any arch that wants to use that needs to define:
* - __ptep_get()
+ * - __set_ptes()
+ * - __ptep_get_and_clear()
* - pte_cont()
* - arch_contpte_get_num_contig()
*/
@@ -17,6 +19,7 @@
/*
* This file implements the following contpte aware API:
* - huge_ptep_get()
+ * - set_huge_pte_at()
*/
pte_t huge_ptep_get(pte_t *ptep)
@@ -43,3 +46,59 @@ pte_t huge_ptep_get(pte_t *ptep)
}
return orig_pte;
}
+
+/*
+ * ARM64: Changing some bits of contiguous entries requires us to follow a
+ * Break-Before-Make approach, breaking the whole contiguous set
+ * before we can change any entries. See ARM DDI 0487A.k_iss10775,
+ * "Misprogramming of the Contiguous bit", page D4-1762.
+ *
+ * RISCV: When dealing with NAPOT mappings, the privileged specification
+ * indicates that "if an update needs to be made, the OS generally should first
+ * mark all of the PTEs invalid, then issue SFENCE.VMA instruction(s) covering
+ * all 4 KiB regions within the range, [...] then update the PTE(s), as
+ * described in Section 4.2.1.". That's the equivalent of the Break-Before-Make
+ * approach used by arm64.
+ *
+ * This helper performs the break step for use cases where the
+ * original pte is not needed.
+ */
+static void clear_flush(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep,
+ unsigned long pgsize,
+ unsigned long ncontig)
+{
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+ unsigned long i, saddr = addr;
+
+ for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
+ __ptep_get_and_clear(mm, addr, ptep);
+
+ flush_tlb_range(&vma, saddr, addr);
+}
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned long sz)
+{
+ size_t pgsize;
+ int i;
+ int ncontig;
+
+ ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+
+ if (!pte_present(pte)) {
+ for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
+ __set_ptes(mm, addr, ptep, pte, 1);
+ return;
+ }
+
+ if (!pte_cont(pte)) {
+ __set_ptes(mm, addr, ptep, pte, 1);
+ return;
+ }
+
+ clear_flush(mm, addr, ptep, pgsize, ncontig);
+
+ set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
+}
--
2.39.2
Both architectures have the same implementation, so move it to generic code.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/hugetlbpage.c | 12 ------------
arch/riscv/include/asm/pgtable.h | 5 +++--
arch/riscv/mm/hugetlbpage.c | 19 -------------------
mm/contpte.c | 14 ++++++++++++++
4 files changed, 17 insertions(+), 33 deletions(-)
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index b8353b0a273c..cf44837369be 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -277,18 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned long sz)
-{
- int i, ncontig;
- size_t pgsize;
-
- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
-
- for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- __pte_clear(mm, addr, ptep);
-}
-
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 5d1d3a6c7c44..0847a7fb8661 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -644,8 +644,8 @@ static inline void __set_ptes(struct mm_struct *mm, unsigned long addr,
#define set_contptes(mm, addr, ptep, pte, nr, pgsize) \
__set_ptes(mm, addr, ptep, pte, nr)
-static inline void pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
+static inline void __pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
__set_pte_at(mm, ptep, __pte(0));
}
@@ -700,6 +700,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define set_ptes __set_ptes
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define ptep_get_and_clear __ptep_get_and_clear
+#define pte_clear __pte_clear
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index d8f07aef758b..437b1df059eb 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -254,25 +254,6 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
}
-void huge_pte_clear(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long sz)
-{
- size_t pgsize;
- pte_t pte = ptep_get(ptep);
- int i, pte_num;
-
- if (!pte_napot(pte)) {
- pte_clear(mm, addr, ptep);
- return;
- }
-
- pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
- for (i = 0; i < pte_num; i++, addr += pgsize, ptep++)
- pte_clear(mm, addr, ptep);
-}
-
static bool is_napot_size(unsigned long size)
{
unsigned long order;
diff --git a/mm/contpte.c b/mm/contpte.c
index 2320ee23478a..22e0de197bd3 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -12,6 +12,7 @@
* - __ptep_get()
* - __set_ptes()
* - __ptep_get_and_clear()
+ * - __pte_clear()
* - pte_cont()
* - arch_contpte_get_num_contig()
*/
@@ -20,6 +21,7 @@
* This file implements the following contpte aware API:
* - huge_ptep_get()
* - set_huge_pte_at()
+ * - huge_pte_clear()
*/
pte_t huge_ptep_get(pte_t *ptep)
@@ -102,3 +104,15 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
}
+
+void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned long sz)
+{
+ int i, ncontig;
+ size_t pgsize;
+
+ ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+
+ for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
+ __pte_clear(mm, addr, ptep);
+}
--
2.39.2
Both architectures have almost the same implementation:
__cont_access_flags_changed() is also correct on riscv and brings the same
benefit (i.e. do nothing if the flags are unchanged), so for example a
spurious fault on a contiguous range whose flags are already up to date no
longer triggers a full break-before-make cycle.
As in the previous commit, get_clear_contig_flush() is duplicated in both
the arch and the generic code; it will be removed from the arch code when
the last reference there gets moved to the generic code.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/hugetlbpage.c | 65 ---------------------------
arch/riscv/include/asm/pgtable.h | 7 +--
arch/riscv/mm/hugetlbpage.c | 29 ------------
arch/riscv/mm/pgtable.c | 6 +--
mm/contpte.c | 75 ++++++++++++++++++++++++++++++++
5 files changed, 82 insertions(+), 100 deletions(-)
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 5ace4bf7ce35..052a5bf2926c 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -277,71 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-/*
- * huge_ptep_set_access_flags will update access flags (dirty, accesssed)
- * and write permission.
- *
- * For a contiguous huge pte range we need to check whether or not write
- * permission has to change only on the first pte in the set. Then for
- * all the contiguous ptes we need to check whether or not there is a
- * discrepancy between dirty or young.
- */
-static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
-{
- int i;
-
- if (pte_write(pte) != pte_write(__ptep_get(ptep)))
- return 1;
-
- for (i = 0; i < ncontig; i++) {
- pte_t orig_pte = __ptep_get(ptep + i);
-
- if (pte_dirty(pte) != pte_dirty(orig_pte))
- return 1;
-
- if (pte_young(pte) != pte_young(orig_pte))
- return 1;
- }
-
- return 0;
-}
-
-int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t pte, int dirty)
-{
- int ncontig, i;
- size_t pgsize = 0;
- unsigned long pfn = pte_pfn(pte), dpfn;
- struct mm_struct *mm = vma->vm_mm;
- pgprot_t hugeprot;
- pte_t orig_pte;
-
- if (!pte_cont(pte))
- return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
- dpfn = pgsize >> PAGE_SHIFT;
-
- if (!__cont_access_flags_changed(ptep, pte, ncontig))
- return 0;
-
- orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
-
- /* Make sure we don't lose the dirty or young state */
- if (pte_dirty(orig_pte))
- pte = pte_mkdirty(pte);
-
- if (pte_young(orig_pte))
- pte = pte_mkyoung(pte);
-
- hugeprot = pte_pgprot(pte);
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
-
- return 1;
-}
-
void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index d976113a370d..20f62505d0bc 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -652,9 +652,8 @@ static inline void __pte_clear(struct mm_struct *mm,
__set_pte_at(mm, ptep, __pte(0));
}
-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* defined in mm/pgtable.c */
-extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep, pte_t entry, int dirty);
+extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep, pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG /* defined in mm/pgtable.c */
extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep);
@@ -703,6 +702,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define ptep_get_and_clear __ptep_get_and_clear
#define pte_clear __pte_clear
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+#define ptep_set_access_flags __ptep_set_access_flags
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index a757e0b2f090..a59b776e9c8b 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -173,35 +173,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr,
- pte_t *ptep,
- pte_t pte,
- int dirty)
-{
- struct mm_struct *mm = vma->vm_mm;
- size_t pgsize;
- pte_t orig_pte;
- int pte_num;
-
- if (!pte_napot(pte))
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-
- pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, &pgsize);
-
- orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
-
- if (pte_dirty(orig_pte))
- pte = pte_mkdirty(pte);
-
- if (pte_young(orig_pte))
- pte = pte_mkyoung(pte);
-
- set_ptes(mm, addr, ptep, pte, pte_num);
-
- return true;
-}
-
void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr,
pte_t *ptep)
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 533ec9055fa0..e86df7ef193c 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -5,9 +5,9 @@
#include <linux/kernel.h>
#include <linux/pgtable.h>
-int ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep,
- pte_t entry, int dirty)
+int __ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
{
if (!pte_same(ptep_get(ptep), entry))
__set_pte_at(vma->vm_mm, ptep, entry);
diff --git a/mm/contpte.c b/mm/contpte.c
index 68eb1634b922..9c7a9f250bca 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -13,6 +13,7 @@
* - __set_ptes()
* - __ptep_get_and_clear()
* - __pte_clear()
+ * - __ptep_set_access_flags()
* - pte_cont()
* - arch_contpte_get_num_contig()
*/
@@ -23,6 +24,7 @@
* - set_huge_pte_at()
* - huge_pte_clear()
* - huge_ptep_get_and_clear()
+ * - huge_ptep_set_access_flags()
*/
pte_t huge_ptep_get(pte_t *ptep)
@@ -158,3 +160,76 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
}
+
+/*
+ * huge_ptep_set_access_flags will update access flags (dirty, accessed)
+ * and write permission.
+ *
+ * For a contiguous huge pte range we need to check whether or not write
+ * permission has to change only on the first pte in the set. Then for
+ * all the contiguous ptes we need to check whether or not there is a
+ * discrepancy between dirty or young.
+ */
+static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
+{
+ int i;
+
+ if (pte_write(pte) != pte_write(__ptep_get(ptep)))
+ return 1;
+
+ for (i = 0; i < ncontig; i++) {
+ pte_t orig_pte = __ptep_get(ptep + i);
+
+ if (pte_dirty(pte) != pte_dirty(orig_pte))
+ return 1;
+
+ if (pte_young(pte) != pte_young(orig_pte))
+ return 1;
+ }
+
+ return 0;
+}
+
+static pte_t get_clear_contig_flush(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep,
+ unsigned long pgsize,
+ unsigned long ncontig)
+{
+ pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig);
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+
+ flush_tlb_range(&vma, addr, addr + (pgsize * ncontig));
+ return orig_pte;
+}
+
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+{
+ int ncontig;
+ size_t pgsize = 0;
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t orig_pte;
+
+ if (!pte_cont(pte))
+ return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+
+ ncontig = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, &pgsize);
+
+ if (!__cont_access_flags_changed(ptep, pte, ncontig))
+ return 0;
+
+ orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
+
+ /* Make sure we don't lose the dirty or young state */
+ if (pte_dirty(orig_pte))
+ pte = pte_mkdirty(pte);
+
+ if (pte_young(orig_pte))
+ pte = pte_mkyoung(pte);
+
+ set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
+
+ return 1;
+}
--
2.39.2
After some adjustments, both architectures have the same implementation,
so move it to the generic code.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/hugetlbpage.c | 27 ---------------------------
arch/riscv/include/asm/pgtable.h | 7 ++++---
arch/riscv/mm/hugetlbpage.c | 22 ----------------------
mm/contpte.c | 22 ++++++++++++++++++++++
4 files changed, 26 insertions(+), 52 deletions(-)
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 052a5bf2926c..e56f2c8ec7e7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -277,33 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- unsigned long pfn, dpfn;
- pgprot_t hugeprot;
- int ncontig, i;
- size_t pgsize;
- pte_t pte;
-
- if (!pte_cont(__ptep_get(ptep))) {
- __ptep_set_wrprotect(mm, addr, ptep);
- return;
- }
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
- dpfn = pgsize >> PAGE_SHIFT;
-
- pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
- pte = pte_wrprotect(pte);
-
- hugeprot = pte_pgprot(pte);
- pfn = pte_pfn(pte);
-
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
-}
-
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 20f62505d0bc..9e397935536e 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -668,9 +668,8 @@ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
return pte;
}
-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
+static inline void __ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
{
atomic_long_and(~(unsigned long)_PAGE_WRITE, (atomic_long_t *)ptep);
}
@@ -704,6 +703,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define pte_clear __pte_clear
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+#define ptep_set_wrprotect __ptep_set_wrprotect
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index a59b776e9c8b..440d3bde88f2 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -173,28 +173,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep)
-{
- pte_t pte = ptep_get(ptep);
- size_t pgsize;
- pte_t orig_pte;
- int pte_num;
-
- if (!pte_napot(pte)) {
- ptep_set_wrprotect(mm, addr, ptep);
- return;
- }
-
- pte_num = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
-
- orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
- orig_pte = pte_wrprotect(orig_pte);
-
- set_ptes(mm, addr, ptep, orig_pte, pte_num);
-}
-
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep)
diff --git a/mm/contpte.c b/mm/contpte.c
index 9c7a9f250bca..8ad2a3099dfd 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -14,6 +14,7 @@
* - __ptep_get_and_clear()
* - __pte_clear()
* - __ptep_set_access_flags()
+ * - __ptep_set_wrprotect()
* - pte_cont()
* - arch_contpte_get_num_contig()
*/
@@ -25,6 +26,7 @@
* - huge_pte_clear()
* - huge_ptep_get_and_clear()
* - huge_ptep_set_access_flags()
+ * - huge_ptep_set_wrprotect()
*/
pte_t huge_ptep_get(pte_t *ptep)
@@ -233,3 +235,23 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return 1;
}
+
+void huge_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ int ncontig;
+ size_t pgsize;
+ pte_t pte;
+
+ if (!pte_cont(__ptep_get(ptep))) {
+ __ptep_set_wrprotect(mm, addr, ptep);
+ return;
+ }
+
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
+
+ pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
+ pte = pte_wrprotect(pte);
+
+ set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
+}
--
2.39.2
After some adjustments, both architectures have the same implementation,
so move it to the generic code.
Note that the get_clear_contig() function is duplicated in the generic
and the arm64 code because it is still used by some arm64 functions that
will, in the next commits, be moved to the generic code. Once they have
all been moved, the arm64 version will be removed.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 14 +++++++++-
arch/arm64/mm/hugetlbpage.c | 19 ++-----------
arch/riscv/include/asm/pgtable.h | 4 ++-
arch/riscv/mm/hugetlbpage.c | 21 ++------------
mm/contpte.c | 48 ++++++++++++++++++++++++++++++--
5 files changed, 66 insertions(+), 40 deletions(-)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 2e0415fd5083..7c2938cb70b9 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1644,11 +1644,23 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
#endif /* CONFIG_ARM64_CONTPTE */
-static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+int find_num_contig(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, size_t *pgsize);
+
+static inline int arch_contpte_get_num_contig(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, unsigned long size,
size_t *pgsize)
{
int contig_ptes = 0;
+ /*
+ * If the size is not passed, we need to go through the page table to
+ * find out the number of contiguous ptes.
+ */
+ if (size == 0)
+ return find_num_contig(mm, addr, ptep, pgsize);
+
*pgsize = size;
switch (size) {
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index cf44837369be..5ace4bf7ce35 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -93,8 +93,8 @@ int pud_huge(pud_t pud)
#endif
}
-static int find_num_contig(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, size_t *pgsize)
+int find_num_contig(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, size_t *pgsize)
{
pgd_t *pgdp = pgd_offset(mm, addr);
p4d_t *p4dp;
@@ -277,21 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- int ncontig;
- size_t pgsize;
- pte_t orig_pte = __ptep_get(ptep);
-
- if (!pte_cont(orig_pte))
- return __ptep_get_and_clear(mm, addr, ptep);
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
-
- return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
-}
-
/*
* huge_ptep_set_access_flags will update access flags (dirty, accesssed)
* and write permission.
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 0847a7fb8661..d976113a370d 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -532,7 +532,9 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT
#ifdef CONFIG_RISCV_ISA_SVNAPOT
-static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+static inline int arch_contpte_get_num_contig(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, unsigned long size,
size_t *pgsize)
{
unsigned long hugepage_shift;
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 437b1df059eb..a757e0b2f090 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -187,7 +187,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
if (!pte_napot(pte))
return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
- pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, &pgsize);
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
@@ -202,21 +202,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return true;
}
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep)
-{
- pte_t orig_pte = ptep_get(ptep);
- int pte_num;
-
- if (!pte_napot(orig_pte))
- return ptep_get_and_clear(mm, addr, ptep);
-
- pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
-
- return get_clear_contig(mm, addr, ptep, pte_num);
-}
-
void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr,
pte_t *ptep)
@@ -231,7 +216,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
return;
}
- pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ pte_num = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
orig_pte = pte_wrprotect(orig_pte);
@@ -249,7 +234,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
if (!pte_napot(pte))
return ptep_clear_flush(vma, addr, ptep);
- pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
+ pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, NULL);
return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
}
diff --git a/mm/contpte.c b/mm/contpte.c
index 22e0de197bd3..68eb1634b922 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -22,6 +22,7 @@
* - huge_ptep_get()
* - set_huge_pte_at()
* - huge_pte_clear()
+ * - huge_ptep_get_and_clear()
*/
pte_t huge_ptep_get(pte_t *ptep)
@@ -33,7 +34,7 @@ pte_t huge_ptep_get(pte_t *ptep)
if (!pte_present(orig_pte) || !pte_cont(orig_pte))
return orig_pte;
- ncontig = arch_contpte_get_num_contig(ptep,
+ ncontig = arch_contpte_get_num_contig(NULL, 0, ptep,
page_size(pte_page(orig_pte)),
&pgsize);
@@ -87,7 +88,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
int i;
int ncontig;
- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, sz, &pgsize);
if (!pte_present(pte)) {
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
@@ -111,8 +112,49 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
int i, ncontig;
size_t pgsize;
- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, sz, &pgsize);
for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
__pte_clear(mm, addr, ptep);
}
+
+static pte_t get_clear_contig(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep,
+ unsigned long pgsize,
+ unsigned long ncontig)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+ unsigned long i;
+
+ for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
+ pte_t pte = __ptep_get_and_clear(mm, addr, ptep);
+
+ /*
+ * If HW_AFDBM (arm64) or svadu (riscv) is enabled, then the HW
+ * could turn on the dirty or accessed bit for any page in the
+ * set, so check them all.
+ */
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+ return orig_pte;
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ int ncontig;
+ size_t pgsize;
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (!pte_cont(orig_pte))
+ return __ptep_get_and_clear(mm, addr, ptep);
+
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
+
+ return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
+}
--
2.39.2
After some adjustments, both architectures have the same implementation,
so move it to the generic code.
Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/hugetlbpage.c | 61 -------------------------------------
arch/riscv/mm/hugetlbpage.c | 51 -------------------------------
mm/contpte.c | 15 +++++++++
3 files changed, 15 insertions(+), 112 deletions(-)
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index e56f2c8ec7e7..5869f20ca28e 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -112,53 +112,6 @@ int find_num_contig(struct mm_struct *mm, unsigned long addr,
return CONT_PTES;
}
-/*
- * Changing some bits of contiguous entries requires us to follow a
- * Break-Before-Make approach, breaking the whole contiguous set
- * before we can change any entries. See ARM DDI 0487A.k_iss10775,
- * "Misprogramming of the Contiguous bit", page D4-1762.
- *
- * This helper performs the break step.
- */
-static pte_t get_clear_contig(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pgsize,
- unsigned long ncontig)
-{
- pte_t orig_pte = __ptep_get(ptep);
- unsigned long i;
-
- for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
- pte_t pte = __ptep_get_and_clear(mm, addr, ptep);
-
- /*
- * If HW_AFDBM is enabled, then the HW could turn on
- * the dirty or accessed bit for any page in the set,
- * so check them all.
- */
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
- }
- return orig_pte;
-}
-
-static pte_t get_clear_contig_flush(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pgsize,
- unsigned long ncontig)
-{
- pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig);
- struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
-
- flush_tlb_range(&vma, addr, addr + (pgsize * ncontig));
- return orig_pte;
-}
-
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
@@ -277,20 +230,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- struct mm_struct *mm = vma->vm_mm;
- size_t pgsize;
- int ncontig;
-
- if (!pte_cont(__ptep_get(ptep)))
- return ptep_clear_flush(vma, addr, ptep);
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
- return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
-}
-
static int __init hugetlbpage_init(void)
{
if (pud_sect_supported())
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 440d3bde88f2..47333efa2d83 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -121,42 +121,6 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
return 0UL;
}
-static pte_t get_clear_contig(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pte_num)
-{
- pte_t orig_pte = ptep_get(ptep);
- unsigned long i;
-
- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
- pte_t pte = ptep_get_and_clear(mm, addr, ptep);
-
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
- }
-
- return orig_pte;
-}
-
-static pte_t get_clear_contig_flush(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pte_num)
-{
- pte_t orig_pte = get_clear_contig(mm, addr, ptep, pte_num);
- struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
- bool valid = !pte_none(orig_pte);
-
- if (valid)
- flush_tlb_range(&vma, addr, addr + (PAGE_SIZE * pte_num));
-
- return orig_pte;
-}
-
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
unsigned long order;
@@ -173,21 +137,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}
-pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long addr,
- pte_t *ptep)
-{
- pte_t pte = ptep_get(ptep);
- int pte_num;
-
- if (!pte_napot(pte))
- return ptep_clear_flush(vma, addr, ptep);
-
- pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, NULL);
-
- return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
-}
-
static bool is_napot_size(unsigned long size)
{
unsigned long order;
diff --git a/mm/contpte.c b/mm/contpte.c
index 8ad2a3099dfd..15791f6d9c41 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -27,6 +27,7 @@
* - huge_ptep_get_and_clear()
* - huge_ptep_set_access_flags()
* - huge_ptep_set_wrprotect()
+ * - huge_ptep_clear_flush()
*/
pte_t huge_ptep_get(pte_t *ptep)
@@ -255,3 +256,17 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
}
+
+pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ size_t pgsize;
+ int ncontig;
+
+ if (!pte_cont(__ptep_get(ptep)))
+ return ptep_clear_flush(vma, addr, ptep);
+
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
+ return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
+}
--
2.39.2
On 08/05/2024 12:34, Alexandre Ghiti wrote:
> The core mm code expects to be able to extract the pfn from a pte. NAPOT
> mappings work differently since its ptes actually point to the first pfn
> of the mapping, the other bits being used to encode the size of the
> mapping.
>
> So modify ptep_get() so that it returns a pte value that contains the
> *real* pfn (which is then different from what the HW expects) and right
> before storing the ptes to the page table, reset the pfn LSBs to the
> size of the mapping.
Did you consider leaving the pte as is and instead modifying your pte_pfn()
implementation?
For arm64 at least, it is beneficial to keep the pte marked as contiguous when
passing it up to core-mm because there are other helpers which need to parse the
contiguous bit (e.g. pte_leaf_size()). If we were to clear the cont bit in
ptep_get() that info would be lost and perf_get_pgtable_size() would always
conclude the leaf size is 4K even when it is actually 64K.
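To illustrate, this is roughly what that helper boils down to at the pte
level (a simplified sketch, not the exact kernel code):

static inline unsigned long pte_leaf_size(pte_t pte)
{
	/* The cont bit is the only record that this leaf spans CONT_PTE_SIZE. */
	return pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE;
}

If ptep_get() dropped the cont bit, this would always report PAGE_SIZE
for contpte leaves.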
>
> And make sure that all NAPOT mappings are set using set_ptes().
>
> Signed-off-by: Alexandre Ghiti <[email protected]>
> ---
> arch/riscv/include/asm/pgtable-64.h | 11 +++
> arch/riscv/include/asm/pgtable.h | 105 ++++++++++++++++++++++++++--
> arch/riscv/mm/hugetlbpage.c | 38 +++++-----
> 3 files changed, 128 insertions(+), 26 deletions(-)
>
> diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> index 221a5c1ee287..9fe076fc503e 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -106,6 +106,17 @@ enum napot_cont_order {
> #define napot_cont_mask(order) (~(napot_cont_size(order) - 1UL))
> #define napot_pte_num(order) BIT(order)
>
> +static inline bool is_napot_order(unsigned int order)
> +{
> + unsigned int napot_order;
> +
> + for_each_napot_order(napot_order)
> + if (order == napot_order)
> + return true;
> +
> + return false;
> +}
> +
> #ifdef CONFIG_RISCV_ISA_SVNAPOT
> #define HUGE_MAX_HSTATE (2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
> #else
> diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> index 9f8ea0e33eb1..268c828f5152 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -297,6 +297,8 @@ static inline unsigned long pte_napot(pte_t pte)
> return pte_val(pte) & _PAGE_NAPOT;
> }
>
> +#define pte_valid_napot(pte) (pte_present(pte) && pte_napot(pte))
> +
> static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> {
> int pos = order - 1 + _PAGE_PFN_SHIFT;
> @@ -306,6 +308,12 @@ static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
> }
>
> +/* pte at entry must *not* encode the mapping size in the pfn LSBs. */
> +static inline pte_t pte_clear_napot(pte_t pte)
> +{
> + return __pte(pte_val(pte) & ~_PAGE_NAPOT);
> +}
> +
> #else
>
> static __always_inline bool has_svnapot(void) { return false; }
> @@ -315,17 +323,14 @@ static inline unsigned long pte_napot(pte_t pte)
> return 0;
> }
>
> +#define pte_valid_napot(pte) false
> +
> #endif /* CONFIG_RISCV_ISA_SVNAPOT */
>
> /* Yields the page frame number (PFN) of a page table entry */
> static inline unsigned long pte_pfn(pte_t pte)
> {
> - unsigned long res = __page_val_to_pfn(pte_val(pte));
> -
> - if (has_svnapot() && pte_napot(pte))
> - res = res & (res - 1UL);
> -
> - return res;
> + return __page_val_to_pfn(pte_val(pte));
> }
>
> #define pte_page(x) pfn_to_page(pte_pfn(x))
> @@ -525,9 +530,91 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
>
> #define PFN_PTE_SHIFT _PAGE_PFN_SHIFT
>
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> +static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
> + size_t *pgsize)
> +{
> + pte_t __pte;
> +
> + /* We must read the raw value of the pte to get the size of the mapping */
> + __pte = READ_ONCE(*ptep);
> +
> + if (pgsize) {
> + if (size >= PGDIR_SIZE)
> + *pgsize = PGDIR_SIZE;
> + else if (size >= P4D_SIZE)
> + *pgsize = P4D_SIZE;
> + else if (size >= PUD_SIZE)
> + *pgsize = PUD_SIZE;
> + else if (size >= PMD_SIZE)
> + *pgsize = PMD_SIZE;
> + else
> + *pgsize = PAGE_SIZE;
> + }
> +
> + /* Make sure __pte is not a swap entry */
> + if (pte_valid_napot(__pte))
> + return napot_pte_num(napot_cont_order(__pte));
> +
> + return 1;
> +}
> +#endif
> +
> +static inline pte_t ptep_get(pte_t *ptep)
> +{
> + pte_t pte = READ_ONCE(*ptep);
> +
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> + /*
> + * The pte we load has the N bit set and the size of the mapping in
> + * the pfn LSBs: keep the N bit and replace the mapping size with
> + * the *real* pfn since the core mm code expects to find it there.
> + * The mapping size will be reset just before being written to the
> + * page table in set_ptes().
> + */
> + if (unlikely(pte_valid_napot(pte))) {
> + unsigned int order = napot_cont_order(pte);
> + int pos = order - 1 + _PAGE_PFN_SHIFT;
> + unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
> + pte_t *orig_ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * napot_pte_num(order));
> +
> + pte = __pte((pte_val(pte) & napot_mask) + ((ptep - orig_ptep) << _PAGE_PFN_SHIFT));
> + }
> +#endif
> +
> + return pte;
> +}
> +#define ptep_get ptep_get
> +
> static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
> pte_t *ptep, pte_t pteval, unsigned int nr)
> {
> +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> + if (unlikely(pte_valid_napot(pteval))) {
> + unsigned int order = ilog2(nr);
> +
> + if (!is_napot_order(order)) {
> + /*
> + * Something's weird, we are given a NAPOT pte but the
> + * size of the mapping is not a known NAPOT mapping
> + * size, so clear the NAPOT bit and map this without
> + * NAPOT support: core mm only manipulates pte with the
> + * real pfn so we know the pte is valid without the N
> + * bit.
> + */
> + pr_err("Incorrect NAPOT mapping, resetting.\n");
> + pteval = pte_clear_napot(pteval);
> + } else {
> + /*
> + * NAPOT ptes that arrive here only have the N bit set
> + * and their pfn does not contain the mapping size, so
> + * set that here.
> + */
> + pteval = pte_mknapot(pteval, order);
> + }
> + }
> +#endif
I think all this complexity comes along due to using this function both as a
public interface that the core-mm uses (which never sets napot) and also using
it as an internal interface that riscv-hugetlb uses (which does set napot)? It
might be more understandable if you layer it into a lower level/internal API and
a higher level/public API (similar to arm64)?
On 08/05/2024 12:34, Alexandre Ghiti wrote:
> This patchset intends to merge the contiguous ptes hugetlbfs implementation
> of arm64 and riscv.
>
> Both arm64 and riscv support the use of contiguous ptes to map pages that
> are larger than the default page table size, respectively called contpte
> and svnapot.
>
> The riscv implementation differs from the arm64's in that the LSBs of the
> pfn of a svnapot pte are used to store the size of the mapping, allowing
> for future sizes to be added (for now only 64KB is supported). That's an
> issue for the core mm code which expects to find the *real* pfn a pte points
> to. Patch 1 fixes that by always returning svnapot ptes with the real pfn
> and restores the size of the mapping when it is written to a page table.
>
> The following patches are just merges of the 2 different implementations
> that currently exist in arm64 and riscv which are very similar. It paves
> the way to the reuse of the recent contpte THP work by Ryan [1] to avoid
> reimplementing the same in riscv.
Hi Alexandre,
I've skimmed through this series and the one that moves contpte. I can see there
is definitely value in sharing the implementation, and the rough shape of things
seems appropriate. I had some minor concerns about making it harder to implement
potential future arm64 errata workarounds but on reflection, most of the
now-shared code is really just wrapping the primitives that are still arch-specific.
I'm going to need to spend proper time reviewing it to give detailed feedback,
but I'll be out on paternity leave for 3 weeks from end of Monday at the latest.
So realistically I won't be able to do the detailed review until at least the
first week of June.
Some high level thoughts:
- huge_ptep_* functions could be working on different sized huge ptes - arm64
supports contpte, pmd, contpmd and pud. Is keeping them in contpte.c
appropriate? Perhaps it's better to keep huge_pte and contpte separate? Also, it
only works on arm64 because we can get away with calling the lower-level pte
functions even when the huge_pte is actually a contpmd/pmd/pud, because the
format is the same. That might present challenges to other arches if the format
is different?
- It might be easier to review if the arm64 stuff is first moved (without
changes) then modified to make it suitable for riscv, then for riscv to be
hooked up. At the moment I'm trying to follow all 3 parts per-function.
Thanks,
Ryan
Hi Ryan,
On Fri, May 10, 2024 at 3:49 PM Ryan Roberts <[email protected]> wrote:
>
> On 08/05/2024 12:34, Alexandre Ghiti wrote:
> > This patchset intends to merge the contiguous ptes hugetlbfs implementation
> > of arm64 and riscv.
> >
> > Both arm64 and riscv support the use of contiguous ptes to map pages that
> > are larger than the default page table size, respectively called contpte
> > and svnapot.
> >
> > The riscv implementation differs from the arm64's in that the LSBs of the
> > pfn of a svnapot pte are used to store the size of the mapping, allowing
> > for future sizes to be added (for now only 64KB is supported). That's an
> > issue for the core mm code which expects to find the *real* pfn a pte points
> > to. Patch 1 fixes that by always returning svnapot ptes with the real pfn
> > and restores the size of the mapping when it is written to a page table.
> >
> > The following patches are just merges of the 2 different implementations
> > that currently exist in arm64 and riscv which are very similar. It paves
> > the way to the reuse of the recent contpte THP work by Ryan [1] to avoid
> > reimplementing the same in riscv.
>
> Hi Alexandre,
>
> I've skimmed through this series and the one that moves contpte. I can see there
> is definitely value in sharing the implementation, and the rough shape of things
> seems appropriate. I had some minor concerns about making it harder to implement
> potential future arm64 errata workarounds but on reflection, most of the
> now-shared code is really just wrapping the primitives that are still arch-specific.
>
> I'm going to need to spend proper time reviewing it to give detailed feedback,
> but I'll be out on paternity leave for 3 weeks from end of Monday at the latest.
Too bad, I was hoping to discuss that with you at LSF/MM... But congrats!
Hope your wife is doing well :)
> So realistically I won't be able to do the detailed review until at least the
> first week of June.
>
> Some high level thoughts:
>
> - huge_ptep_* functions could be working on different sized huge ptes - arm64
> supports contpte, pmd, contpmd and pud. Is keeping them in contpte.c
> appropriate?
Hmm indeed, I'll see what I can do.
> Perhaps it's better to keep huge_pte and contpte separate? Also, it
> only works on arm64 because we can get away with calling the lower-level pte
> functions even when the huge_pte is actually a contpmd/pmd/pud, because the
> format is the same. That might present challenges to other arches if the format
> is different?
Yes, but I think that if that happens, we could get away with it by
choosing the right function depending on the size of the mapping?
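Something like this, say (an entirely hypothetical sketch just to
illustrate the dispatch; the cast assumes the levels share the same
table layout even when the entry formats differ):

static inline pte_t contpte_ptep_get_any(pte_t *ptep, unsigned long sz)
{
	/* Pick the primitive that matches the size of the mapping. */
	if (sz >= PMD_SIZE)
		return pmd_pte(pmdp_get((pmd_t *)ptep));

	return ptep_get(ptep);
}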
>
> - It might be easier to review if the arm64 stuff is first moved (without
> changes) then modified to make it suitable for riscv, then for riscv to be
> hooked up. At the moment I'm trying to follow all 3 parts per-function.
Ok, let me give it a try during your paternity leave!
>
> Thanks,
> Ryan
Thanks,
Alex
>
On 12/05/2024 18:25, Alexandre Ghiti wrote:
> Hi Ryan,
>
> On Fri, May 10, 2024 at 3:49 PM Ryan Roberts <[email protected]> wrote:
>>
>> On 08/05/2024 12:34, Alexandre Ghiti wrote:
>>> This patchset intends to merge the contiguous ptes hugetlbfs implementation
>>> of arm64 and riscv.
>>>
>>> Both arm64 and riscv support the use of contiguous ptes to map pages that
>>> are larger than the default page table size, respectively called contpte
>>> and svnapot.
>>>
>>> The riscv implementation differs from the arm64's in that the LSBs of the
>>> pfn of a svnapot pte are used to store the size of the mapping, allowing
>>> for future sizes to be added (for now only 64KB is supported). That's an
>>> issue for the core mm code which expects to find the *real* pfn a pte points
>>> to. Patch 1 fixes that by always returning svnapot ptes with the real pfn
>>> and restores the size of the mapping when it is written to a page table.
>>>
>>> The following patches are just merges of the 2 different implementations
>>> that currently exist in arm64 and riscv which are very similar. It paves
>>> the way to the reuse of the recent contpte THP work by Ryan [1] to avoid
>>> reimplementing the same in riscv.
>>
>> Hi Alexandre,
>>
>> I've skimmed through this series and the one that moves contpte. I can see there
>> is definitely value in sharing the implementation, and the rough shape of things
>> seems appropriate. I had some minor concerns about making it harder to implement
>> potential future arm64 errata workarounds but on reflection, most of the
>> now-shared code is really just wrapping the primitives that are still arch-specific.
>>
>> I'm going to need to spend proper time reviewing it to give detailed feedback,
>> but I'll be out on paternity leave for 3 weeks from end of Monday at the latest.
>
> Too bad, I expected to discuss that with you at LSF/MM...But congrats!
> Hope your wife is fine :)
Thanks! Yes, it's unfortunate timing - there are a few topics I would have liked
to get involved with. There's always next year...
>
>> So realistically I won't be able to do the detailed review until at least the
>> first week of June.
>>
>> Some high level thoughts:
>>
>> - huge_ptep_* functions could be working on different sized huge ptes - arm64
>> supports contpte, pmd, contpmd and pud. Is keeping them in contpte.c
>> appropriate?
>
> Hmm indeed, I'll see what I can do.
>
>> Perhaps it's better to keep huge_pte and contpte separate? Also, it
>> only works on arm64 because we can get away with calling the lower-level pte
>> functions even when the huge_pte is actually a contpmd/pmd/pud, because the
>> format is the same. That might present challenges to other arches if the format
>> is different?
>
> Yes, but I think that if that happens, we could get away with it by
> choosing the right function depending on the size of the mapping?
Yes possibly. One potential future user of this common code would be arm32
(arch/arm), which also has the contig bit. But AIUI, the pmd and pte formats are
quite different. It's likely that arm would want to opt in to contpte but not
huge_ptep, so separate selectors may be valuable.
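e.g. something like this in mm/Kconfig (the symbol names here are
hypothetical):

config ARCH_WANT_GENERIC_CONTPTE
	bool

config ARCH_WANT_GENERIC_HUGE_PTEP
	bool
	depends on ARCH_WANT_GENERIC_CONTPTE

That way arm could select the first without buying into the second.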
>
>>
>> - It might be easier to review if the arm64 stuff is first moved (without
>> changes) then modified to make it suitable for riscv, then for riscv to be
>> hooked up. At the moment I'm trying to follow all 3 parts per-function.
>
> Ok, let me give it a try during your paternity leave!
Thanks! If it's too difficult then it's not a deal-breaker. Perhaps just an
initial patch to move the existing arm functions to core-mm without change, then
all your existing patches on top of that would do the job?
Hi Ryan,
On Fri, May 10, 2024 at 2:20 PM Ryan Roberts <[email protected]> wrote:
>
> On 08/05/2024 12:34, Alexandre Ghiti wrote:
> > The core mm code expects to be able to extract the pfn from a pte. NAPOT
> > mappings work differently since its ptes actually point to the first pfn
> > of the mapping, the other bits being used to encode the size of the
> > mapping.
> >
> > So modify ptep_get() so that it returns a pte value that contains the
> > *real* pfn (which is then different from what the HW expects) and right
> > before storing the ptes to the page table, reset the pfn LSBs to the
> > size of the mapping.
>
> Did you consider leaving the pte as is and instead modifying your pte_pfn()
> implementation?
>
> For arm64 at least, it is beneficial to keep the pte marked as contiguous when
> passing it up to core-mm because there are other helpers which need to parse the
> contiguous bit (e.g. pte_leaf_size()). If we were to clear the cont bit in
> ptep_get() that info would be lost and perf_get_pgtable_size() would always
> conclude the leaf size is 4K even when it is actually 64K.
I don't clear the contpte bit here (i.e. the napot bit); I'm just
setting the right pfn so that the core-mm code knows exactly which
page is targeted by each pte of a contpte region (remember the riscv
napot extension uses the LSBs of the pte pfn to encode the size of the
mapping, so all ptes of a contpte region carry the same raw pfn).
And in pte_pfn(), we have no way of knowing from the pte value alone
which page is targeted: we need to know the pte's position in the page
table to recover the right pfn.
>
> >
> > And make sure that all NAPOT mappings are set using set_ptes().
> >
> > Signed-off-by: Alexandre Ghiti <[email protected]>
> > ---
> > arch/riscv/include/asm/pgtable-64.h | 11 +++
> > arch/riscv/include/asm/pgtable.h | 105 ++++++++++++++++++++++++++--
> > arch/riscv/mm/hugetlbpage.c | 38 +++++-----
> > 3 files changed, 128 insertions(+), 26 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
> > index 221a5c1ee287..9fe076fc503e 100644
> > --- a/arch/riscv/include/asm/pgtable-64.h
> > +++ b/arch/riscv/include/asm/pgtable-64.h
> > @@ -106,6 +106,17 @@ enum napot_cont_order {
> > #define napot_cont_mask(order) (~(napot_cont_size(order) - 1UL))
> > #define napot_pte_num(order) BIT(order)
> >
> > +static inline bool is_napot_order(unsigned int order)
> > +{
> > + unsigned int napot_order;
> > +
> > + for_each_napot_order(napot_order)
> > + if (order == napot_order)
> > + return true;
> > +
> > + return false;
> > +}
> > +
> > #ifdef CONFIG_RISCV_ISA_SVNAPOT
> > #define HUGE_MAX_HSTATE (2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
> > #else
> > diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
> > index 9f8ea0e33eb1..268c828f5152 100644
> > --- a/arch/riscv/include/asm/pgtable.h
> > +++ b/arch/riscv/include/asm/pgtable.h
> > @@ -297,6 +297,8 @@ static inline unsigned long pte_napot(pte_t pte)
> > return pte_val(pte) & _PAGE_NAPOT;
> > }
> >
> > +#define pte_valid_napot(pte) (pte_present(pte) && pte_napot(pte))
> > +
> > static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> > {
> > int pos = order - 1 + _PAGE_PFN_SHIFT;
> > @@ -306,6 +308,12 @@ static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
> > return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
> > }
> >
> > +/* pte at entry must *not* encode the mapping size in the pfn LSBs. */
> > +static inline pte_t pte_clear_napot(pte_t pte)
> > +{
> > + return __pte(pte_val(pte) & ~_PAGE_NAPOT);
> > +}
> > +
> > #else
> >
> > static __always_inline bool has_svnapot(void) { return false; }
> > @@ -315,17 +323,14 @@ static inline unsigned long pte_napot(pte_t pte)
> > return 0;
> > }
> >
> > +#define pte_valid_napot(pte) false
> > +
> > #endif /* CONFIG_RISCV_ISA_SVNAPOT */
> >
> > /* Yields the page frame number (PFN) of a page table entry */
> > static inline unsigned long pte_pfn(pte_t pte)
> > {
> > - unsigned long res = __page_val_to_pfn(pte_val(pte));
> > -
> > - if (has_svnapot() && pte_napot(pte))
> > - res = res & (res - 1UL);
> > -
> > - return res;
> > + return __page_val_to_pfn(pte_val(pte));
> > }
> >
> > #define pte_page(x) pfn_to_page(pte_pfn(x))
> > @@ -525,9 +530,91 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
> >
> > #define PFN_PTE_SHIFT _PAGE_PFN_SHIFT
> >
> > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > +static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
> > + size_t *pgsize)
> > +{
> > + pte_t __pte;
> > +
> > + /* We must read the raw value of the pte to get the size of the mapping */
> > + __pte = READ_ONCE(*ptep);
> > +
> > + if (pgsize) {
> > + if (size >= PGDIR_SIZE)
> > + *pgsize = PGDIR_SIZE;
> > + else if (size >= P4D_SIZE)
> > + *pgsize = P4D_SIZE;
> > + else if (size >= PUD_SIZE)
> > + *pgsize = PUD_SIZE;
> > + else if (size >= PMD_SIZE)
> > + *pgsize = PMD_SIZE;
> > + else
> > + *pgsize = PAGE_SIZE;
> > + }
> > +
> > + /* Make sure __pte is not a swap entry */
> > + if (pte_valid_napot(__pte))
> > + return napot_pte_num(napot_cont_order(__pte));
> > +
> > + return 1;
> > +}
> > +#endif
> > +
> > +static inline pte_t ptep_get(pte_t *ptep)
> > +{
> > + pte_t pte = READ_ONCE(*ptep);
> > +
> > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > + /*
> > + * The pte we load has the N bit set and the size of the mapping in
> > + * the pfn LSBs: keep the N bit and replace the mapping size with
> > + * the *real* pfn since the core mm code expects to find it there.
> > + * The mapping size will be reset just before being written to the
> > + * page table in set_ptes().
> > + */
> > + if (unlikely(pte_valid_napot(pte))) {
> > + unsigned int order = napot_cont_order(pte);
> > + int pos = order - 1 + _PAGE_PFN_SHIFT;
> > + unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
> > + pte_t *orig_ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * napot_pte_num(order));
> > +
> > + pte = __pte((pte_val(pte) & napot_mask) + ((ptep - orig_ptep) << _PAGE_PFN_SHIFT));
> > + }
> > +#endif
> > +
> > + return pte;
> > +}
> > +#define ptep_get ptep_get
> > +
> > static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
> > pte_t *ptep, pte_t pteval, unsigned int nr)
> > {
> > +#ifdef CONFIG_RISCV_ISA_SVNAPOT
> > + if (unlikely(pte_valid_napot(pteval))) {
> > + unsigned int order = ilog2(nr);
> > +
> > + if (!is_napot_order(order)) {
> > + /*
> > + * Something's weird, we are given a NAPOT pte but the
> > + * size of the mapping is not a known NAPOT mapping
> > + * size, so clear the NAPOT bit and map this without
> > + * NAPOT support: core mm only manipulates pte with the
> > + * real pfn so we know the pte is valid without the N
> > + * bit.
> > + */
> > + pr_err("Incorrect NAPOT mapping, resetting.\n");
> > + pteval = pte_clear_napot(pteval);
> > + } else {
> > + /*
> > + * NAPOT ptes that arrive here only have the N bit set
> > + * and their pfn does not contain the mapping size, so
> > + * set that here.
> > + */
> > + pteval = pte_mknapot(pteval, order);
> > + }
> > + }
> > +#endif
>
> I think all this complexity comes along due to using this function both as a
> public interface that the core-mm uses (which never sets napot)
> and also using
> it as an internal interface that riscv-hugetlb uses (which does set napot)? It
> might be more understandable if you layer it into a lower level/internal API and
> a higher level/public API (similar to arm64)?
I think you're right here, I'll try to do that too.
Thanks for your comments,
Alex
On 13/05/2024 14:06, Alexandre Ghiti wrote:
> Hi Ryan,
>
> On Fri, May 10, 2024 at 2:20 PM Ryan Roberts <[email protected]> wrote:
>>
>> On 08/05/2024 12:34, Alexandre Ghiti wrote:
>>> The core mm code expects to be able to extract the pfn from a pte. NAPOT
>>> mappings work differently since its ptes actually point to the first pfn
>>> of the mapping, the other bits being used to encode the size of the
>>> mapping.
>>>
>>> So modify ptep_get() so that it returns a pte value that contains the
>>> *real* pfn (which is then different from what the HW expects) and right
>>> before storing the ptes to the page table, reset the pfn LSBs to the
>>> size of the mapping.
>>
>> Did you consider leaving the pte as is and instead modifying your pte_pfn()
>> implementation?
>>
>> For arm64 at least, it is beneficial to keep the pte marked as contiguous when
>> passing it up to core-mm because there are other helpers which need to parse the
>> contiguous bit (e.g. pte_leaf_size()). If we were to clear the cont bit in
>> ptep_get() that info would be lost and perf_get_pgtable_size() would always
>> conclude the leaf size is 4K even when it is actually 64K.
>
> I don't clear the contpte bit here (ie the napot bit), I'm just
> setting the right pfn so that the core-mm code knows exactly which
> page is targeted by each pte of a contpte region (remember riscv napot
> extension uses the lsb of the pte pfn to encode the size the mapping,
> so all ptes of a contpte region will return the same pfn).
>
> And from pte_pfn(), we have no way of knowing from the pte value alone
> which page is targeted, we need to know its position in the page table
> to "guess" the right pfn.
Ahh yes - good point!
Hi Ryan,
On 12/05/2024 19:25, Alexandre Ghiti wrote:
> Hi Ryan,
>
> On Fri, May 10, 2024 at 3:49 PM Ryan Roberts <[email protected]> wrote:
>> On 08/05/2024 12:34, Alexandre Ghiti wrote:
>>> This patchset intends to merge the contiguous ptes hugetlbfs implementation
>>> of arm64 and riscv.
>>>
>>> Both arm64 and riscv support the use of contiguous ptes to map pages that
>>> are larger than the default page table size, respectively called contpte
>>> and svnapot.
>>>
>>> The riscv implementation differs from the arm64's in that the LSBs of the
>>> pfn of a svnapot pte are used to store the size of the mapping, allowing
>>> for future sizes to be added (for now only 64KB is supported). That's an
>>> issue for the core mm code which expects to find the *real* pfn a pte points
>>> to. Patch 1 fixes that by always returning svnapot ptes with the real pfn
>>> and restores the size of the mapping when it is written to a page table.
>>>
>>> The following patches are just merges of the 2 different implementations
>>> that currently exist in arm64 and riscv which are very similar. It paves
>>> the way to the reuse of the recent contpte THP work by Ryan [1] to avoid
>>> reimplementing the same in riscv.
>> Hi Alexandre,
>>
>> I've skimmed through this series and the one that moves contpte. I can see there
>> is definitely value in sharing the implementation, and the rough shape of things
>> seems appropriate. I had some minor concerns about making it harder to implement
>> potential future arm64 errata workarounds but on reflection, most of the
>> now-shared code is really just wrapping the primitives that are still arch-specific.
>>
>> I'm going to need to spend proper time reviewing it to give detailed feedback,
>> but I'll be out on paternity leave for 3 weeks from end of Monday at the latest.
> Too bad, I expected to discuss that with you at LSF/MM...But congrats!
> Hope your wife is fine :)
>
>> So realistically I won't be able to do the detailed review until at least the
>> first week of June.
>>
>> Some high level thoughts:
>>
>> - huge_ptep_* functions could be working on different sized huge ptes - arm64
>> supports contpte, pmd, contpmd and pud. Is keeping them in contpte.c
>> appropriate?
> Hmm indeed, I'll see what I can do.
So I took a look at that. It amounts to doing the same thing as for the
THP contptes, i.e. having both a contpte-aware API and a "normal" one.
Take huge_ptep_get() as an example; below is what I get. To me it's not
that bad, so I'll implement this unless there is strong opposition.
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index f8efbc128446..869a9aae6c68 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1715,6 +1715,16 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
contpte_clear_young_dirty_ptes(vma, addr, ptep, nr, flags);
}
+static inline pte_t huge_ptep_get(pte_t *ptep)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (!pte_present(orig_pte) || !pte_cont(orig_pte))
+ return orig_pte;
+
+ return contpte_huge_ptep_get(ptep);
+}
+
#else /* CONFIG_ARM64_CONTPTE */
#define ptep_get __ptep_get
@@ -1736,6 +1746,8 @@ static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
#define ptep_set_access_flags __ptep_set_access_flags
#define clear_young_dirty_ptes __clear_young_dirty_ptes
+#define huge_ptep_get __ptep_get
+
#endif /* CONFIG_ARM64_CONTPTE */
#endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 3f09ac73cce3..aa0ee3f02226 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -127,28 +127,6 @@ static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
return contig_ptes;
}
-pte_t huge_ptep_get(pte_t *ptep)
-{
- int ncontig, i;
- size_t pgsize;
- pte_t orig_pte = __ptep_get(ptep);
-
- if (!pte_present(orig_pte) || !pte_cont(orig_pte))
- return orig_pte;
-
- ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize);
- for (i = 0; i < ncontig; i++, ptep++) {
- pte_t pte = __ptep_get(ptep);
-
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
- }
- return orig_pte;
-}
-
/*
* Changing some bits of contiguous entries requires us to follow a
* Break-Before-Make approach, breaking the whole contiguous set
diff --git a/mm/contpte.c b/mm/contpte.c
new file mode 100644
index 000000000000..4e742cf00b6f
--- /dev/null
+++ b/mm/contpte.c
@@ -0,0 +1,18 @@
+pte_t contpte_huge_ptep_get(pte_t *ptep)
+{
+ int ncontig, i;
+ size_t pgsize;
+ pte_t orig_pte = __ptep_get(ptep);
+
+ ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize);
+ for (i = 0; i < ncontig; i++, ptep++) {
+ pte_t pte = __ptep_get(ptep);
+
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+ return orig_pte;
+}