2024-05-08 11:34:49

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 0/9] Merge arm64/riscv hugetlbfs contpte support

This patchset intends to merge the contiguous ptes hugetlbfs implementation
of arm64 and riscv.

Both arm64 and riscv support the use of contiguous ptes to map pages that
are larger than the default page table size, respectively called contpte
and svnapot.

The riscv implementation differs from the arm64's in that the LSBs of the
pfn of a svnapot pte are used to store the size of the mapping, allowing
for future sizes to be added (for now only 64KB is supported). That's an
issue for the core mm code which expects to find the *real* pfn a pte points
to. Patch 1 fixes that by always returning svnapot ptes with the real pfn
and restores the size of the mapping when it is written to a page table.

The following patches are just merges of the 2 different implementations
that currently exist in arm64 and riscv which are very similar. It paves
the way to the reuse of the recent contpte THP work by Ryan [1] to avoid
reimplementing the same in riscv.

This patchset was tested by running the libhugetlbfs testsuite with 64KB
and 2MB pages on both architectures (on a 4KB base page size arm64 kernel).

[1] https://lore.kernel.org/linux-arm-kernel/[email protected]/

Changes in v2:
- Rebase on top of 6.9-rc3

Alexandre Ghiti (9):
riscv: Restore the pfn in a NAPOT pte when manipulated by core mm code
riscv: Safely remove huge_pte_offset() when manipulating NAPOT ptes
mm: Use common huge_ptep_get() function for riscv/arm64
mm: Use common set_huge_pte_at() function for riscv/arm64
mm: Use common huge_pte_clear() function for riscv/arm64
mm: Use common huge_ptep_get_and_clear() function for riscv/arm64
mm: Use common huge_ptep_set_access_flags() function for riscv/arm64
mm: Use common huge_ptep_set_wrprotect() function for riscv/arm64
mm: Use common huge_ptep_clear_flush() function for riscv/arm64

arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/pgtable.h | 56 +++++-
arch/arm64/mm/hugetlbpage.c | 291 +---------------------------
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/hugetlb.h | 2 +-
arch/riscv/include/asm/pgtable-64.h | 11 ++
arch/riscv/include/asm/pgtable.h | 153 +++++++++++++--
arch/riscv/mm/hugetlbpage.c | 227 ----------------------
arch/riscv/mm/pgtable.c | 6 +-
mm/Kconfig | 3 +
mm/Makefile | 1 +
mm/contpte.c | 272 ++++++++++++++++++++++++++
12 files changed, 480 insertions(+), 544 deletions(-)
create mode 100644 mm/contpte.c

--
2.39.2



2024-05-08 11:35:37

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 1/9] riscv: Restore the pfn in a NAPOT pte when manipulated by core mm code

The core mm code expects to be able to extract the pfn from a pte. NAPOT
mappings work differently since its ptes actually point to the first pfn
of the mapping, the other bits being used to encode the size of the
mapping.

So modify ptep_get() so that it returns a pte value that contains the
*real* pfn (which is then different from what the HW expects) and right
before storing the ptes to the page table, reset the pfn LSBs to the
size of the mapping.

And make sure that all NAPOT mappings are set using set_ptes().

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/riscv/include/asm/pgtable-64.h | 11 +++
arch/riscv/include/asm/pgtable.h | 105 ++++++++++++++++++++++++++--
arch/riscv/mm/hugetlbpage.c | 38 +++++-----
3 files changed, 128 insertions(+), 26 deletions(-)

diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 221a5c1ee287..9fe076fc503e 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -106,6 +106,17 @@ enum napot_cont_order {
#define napot_cont_mask(order) (~(napot_cont_size(order) - 1UL))
#define napot_pte_num(order) BIT(order)

+static inline bool is_napot_order(unsigned int order)
+{
+ unsigned int napot_order;
+
+ for_each_napot_order(napot_order)
+ if (order == napot_order)
+ return true;
+
+ return false;
+}
+
#ifdef CONFIG_RISCV_ISA_SVNAPOT
#define HUGE_MAX_HSTATE (2 + (NAPOT_ORDER_MAX - NAPOT_CONT_ORDER_BASE))
#else
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 9f8ea0e33eb1..268c828f5152 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -297,6 +297,8 @@ static inline unsigned long pte_napot(pte_t pte)
return pte_val(pte) & _PAGE_NAPOT;
}

+#define pte_valid_napot(pte) (pte_present(pte) && pte_napot(pte))
+
static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
{
int pos = order - 1 + _PAGE_PFN_SHIFT;
@@ -306,6 +308,12 @@ static inline pte_t pte_mknapot(pte_t pte, unsigned int order)
return __pte((pte_val(pte) & napot_mask) | napot_bit | _PAGE_NAPOT);
}

+/* pte at entry must *not* encode the mapping size in the pfn LSBs. */
+static inline pte_t pte_clear_napot(pte_t pte)
+{
+ return __pte(pte_val(pte) & ~_PAGE_NAPOT);
+}
+
#else

static __always_inline bool has_svnapot(void) { return false; }
@@ -315,17 +323,14 @@ static inline unsigned long pte_napot(pte_t pte)
return 0;
}

+#define pte_valid_napot(pte) false
+
#endif /* CONFIG_RISCV_ISA_SVNAPOT */

/* Yields the page frame number (PFN) of a page table entry */
static inline unsigned long pte_pfn(pte_t pte)
{
- unsigned long res = __page_val_to_pfn(pte_val(pte));
-
- if (has_svnapot() && pte_napot(pte))
- res = res & (res - 1UL);
-
- return res;
+ return __page_val_to_pfn(pte_val(pte));
}

#define pte_page(x) pfn_to_page(pte_pfn(x))
@@ -525,9 +530,91 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)

#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT

+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+ size_t *pgsize)
+{
+ pte_t __pte;
+
+ /* We must read the raw value of the pte to get the size of the mapping */
+ __pte = READ_ONCE(*ptep);
+
+ if (pgsize) {
+ if (size >= PGDIR_SIZE)
+ *pgsize = PGDIR_SIZE;
+ else if (size >= P4D_SIZE)
+ *pgsize = P4D_SIZE;
+ else if (size >= PUD_SIZE)
+ *pgsize = PUD_SIZE;
+ else if (size >= PMD_SIZE)
+ *pgsize = PMD_SIZE;
+ else
+ *pgsize = PAGE_SIZE;
+ }
+
+ /* Make sure __pte is not a swap entry */
+ if (pte_valid_napot(__pte))
+ return napot_pte_num(napot_cont_order(__pte));
+
+ return 1;
+}
+#endif
+
+static inline pte_t ptep_get(pte_t *ptep)
+{
+ pte_t pte = READ_ONCE(*ptep);
+
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+ /*
+ * The pte we load has the N bit set and the size of the mapping in
+ * the pfn LSBs: keep the N bit and replace the mapping size with
+ * the *real* pfn since the core mm code expects to find it there.
+ * The mapping size will be reset just before being written to the
+ * page table in set_ptes().
+ */
+ if (unlikely(pte_valid_napot(pte))) {
+ unsigned int order = napot_cont_order(pte);
+ int pos = order - 1 + _PAGE_PFN_SHIFT;
+ unsigned long napot_mask = ~GENMASK(pos, _PAGE_PFN_SHIFT);
+ pte_t *orig_ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * napot_pte_num(order));
+
+ pte = __pte((pte_val(pte) & napot_mask) + ((ptep - orig_ptep) << _PAGE_PFN_SHIFT));
+ }
+#endif
+
+ return pte;
+}
+#define ptep_get ptep_get
+
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval, unsigned int nr)
{
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+ if (unlikely(pte_valid_napot(pteval))) {
+ unsigned int order = ilog2(nr);
+
+ if (!is_napot_order(order)) {
+ /*
+ * Something's weird, we are given a NAPOT pte but the
+ * size of the mapping is not a known NAPOT mapping
+ * size, so clear the NAPOT bit and map this without
+ * NAPOT support: core mm only manipulates pte with the
+ * real pfn so we know the pte is valid without the N
+ * bit.
+ */
+ pr_err("Incorrect NAPOT mapping, resetting.\n");
+ pteval = pte_clear_napot(pteval);
+ } else {
+ /*
+ * NAPOT ptes that arrive here only have the N bit set
+ * and their pfn does not contain the mapping size, so
+ * set that here.
+ */
+ pteval = pte_mknapot(pteval, order);
+ }
+ }
+#endif
+
page_table_check_ptes_set(mm, ptep, pteval, nr);

for (;;) {
@@ -535,6 +622,12 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
if (--nr == 0)
break;
ptep++;
+
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+ if (unlikely(pte_valid_napot(pteval)))
+ continue;
+#endif
+
pte_val(pteval) += 1 << _PAGE_PFN_SHIFT;
}
}
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 5ef2a6891158..fe8067ee71b4 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -256,8 +256,7 @@ void set_huge_pte_at(struct mm_struct *mm,

clear_flush(mm, addr, ptep, pgsize, pte_num);

- for (i = 0; i < pte_num; i++, ptep++, addr += pgsize)
- set_pte_at(mm, addr, ptep, pte);
+ set_ptes(mm, addr, ptep, pte, pte_num);
}

int huge_ptep_set_access_flags(struct vm_area_struct *vma,
@@ -267,16 +266,16 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
int dirty)
{
struct mm_struct *mm = vma->vm_mm;
- unsigned long order;
+ size_t pgsize;
pte_t orig_pte;
- int i, pte_num;
+ int pte_num;

if (!pte_napot(pte))
return ptep_set_access_flags(vma, addr, ptep, pte, dirty);

- order = napot_cont_order(pte);
- pte_num = napot_pte_num(order);
- ptep = huge_pte_offset(mm, addr, napot_cont_size(order));
+ pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
+
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);

if (pte_dirty(orig_pte))
@@ -285,8 +284,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
if (pte_young(orig_pte))
pte = pte_mkyoung(pte);

- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
- set_pte_at(mm, addr, ptep, pte);
+ set_ptes(mm, addr, ptep, pte, pte_num);

return true;
}
@@ -301,7 +299,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
if (!pte_napot(orig_pte))
return ptep_get_and_clear(mm, addr, ptep);

- pte_num = napot_pte_num(napot_cont_order(orig_pte));
+ pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);

return get_clear_contig(mm, addr, ptep, pte_num);
}
@@ -311,24 +309,23 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
pte_t *ptep)
{
pte_t pte = ptep_get(ptep);
- unsigned long order;
+ size_t pgsize;
pte_t orig_pte;
- int i, pte_num;
+ int pte_num;

if (!pte_napot(pte)) {
ptep_set_wrprotect(mm, addr, ptep);
return;
}

- order = napot_cont_order(pte);
- pte_num = napot_pte_num(order);
- ptep = huge_pte_offset(mm, addr, napot_cont_size(order));
+ pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ ptep = huge_pte_offset(mm, addr, pte_num * pgsize);
+
orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);

orig_pte = pte_wrprotect(orig_pte);

- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
- set_pte_at(mm, addr, ptep, orig_pte);
+ set_ptes(mm, addr, ptep, orig_pte, pte_num);
}

pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
@@ -341,7 +338,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
if (!pte_napot(pte))
return ptep_clear_flush(vma, addr, ptep);

- pte_num = napot_pte_num(napot_cont_order(pte));
+ pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);

return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
}
@@ -351,6 +348,7 @@ void huge_pte_clear(struct mm_struct *mm,
pte_t *ptep,
unsigned long sz)
{
+ size_t pgsize;
pte_t pte = ptep_get(ptep);
int i, pte_num;

@@ -359,8 +357,8 @@ void huge_pte_clear(struct mm_struct *mm,
return;
}

- pte_num = napot_pte_num(napot_cont_order(pte));
- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+ pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ for (i = 0; i < pte_num; i++, addr += pgsize, ptep++)
pte_clear(mm, addr, ptep);
}

--
2.39.2


2024-05-08 11:36:36

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 2/9] riscv: Safely remove huge_pte_offset() when manipulating NAPOT ptes

The pte_t pointer is expected to point to the first entry of the NAPOT
mapping so no need to use huge_pte_offset(), similarly to what is done
in arm64.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/riscv/mm/hugetlbpage.c | 3 ---
1 file changed, 3 deletions(-)

diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index fe8067ee71b4..f042f5c8bdb7 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -274,7 +274,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return ptep_set_access_flags(vma, addr, ptep, pte, dirty);

pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
- ptep = huge_pte_offset(mm, addr, pte_num * pgsize);

orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);

@@ -319,10 +318,8 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
}

pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
- ptep = huge_pte_offset(mm, addr, pte_num * pgsize);

orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
-
orig_pte = pte_wrprotect(orig_pte);

set_ptes(mm, addr, ptep, orig_pte, pte_num);
--
2.39.2


2024-05-08 11:37:39

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 3/9] mm: Use common huge_ptep_get() function for riscv/arm64

For that, we need to introduce:

- a new config: ARCH_HAS_CONTPTE,
- a new arch specific function which returns the number of contiguous PTE
in a mapping and its base page size,
- a pte_cont() helper, only introduced for riscv since we keep the arm64
naming (contpte) which is more explicit than the riscv's (napot).

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/Kconfig | 1 +
arch/arm64/include/asm/pgtable.h | 30 +++++++++++++++++
arch/arm64/mm/hugetlbpage.c | 55 ++------------------------------
arch/riscv/Kconfig | 1 +
arch/riscv/include/asm/hugetlb.h | 2 +-
arch/riscv/include/asm/pgtable.h | 6 ++--
arch/riscv/mm/hugetlbpage.c | 24 --------------
mm/Kconfig | 3 ++
mm/Makefile | 1 +
mm/contpte.c | 45 ++++++++++++++++++++++++++
10 files changed, 88 insertions(+), 80 deletions(-)
create mode 100644 mm/contpte.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b11c98b3e84..ac2f6d906cc3 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -20,6 +20,7 @@ config ARM64
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_CACHE_LINE_SIZE
+ select ARCH_HAS_CONTPTE
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL
select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index afdd56d26ad7..e30149a128f2 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1638,6 +1638,36 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,

#endif /* CONFIG_ARM64_CONTPTE */

+static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+ size_t *pgsize)
+{
+ int contig_ptes = 0;
+
+ *pgsize = size;
+
+ switch (size) {
+#ifndef __PAGETABLE_PMD_FOLDED
+ case PUD_SIZE:
+ if (pud_sect_supported())
+ contig_ptes = 1;
+ break;
+#endif
+ case PMD_SIZE:
+ contig_ptes = 1;
+ break;
+ case CONT_PMD_SIZE:
+ *pgsize = PMD_SIZE;
+ contig_ptes = CONT_PMDS;
+ break;
+ case CONT_PTE_SIZE:
+ *pgsize = PAGE_SIZE;
+ contig_ptes = CONT_PTES;
+ break;
+ }
+
+ return contig_ptes;
+}
+
#endif /* !__ASSEMBLY__ */

#endif /* __ASM_PGTABLE_H */
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 0f0e10bb0a95..9e9c80ec6e74 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -112,57 +112,6 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr,
return CONT_PTES;
}

-static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
-{
- int contig_ptes = 0;
-
- *pgsize = size;
-
- switch (size) {
-#ifndef __PAGETABLE_PMD_FOLDED
- case PUD_SIZE:
- if (pud_sect_supported())
- contig_ptes = 1;
- break;
-#endif
- case PMD_SIZE:
- contig_ptes = 1;
- break;
- case CONT_PMD_SIZE:
- *pgsize = PMD_SIZE;
- contig_ptes = CONT_PMDS;
- break;
- case CONT_PTE_SIZE:
- *pgsize = PAGE_SIZE;
- contig_ptes = CONT_PTES;
- break;
- }
-
- return contig_ptes;
-}
-
-pte_t huge_ptep_get(pte_t *ptep)
-{
- int ncontig, i;
- size_t pgsize;
- pte_t orig_pte = __ptep_get(ptep);
-
- if (!pte_present(orig_pte) || !pte_cont(orig_pte))
- return orig_pte;
-
- ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize);
- for (i = 0; i < ncontig; i++, ptep++) {
- pte_t pte = __ptep_get(ptep);
-
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
- }
- return orig_pte;
-}
-
/*
* Changing some bits of contiguous entries requires us to follow a
* Break-Before-Make approach, breaking the whole contiguous set
@@ -243,7 +192,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
unsigned long pfn, dpfn;
pgprot_t hugeprot;

- ncontig = num_contig_ptes(sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);

if (!pte_present(pte)) {
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
@@ -390,7 +339,7 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
int i, ncontig;
size_t pgsize;

- ncontig = num_contig_ptes(sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);

for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
__pte_clear(mm, addr, ptep);
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 8a0f403432e8..38d93cf44456 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -19,6 +19,7 @@ config RISCV
select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_HAS_BINFMT_FLAT
+ select ARCH_HAS_CONTPTE if RISCV_ISA_SVNAPOT
select ARCH_HAS_CURRENT_STACK_POINTER
select ARCH_HAS_DEBUG_VIRTUAL if MMU
select ARCH_HAS_DEBUG_VM_PGTABLE
diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h
index 22deb7a2a6ec..f195f611722b 100644
--- a/arch/riscv/include/asm/hugetlb.h
+++ b/arch/riscv/include/asm/hugetlb.h
@@ -49,7 +49,7 @@ pte_t huge_ptep_get(pte_t *ptep);
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags);
#define arch_make_huge_pte arch_make_huge_pte

-#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
+#endif /* CONFIG_RISCV_ISA_SVNAPOT */

#include <asm-generic/hugetlb.h>

diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 268c828f5152..66061002ff36 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -296,6 +296,7 @@ static inline unsigned long pte_napot(pte_t pte)
{
return pte_val(pte) & _PAGE_NAPOT;
}
+#define pte_cont pte_napot

#define pte_valid_napot(pte) (pte_present(pte) && pte_napot(pte))

@@ -560,7 +561,7 @@ static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
}
#endif

-static inline pte_t ptep_get(pte_t *ptep)
+static inline pte_t __ptep_get(pte_t *ptep)
{
pte_t pte = READ_ONCE(*ptep);

@@ -584,7 +585,6 @@ static inline pte_t ptep_get(pte_t *ptep)

return pte;
}
-#define ptep_get ptep_get

static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval, unsigned int nr)
@@ -686,6 +686,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
return ptep_test_and_clear_young(vma, address, ptep);
}

+#define ptep_get __ptep_get
+
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
{
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index f042f5c8bdb7..be129f4f1503 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -3,30 +3,6 @@
#include <linux/err.h>

#ifdef CONFIG_RISCV_ISA_SVNAPOT
-pte_t huge_ptep_get(pte_t *ptep)
-{
- unsigned long pte_num;
- int i;
- pte_t orig_pte = ptep_get(ptep);
-
- if (!pte_present(orig_pte) || !pte_napot(orig_pte))
- return orig_pte;
-
- pte_num = napot_pte_num(napot_cont_order(orig_pte));
-
- for (i = 0; i < pte_num; i++, ptep++) {
- pte_t pte = ptep_get(ptep);
-
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
- }
-
- return orig_pte;
-}
-
pte_t *huge_pte_alloc(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long addr,
diff --git a/mm/Kconfig b/mm/Kconfig
index b1448aa81e15..c325003d6552 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -981,6 +981,9 @@ config ARCH_HAS_CPU_CACHE_ALIASING
config ARCH_HAS_CACHE_LINE_SIZE
bool

+config ARCH_HAS_CONTPTE
+ bool
+
config ARCH_HAS_CURRENT_STACK_POINTER
bool
help
diff --git a/mm/Makefile b/mm/Makefile
index 4abb40b911ec..605ead58403b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_NUMA) += memory-tiers.o
obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
+obj-$(CONFIG_ARCH_HAS_CONTPTE) += contpte.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
ifdef CONFIG_SWAP
diff --git a/mm/contpte.c b/mm/contpte.c
new file mode 100644
index 000000000000..e8574051d0b9
--- /dev/null
+++ b/mm/contpte.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024 Rivos Inc.
+ */
+
+#include <linux/mm.h>
+#include <linux/pgtable.h>
+#include <linux/hugetlb.h>
+
+/*
+ * Any arch that wants to use that needs to define:
+ * - __ptep_get()
+ * - pte_cont()
+ * - arch_contpte_get_num_contig()
+ */
+
+/*
+ * This file implements the following contpte aware API:
+ * - huge_ptep_get()
+ */
+
+pte_t huge_ptep_get(pte_t *ptep)
+{
+ int ncontig, i;
+ size_t pgsize;
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (!pte_present(orig_pte) || !pte_cont(orig_pte))
+ return orig_pte;
+
+ ncontig = arch_contpte_get_num_contig(ptep,
+ page_size(pte_page(orig_pte)),
+ &pgsize);
+
+ for (i = 0; i < ncontig; i++, ptep++) {
+ pte_t pte = __ptep_get(ptep);
+
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+ return orig_pte;
+}
--
2.39.2


2024-05-08 11:39:13

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 4/9] mm: Use common set_huge_pte_at() function for riscv/arm64

After some adjustments, both architectures have the same implementation
so move it to generic code.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 14 +++++---
arch/arm64/mm/hugetlbpage.c | 56 -----------------------------
arch/riscv/include/asm/pgtable.h | 39 +++++++++++++-------
arch/riscv/mm/hugetlbpage.c | 62 --------------------------------
mm/contpte.c | 59 ++++++++++++++++++++++++++++++
5 files changed, 95 insertions(+), 135 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index e30149a128f2..2e0415fd5083 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -372,9 +372,10 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte));
}

-static inline void __set_ptes(struct mm_struct *mm,
- unsigned long __always_unused addr,
- pte_t *ptep, pte_t pte, unsigned int nr)
+static inline void ___set_ptes(struct mm_struct *mm,
+ unsigned long __always_unused addr,
+ pte_t *ptep, pte_t pte, unsigned int nr,
+ size_t pgsize)
{
page_table_check_ptes_set(mm, ptep, pte, nr);
__sync_cache_and_tags(pte, nr);
@@ -385,10 +386,15 @@ static inline void __set_ptes(struct mm_struct *mm,
if (--nr == 0)
break;
ptep++;
- pte = pte_advance_pfn(pte, 1);
+ pte = pte_advance_pfn(pte, pgsize >> PAGE_SHIFT);
}
}

+#define __set_ptes(mm, addr, ptep, pte, nr) \
+ ___set_ptes(mm, addr, ptep, pte, nr, PAGE_SIZE)
+
+#define set_contptes ___set_ptes
+
/*
* Huge pte definitions.
*/
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 9e9c80ec6e74..b8353b0a273c 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -159,62 +159,6 @@ static pte_t get_clear_contig_flush(struct mm_struct *mm,
return orig_pte;
}

-/*
- * Changing some bits of contiguous entries requires us to follow a
- * Break-Before-Make approach, breaking the whole contiguous set
- * before we can change any entries. See ARM DDI 0487A.k_iss10775,
- * "Misprogramming of the Contiguous bit", page D4-1762.
- *
- * This helper performs the break step for use cases where the
- * original pte is not needed.
- */
-static void clear_flush(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pgsize,
- unsigned long ncontig)
-{
- struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
- unsigned long i, saddr = addr;
-
- for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- __ptep_get_and_clear(mm, addr, ptep);
-
- flush_tlb_range(&vma, saddr, addr);
-}
-
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pte, unsigned long sz)
-{
- size_t pgsize;
- int i;
- int ncontig;
- unsigned long pfn, dpfn;
- pgprot_t hugeprot;
-
- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
-
- if (!pte_present(pte)) {
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
- __set_ptes(mm, addr, ptep, pte, 1);
- return;
- }
-
- if (!pte_cont(pte)) {
- __set_ptes(mm, addr, ptep, pte, 1);
- return;
- }
-
- pfn = pte_pfn(pte);
- dpfn = pgsize >> PAGE_SHIFT;
- hugeprot = pte_pgprot(pte);
-
- clear_flush(mm, addr, ptep, pgsize, ncontig);
-
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
-}
-
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 66061002ff36..5d1d3a6c7c44 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -535,29 +535,39 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
size_t *pgsize)
{
+ unsigned long hugepage_shift;
pte_t __pte;

/* We must read the raw value of the pte to get the size of the mapping */
__pte = READ_ONCE(*ptep);

- if (pgsize) {
- if (size >= PGDIR_SIZE)
+ if (size >= PGDIR_SIZE) {
+ if (pgsize)
*pgsize = PGDIR_SIZE;
- else if (size >= P4D_SIZE)
+ hugepage_shift = PGDIR_SHIFT;
+ } else if (size >= P4D_SIZE) {
+ if (pgsize)
*pgsize = P4D_SIZE;
- else if (size >= PUD_SIZE)
+ hugepage_shift = P4D_SHIFT;
+ } else if (size >= PUD_SIZE) {
+ if (pgsize)
*pgsize = PUD_SIZE;
- else if (size >= PMD_SIZE)
+ hugepage_shift = PUD_SHIFT;
+ } else if (size >= PMD_SIZE) {
+ if (pgsize)
*pgsize = PMD_SIZE;
- else
+ hugepage_shift = PMD_SHIFT;
+ } else {
+ if (pgsize)
*pgsize = PAGE_SIZE;
+ hugepage_shift = PAGE_SHIFT;
}

/* Make sure __pte is not a swap entry */
if (pte_valid_napot(__pte))
return napot_pte_num(napot_cont_order(__pte));

- return 1;
+ return size >> hugepage_shift;
}
#endif

@@ -586,8 +596,8 @@ static inline pte_t __ptep_get(pte_t *ptep)
return pte;
}

-static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval, unsigned int nr)
+static inline void __set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pteval, unsigned int nr)
{
#ifdef CONFIG_RISCV_ISA_SVNAPOT
if (unlikely(pte_valid_napot(pteval))) {
@@ -631,7 +641,8 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_val(pteval) += 1 << _PAGE_PFN_SHIFT;
}
}
-#define set_ptes set_ptes
+#define set_contptes(mm, addr, ptep, pte, nr, pgsize) \
+ __set_ptes(mm, addr, ptep, pte, nr)

static inline void pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
@@ -646,9 +657,8 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addre
extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep);

-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
+static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
{
pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0));

@@ -687,6 +697,9 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
}

#define ptep_get __ptep_get
+#define set_ptes __set_ptes
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+#define ptep_get_and_clear __ptep_get_and_clear

#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index be129f4f1503..d8f07aef758b 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -173,68 +173,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}

-static void clear_flush(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pgsize,
- unsigned long ncontig)
-{
- struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
- unsigned long i, saddr = addr;
-
- for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- ptep_get_and_clear(mm, addr, ptep);
-
- flush_tlb_range(&vma, saddr, addr);
-}
-
-/*
- * When dealing with NAPOT mappings, the privileged specification indicates that
- * "if an update needs to be made, the OS generally should first mark all of the
- * PTEs invalid, then issue SFENCE.VMA instruction(s) covering all 4 KiB regions
- * within the range, [...] then update the PTE(s), as described in Section
- * 4.2.1.". That's the equivalent of the Break-Before-Make approach used by
- * arm64.
- */
-void set_huge_pte_at(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- pte_t pte,
- unsigned long sz)
-{
- unsigned long hugepage_shift, pgsize;
- int i, pte_num;
-
- if (sz >= PGDIR_SIZE)
- hugepage_shift = PGDIR_SHIFT;
- else if (sz >= P4D_SIZE)
- hugepage_shift = P4D_SHIFT;
- else if (sz >= PUD_SIZE)
- hugepage_shift = PUD_SHIFT;
- else if (sz >= PMD_SIZE)
- hugepage_shift = PMD_SHIFT;
- else
- hugepage_shift = PAGE_SHIFT;
-
- pte_num = sz >> hugepage_shift;
- pgsize = 1 << hugepage_shift;
-
- if (!pte_present(pte)) {
- for (i = 0; i < pte_num; i++, ptep++, addr += pgsize)
- set_ptes(mm, addr, ptep, pte, 1);
- return;
- }
-
- if (!pte_napot(pte)) {
- set_ptes(mm, addr, ptep, pte, 1);
- return;
- }
-
- clear_flush(mm, addr, ptep, pgsize, pte_num);
-
- set_ptes(mm, addr, ptep, pte, pte_num);
-}
-
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep,
diff --git a/mm/contpte.c b/mm/contpte.c
index e8574051d0b9..2320ee23478a 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -10,6 +10,8 @@
/*
* Any arch that wants to use that needs to define:
* - __ptep_get()
+ * - __set_ptes()
+ * - __ptep_get_and_clear()
* - pte_cont()
* - arch_contpte_get_num_contig()
*/
@@ -17,6 +19,7 @@
/*
* This file implements the following contpte aware API:
* - huge_ptep_get()
+ * - set_huge_pte_at()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -43,3 +46,59 @@ pte_t huge_ptep_get(pte_t *ptep)
}
return orig_pte;
}
+
+/*
+ * ARM64: Changing some bits of contiguous entries requires us to follow a
+ * Break-Before-Make approach, breaking the whole contiguous set
+ * before we can change any entries. See ARM DDI 0487A.k_iss10775,
+ * "Misprogramming of the Contiguous bit", page D4-1762.
+ *
+ * RISCV: When dealing with NAPOT mappings, the privileged specification
+ * indicates that "if an update needs to be made, the OS generally should first
+ * mark all of the PTEs invalid, then issue SFENCE.VMA instruction(s) covering
+ * all 4 KiB regions within the range, [...] then update the PTE(s), as
+ * described in Section 4.2.1.". That's the equivalent of the Break-Before-Make
+ * approach used by arm64.
+ *
+ * This helper performs the break step for use cases where the
+ * original pte is not needed.
+ */
+static void clear_flush(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep,
+ unsigned long pgsize,
+ unsigned long ncontig)
+{
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+ unsigned long i, saddr = addr;
+
+ for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
+ __ptep_get_and_clear(mm, addr, ptep);
+
+ flush_tlb_range(&vma, saddr, addr);
+}
+
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned long sz)
+{
+ size_t pgsize;
+ int i;
+ int ncontig;
+
+ ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+
+ if (!pte_present(pte)) {
+ for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
+ __set_ptes(mm, addr, ptep, pte, 1);
+ return;
+ }
+
+ if (!pte_cont(pte)) {
+ __set_ptes(mm, addr, ptep, pte, 1);
+ return;
+ }
+
+ clear_flush(mm, addr, ptep, pgsize, ncontig);
+
+ set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
+}
--
2.39.2


2024-05-08 11:42:19

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 5/9] mm: Use common huge_pte_clear() function for riscv/arm64

Both architectures have the same implementation so move it to generic code.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/hugetlbpage.c | 12 ------------
arch/riscv/include/asm/pgtable.h | 5 +++--
arch/riscv/mm/hugetlbpage.c | 19 -------------------
mm/contpte.c | 14 ++++++++++++++
4 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index b8353b0a273c..cf44837369be 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -277,18 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}

-void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned long sz)
-{
- int i, ncontig;
- size_t pgsize;
-
- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
-
- for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
- __pte_clear(mm, addr, ptep);
-}
-
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 5d1d3a6c7c44..0847a7fb8661 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -644,8 +644,8 @@ static inline void __set_ptes(struct mm_struct *mm, unsigned long addr,
#define set_contptes(mm, addr, ptep, pte, nr, pgsize) \
__set_ptes(mm, addr, ptep, pte, nr)

-static inline void pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
+static inline void __pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
{
__set_pte_at(mm, ptep, __pte(0));
}
@@ -700,6 +700,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define set_ptes __set_ptes
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define ptep_get_and_clear __ptep_get_and_clear
+#define pte_clear __pte_clear

#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index d8f07aef758b..437b1df059eb 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -254,25 +254,6 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
}

-void huge_pte_clear(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long sz)
-{
- size_t pgsize;
- pte_t pte = ptep_get(ptep);
- int i, pte_num;
-
- if (!pte_napot(pte)) {
- pte_clear(mm, addr, ptep);
- return;
- }
-
- pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
- for (i = 0; i < pte_num; i++, addr += pgsize, ptep++)
- pte_clear(mm, addr, ptep);
-}
-
static bool is_napot_size(unsigned long size)
{
unsigned long order;
diff --git a/mm/contpte.c b/mm/contpte.c
index 2320ee23478a..22e0de197bd3 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -12,6 +12,7 @@
* - __ptep_get()
* - __set_ptes()
* - __ptep_get_and_clear()
+ * - __pte_clear()
* - pte_cont()
* - arch_contpte_get_num_contig()
*/
@@ -20,6 +21,7 @@
* This file implements the following contpte aware API:
* - huge_ptep_get()
* - set_huge_pte_at()
+ * - huge_pte_clear()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -102,3 +104,15 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,

set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
}
+
+void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned long sz)
+{
+ int i, ncontig;
+ size_t pgsize;
+
+ ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+
+ for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
+ __pte_clear(mm, addr, ptep);
+}
--
2.39.2


2024-05-08 11:42:20

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 7/9] mm: Use common huge_ptep_set_access_flags() function for riscv/arm64

Both architectures have almost the same implementation:
__cont_access_flags_changed() is also correct on riscv and brings the
same benefits (ie don't do anything if the flags are unchanged).

As in the previous commit, get_clear_contig_flush() is duplicated in both
the arch and the generic codes, it will be removed from the arch code when
the last reference there gets moved to the generic code.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/hugetlbpage.c | 65 ---------------------------
arch/riscv/include/asm/pgtable.h | 7 +--
arch/riscv/mm/hugetlbpage.c | 29 ------------
arch/riscv/mm/pgtable.c | 6 +--
mm/contpte.c | 75 ++++++++++++++++++++++++++++++++
5 files changed, 82 insertions(+), 100 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 5ace4bf7ce35..052a5bf2926c 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -277,71 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}

-/*
- * huge_ptep_set_access_flags will update access flags (dirty, accesssed)
- * and write permission.
- *
- * For a contiguous huge pte range we need to check whether or not write
- * permission has to change only on the first pte in the set. Then for
- * all the contiguous ptes we need to check whether or not there is a
- * discrepancy between dirty or young.
- */
-static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
-{
- int i;
-
- if (pte_write(pte) != pte_write(__ptep_get(ptep)))
- return 1;
-
- for (i = 0; i < ncontig; i++) {
- pte_t orig_pte = __ptep_get(ptep + i);
-
- if (pte_dirty(pte) != pte_dirty(orig_pte))
- return 1;
-
- if (pte_young(pte) != pte_young(orig_pte))
- return 1;
- }
-
- return 0;
-}
-
-int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t pte, int dirty)
-{
- int ncontig, i;
- size_t pgsize = 0;
- unsigned long pfn = pte_pfn(pte), dpfn;
- struct mm_struct *mm = vma->vm_mm;
- pgprot_t hugeprot;
- pte_t orig_pte;
-
- if (!pte_cont(pte))
- return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
- dpfn = pgsize >> PAGE_SHIFT;
-
- if (!__cont_access_flags_changed(ptep, pte, ncontig))
- return 0;
-
- orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
-
- /* Make sure we don't lose the dirty or young state */
- if (pte_dirty(orig_pte))
- pte = pte_mkdirty(pte);
-
- if (pte_young(orig_pte))
- pte = pte_mkyoung(pte);
-
- hugeprot = pte_pgprot(pte);
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
-
- return 1;
-}
-
void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index d976113a370d..20f62505d0bc 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -652,9 +652,8 @@ static inline void __pte_clear(struct mm_struct *mm,
__set_pte_at(mm, ptep, __pte(0));
}

-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* defined in mm/pgtable.c */
-extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep, pte_t entry, int dirty);
+extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep, pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG /* defined in mm/pgtable.c */
extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep);
@@ -703,6 +702,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define ptep_get_and_clear __ptep_get_and_clear
#define pte_clear __pte_clear
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+#define ptep_set_access_flags __ptep_set_access_flags

#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index a757e0b2f090..a59b776e9c8b 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -173,35 +173,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}

-int huge_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr,
- pte_t *ptep,
- pte_t pte,
- int dirty)
-{
- struct mm_struct *mm = vma->vm_mm;
- size_t pgsize;
- pte_t orig_pte;
- int pte_num;
-
- if (!pte_napot(pte))
- return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
-
- pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, &pgsize);
-
- orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
-
- if (pte_dirty(orig_pte))
- pte = pte_mkdirty(pte);
-
- if (pte_young(orig_pte))
- pte = pte_mkyoung(pte);
-
- set_ptes(mm, addr, ptep, pte, pte_num);
-
- return true;
-}
-
void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr,
pte_t *ptep)
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 533ec9055fa0..e86df7ef193c 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -5,9 +5,9 @@
#include <linux/kernel.h>
#include <linux/pgtable.h>

-int ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep,
- pte_t entry, int dirty)
+int __ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
{
if (!pte_same(ptep_get(ptep), entry))
__set_pte_at(vma->vm_mm, ptep, entry);
diff --git a/mm/contpte.c b/mm/contpte.c
index 68eb1634b922..9c7a9f250bca 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -13,6 +13,7 @@
* - __set_ptes()
* - __ptep_get_and_clear()
* - __pte_clear()
+ * - __ptep_set_access_flags()
* - pte_cont()
* - arch_contpte_get_num_contig()
*/
@@ -23,6 +24,7 @@
* - set_huge_pte_at()
* - huge_pte_clear()
* - huge_ptep_get_and_clear()
+ * - huge_ptep_set_access_flags()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -158,3 +160,76 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,

return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
}
+
+/*
+ * huge_ptep_set_access_flags will update access flags (dirty, accesssed)
+ * and write permission.
+ *
+ * For a contiguous huge pte range we need to check whether or not write
+ * permission has to change only on the first pte in the set. Then for
+ * all the contiguous ptes we need to check whether or not there is a
+ * discrepancy between dirty or young.
+ */
+static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
+{
+ int i;
+
+ if (pte_write(pte) != pte_write(__ptep_get(ptep)))
+ return 1;
+
+ for (i = 0; i < ncontig; i++) {
+ pte_t orig_pte = __ptep_get(ptep + i);
+
+ if (pte_dirty(pte) != pte_dirty(orig_pte))
+ return 1;
+
+ if (pte_young(pte) != pte_young(orig_pte))
+ return 1;
+ }
+
+ return 0;
+}
+
+static pte_t get_clear_contig_flush(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep,
+ unsigned long pgsize,
+ unsigned long ncontig)
+{
+ pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig);
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+
+ flush_tlb_range(&vma, addr, addr + (pgsize * ncontig));
+ return orig_pte;
+}
+
+int huge_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t pte, int dirty)
+{
+ int ncontig;
+ size_t pgsize = 0;
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t orig_pte;
+
+ if (!pte_cont(pte))
+ return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);
+
+ ncontig = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, &pgsize);
+
+ if (!__cont_access_flags_changed(ptep, pte, ncontig))
+ return 0;
+
+ orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
+
+ /* Make sure we don't lose the dirty or young state */
+ if (pte_dirty(orig_pte))
+ pte = pte_mkdirty(pte);
+
+ if (pte_young(orig_pte))
+ pte = pte_mkyoung(pte);
+
+ set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
+
+ return 1;
+}
--
2.39.2


2024-05-08 11:43:18

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 8/9] mm: Use common huge_ptep_set_wrprotect() function for riscv/arm64

After some adjustments, both architectures have the same implementation
so move it to the generic code.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/hugetlbpage.c | 27 ---------------------------
arch/riscv/include/asm/pgtable.h | 7 ++++---
arch/riscv/mm/hugetlbpage.c | 22 ----------------------
mm/contpte.c | 22 ++++++++++++++++++++++
4 files changed, 26 insertions(+), 52 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 052a5bf2926c..e56f2c8ec7e7 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -277,33 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}

-void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- unsigned long pfn, dpfn;
- pgprot_t hugeprot;
- int ncontig, i;
- size_t pgsize;
- pte_t pte;
-
- if (!pte_cont(__ptep_get(ptep))) {
- __ptep_set_wrprotect(mm, addr, ptep);
- return;
- }
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
- dpfn = pgsize >> PAGE_SHIFT;
-
- pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
- pte = pte_wrprotect(pte);
-
- hugeprot = pte_pgprot(pte);
- pfn = pte_pfn(pte);
-
- for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
- __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
-}
-
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 20f62505d0bc..9e397935536e 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -668,9 +668,8 @@ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
return pte;
}

-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
+static inline void __ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep)
{
atomic_long_and(~(unsigned long)_PAGE_WRITE, (atomic_long_t *)ptep);
}
@@ -704,6 +703,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
#define pte_clear __pte_clear
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+#define ptep_set_wrprotect __ptep_set_wrprotect

#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index a59b776e9c8b..440d3bde88f2 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -173,28 +173,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}

-void huge_ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep)
-{
- pte_t pte = ptep_get(ptep);
- size_t pgsize;
- pte_t orig_pte;
- int pte_num;
-
- if (!pte_napot(pte)) {
- ptep_set_wrprotect(mm, addr, ptep);
- return;
- }
-
- pte_num = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
-
- orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
- orig_pte = pte_wrprotect(orig_pte);
-
- set_ptes(mm, addr, ptep, orig_pte, pte_num);
-}
-
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
unsigned long addr,
pte_t *ptep)
diff --git a/mm/contpte.c b/mm/contpte.c
index 9c7a9f250bca..8ad2a3099dfd 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -14,6 +14,7 @@
* - __ptep_get_and_clear()
* - __pte_clear()
* - __ptep_set_access_flags()
+ * - __ptep_set_wrprotect()
* - pte_cont()
* - arch_contpte_get_num_contig()
*/
@@ -25,6 +26,7 @@
* - huge_pte_clear()
* - huge_ptep_get_and_clear()
* - huge_ptep_set_access_flags()
+ * - huge_ptep_set_wrprotect()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -233,3 +235,23 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,

return 1;
}
+
+void huge_ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ int ncontig;
+ size_t pgsize;
+ pte_t pte;
+
+ if (!pte_cont(__ptep_get(ptep))) {
+ __ptep_set_wrprotect(mm, addr, ptep);
+ return;
+ }
+
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
+
+ pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
+ pte = pte_wrprotect(pte);
+
+ set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
+}
--
2.39.2


2024-05-08 11:43:33

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 6/9] mm: Use common huge_ptep_get_and_clear() function for riscv/arm64

After some adjustments, both architectures have the same implementation
so move it to the generic code.

Note that get_clear_contig() function is duplicated in the generic and
the arm64 code because it is still used by some arm64 functions that
will, in the next commits, be moved to the generic code. Once all have
been moved, the arm64 version will be removed.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 14 +++++++++-
arch/arm64/mm/hugetlbpage.c | 19 ++-----------
arch/riscv/include/asm/pgtable.h | 4 ++-
arch/riscv/mm/hugetlbpage.c | 21 ++------------
mm/contpte.c | 48 ++++++++++++++++++++++++++++++--
5 files changed, 66 insertions(+), 40 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 2e0415fd5083..7c2938cb70b9 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1644,11 +1644,23 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,

#endif /* CONFIG_ARM64_CONTPTE */

-static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+int find_num_contig(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, size_t *pgsize);
+
+static inline int arch_contpte_get_num_contig(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, unsigned long size,
size_t *pgsize)
{
int contig_ptes = 0;

+ /*
+ * If the size is not passed, we need to go through the page table to
+ * find out the number of contiguous ptes.
+ */
+ if (size == 0)
+ return find_num_contig(mm, addr, ptep, pgsize);
+
*pgsize = size;

switch (size) {
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index cf44837369be..5ace4bf7ce35 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -93,8 +93,8 @@ int pud_huge(pud_t pud)
#endif
}

-static int find_num_contig(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, size_t *pgsize)
+int find_num_contig(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, size_t *pgsize)
{
pgd_t *pgdp = pgd_offset(mm, addr);
p4d_t *p4dp;
@@ -277,21 +277,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}

-pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- int ncontig;
- size_t pgsize;
- pte_t orig_pte = __ptep_get(ptep);
-
- if (!pte_cont(orig_pte))
- return __ptep_get_and_clear(mm, addr, ptep);
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
-
- return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
-}
-
/*
* huge_ptep_set_access_flags will update access flags (dirty, accesssed)
* and write permission.
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 0847a7fb8661..d976113a370d 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -532,7 +532,9 @@ static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT

#ifdef CONFIG_RISCV_ISA_SVNAPOT
-static inline int arch_contpte_get_num_contig(pte_t *ptep, unsigned long size,
+static inline int arch_contpte_get_num_contig(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, unsigned long size,
size_t *pgsize)
{
unsigned long hugepage_shift;
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 437b1df059eb..a757e0b2f090 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -187,7 +187,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
if (!pte_napot(pte))
return ptep_set_access_flags(vma, addr, ptep, pte, dirty);

- pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, &pgsize);

orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);

@@ -202,21 +202,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
return true;
}

-pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep)
-{
- pte_t orig_pte = ptep_get(ptep);
- int pte_num;
-
- if (!pte_napot(orig_pte))
- return ptep_get_and_clear(mm, addr, ptep);
-
- pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
-
- return get_clear_contig(mm, addr, ptep, pte_num);
-}
-
void huge_ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr,
pte_t *ptep)
@@ -231,7 +216,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
return;
}

- pte_num = arch_contpte_get_num_contig(ptep, 0, &pgsize);
+ pte_num = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);

orig_pte = get_clear_contig_flush(mm, addr, ptep, pte_num);
orig_pte = pte_wrprotect(orig_pte);
@@ -249,7 +234,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
if (!pte_napot(pte))
return ptep_clear_flush(vma, addr, ptep);

- pte_num = arch_contpte_get_num_contig(ptep, 0, NULL);
+ pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, NULL);

return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
}
diff --git a/mm/contpte.c b/mm/contpte.c
index 22e0de197bd3..68eb1634b922 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -22,6 +22,7 @@
* - huge_ptep_get()
* - set_huge_pte_at()
* - huge_pte_clear()
+ * - huge_ptep_get_and_clear()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -33,7 +34,7 @@ pte_t huge_ptep_get(pte_t *ptep)
if (!pte_present(orig_pte) || !pte_cont(orig_pte))
return orig_pte;

- ncontig = arch_contpte_get_num_contig(ptep,
+ ncontig = arch_contpte_get_num_contig(NULL, 0, ptep,
page_size(pte_page(orig_pte)),
&pgsize);

@@ -87,7 +88,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
int i;
int ncontig;

- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, sz, &pgsize);

if (!pte_present(pte)) {
for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
@@ -111,8 +112,49 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
int i, ncontig;
size_t pgsize;

- ncontig = arch_contpte_get_num_contig(ptep, sz, &pgsize);
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, sz, &pgsize);

for (i = 0; i < ncontig; i++, addr += pgsize, ptep++)
__pte_clear(mm, addr, ptep);
}
+
+static pte_t get_clear_contig(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep,
+ unsigned long pgsize,
+ unsigned long ncontig)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+ unsigned long i;
+
+ for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
+ pte_t pte = __ptep_get_and_clear(mm, addr, ptep);
+
+ /*
+ * If HW_AFDBM (arm64) or svadu (riscv) is enabled, then the HW
+ * could turn on the dirty or accessed bit for any page in the
+ * set, so check them all.
+ */
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+ return orig_pte;
+}
+
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ int ncontig;
+ size_t pgsize;
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (!pte_cont(orig_pte))
+ return __ptep_get_and_clear(mm, addr, ptep);
+
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
+
+ return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
+}
--
2.39.2


2024-05-08 11:51:16

by Alexandre Ghiti

[permalink] [raw]
Subject: [PATCH RESEND v2 9/9] mm: Use common huge_ptep_clear_flush() function for riscv/arm64

After some adjustments, both architectures have the same implementation
so move it to the generic code.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/mm/hugetlbpage.c | 61 -------------------------------------
arch/riscv/mm/hugetlbpage.c | 51 -------------------------------
mm/contpte.c | 15 +++++++++
3 files changed, 15 insertions(+), 112 deletions(-)

diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index e56f2c8ec7e7..5869f20ca28e 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -112,53 +112,6 @@ int find_num_contig(struct mm_struct *mm, unsigned long addr,
return CONT_PTES;
}

-/*
- * Changing some bits of contiguous entries requires us to follow a
- * Break-Before-Make approach, breaking the whole contiguous set
- * before we can change any entries. See ARM DDI 0487A.k_iss10775,
- * "Misprogramming of the Contiguous bit", page D4-1762.
- *
- * This helper performs the break step.
- */
-static pte_t get_clear_contig(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pgsize,
- unsigned long ncontig)
-{
- pte_t orig_pte = __ptep_get(ptep);
- unsigned long i;
-
- for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
- pte_t pte = __ptep_get_and_clear(mm, addr, ptep);
-
- /*
- * If HW_AFDBM is enabled, then the HW could turn on
- * the dirty or accessed bit for any page in the set,
- * so check them all.
- */
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
- }
- return orig_pte;
-}
-
-static pte_t get_clear_contig_flush(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pgsize,
- unsigned long ncontig)
-{
- pte_t orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig);
- struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
-
- flush_tlb_range(&vma, addr, addr + (pgsize * ncontig));
- return orig_pte;
-}
-
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, unsigned long sz)
{
@@ -277,20 +230,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}

-pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- struct mm_struct *mm = vma->vm_mm;
- size_t pgsize;
- int ncontig;
-
- if (!pte_cont(__ptep_get(ptep)))
- return ptep_clear_flush(vma, addr, ptep);
-
- ncontig = find_num_contig(mm, addr, ptep, &pgsize);
- return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
-}
-
static int __init hugetlbpage_init(void)
{
if (pud_sect_supported())
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 440d3bde88f2..47333efa2d83 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -121,42 +121,6 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
return 0UL;
}

-static pte_t get_clear_contig(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pte_num)
-{
- pte_t orig_pte = ptep_get(ptep);
- unsigned long i;
-
- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
- pte_t pte = ptep_get_and_clear(mm, addr, ptep);
-
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
- }
-
- return orig_pte;
-}
-
-static pte_t get_clear_contig_flush(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep,
- unsigned long pte_num)
-{
- pte_t orig_pte = get_clear_contig(mm, addr, ptep, pte_num);
- struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
- bool valid = !pte_none(orig_pte);
-
- if (valid)
- flush_tlb_range(&vma, addr, addr + (PAGE_SIZE * pte_num));
-
- return orig_pte;
-}
-
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
unsigned long order;
@@ -173,21 +137,6 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
return entry;
}

-pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
- unsigned long addr,
- pte_t *ptep)
-{
- pte_t pte = ptep_get(ptep);
- int pte_num;
-
- if (!pte_napot(pte))
- return ptep_clear_flush(vma, addr, ptep);
-
- pte_num = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, NULL);
-
- return get_clear_contig_flush(vma->vm_mm, addr, ptep, pte_num);
-}
-
static bool is_napot_size(unsigned long size)
{
unsigned long order;
diff --git a/mm/contpte.c b/mm/contpte.c
index 8ad2a3099dfd..15791f6d9c41 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -27,6 +27,7 @@
* - huge_ptep_get_and_clear()
* - huge_ptep_set_access_flags()
* - huge_ptep_set_wrprotect()
+ * - huge_ptep_clear_flush()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -255,3 +256,17 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,

set_contptes(mm, addr, ptep, pte, ncontig, pgsize);
}
+
+pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ size_t pgsize;
+ int ncontig;
+
+ if (!pte_cont(__ptep_get(ptep)))
+ return ptep_clear_flush(vma, addr, ptep);
+
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
+ return get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);
+}
--
2.39.2