2024-05-08 19:28:19

by Alexandre Ghiti

Subject: [PATCH 00/12] Make riscv use THP contpte support from arm64

This patchset allows riscv to support napot (the riscv equivalent of
contpte) THPs by moving the arm64 contpte support into mm; the previous
series [1] only merged the riscv and arm64 implementations of hugetlbfs
contpte.

The riscv napot specification allows for different contpte sizes,
although only 64KB is supported for now. This patchset therefore
implements support for multiple contpte sizes, which introduces a few
arch-specific helpers to determine which sizes are supported (as
sketched below). Even though only one size is supported on riscv, the
multi-size implementation shows what it will look like when other sizes
are supported, and makes sure it does not regress arm64.
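
For illustration, here is a condensed sketch of the pattern the common
code in mm/contpte.c follows after this series: a fast path for
non-contiguous ptes, and a slow path that asks the arch-specific helpers
for the number of contiguous entries and their size instead of
hardcoding arm64's CONT_PTES/CONT_PTE_SIZE. It mirrors the
ptep_test_and_clear_young() conversion done in patch 08 below; it is a
simplified composite for readability, not a verbatim excerpt.

/* Illustrative sketch only (simplified from the patches below). */
#include <linux/mm.h>
#include <linux/contpte.h>

int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
				      unsigned long addr, pte_t *ptep)
{
	size_t pgsize;
	int i, ncontig, young = 0;

	/* Align to the start of the contiguous block... */
	ptep = arch_contpte_align_down(ptep);
	/* ...and ask the arch how many entries it spans and their size. */
	ncontig = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep,
					      0, &pgsize);
	addr = ALIGN_DOWN(addr, ncontig * pgsize);

	for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
		young |= __ptep_test_and_clear_young(vma, addr, ptep);

	return young;
}

/* Callers go through a thin wrapper with a fast path for regular ptes. */
int ptep_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pte_t *ptep)
{
	pte_t orig_pte = __ptep_get(ptep);

	if (likely(!pte_valid_cont(orig_pte)))
		return __ptep_test_and_clear_young(vma, addr, ptep);

	return contpte_ptep_test_and_clear_young(vma, addr, ptep);
}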

I tested arm64 using the cow kselftest and a kernel build with a 4KB
base page size and 64KB contpte. riscv was tested with the same tests on
*all* contpte sizes that fit in the last page table level (support for
PMD sizes is not included here). Both architectures were only tested on
qemu.

Alexandre Ghiti (12):
mm, arm64: Rename ARM64_CONTPTE to THP_CONTPTE
mm, riscv, arm64: Use common ptep_get() function
mm, riscv, arm64: Use common set_ptes() function
mm, riscv, arm64: Use common ptep_get_lockless() function
mm, riscv, arm64: Use common set_pte() function
mm, riscv, arm64: Use common pte_clear() function
mm, riscv, arm64: Use common ptep_get_and_clear() function
mm, riscv, arm64: Use common ptep_test_and_clear_young() function
mm, riscv, arm64: Use common ptep_clear_flush_young() function
mm, riscv, arm64: Use common ptep_set_access_flags() function
mm, riscv, arm64: Use common ptep_set_wrprotect()/wrprotect_ptes()
functions
mm, riscv, arm64: Use common
get_and_clear_full_ptes()/clear_full_ptes() functions

arch/arm64/Kconfig | 9 -
arch/arm64/include/asm/pgtable.h | 318 +++++---------
arch/arm64/mm/Makefile | 1 -
arch/arm64/mm/contpte.c | 408 ------------------
arch/arm64/mm/hugetlbpage.c | 6 +-
arch/arm64/mm/mmu.c | 2 +-
arch/riscv/include/asm/kfence.h | 4 +-
arch/riscv/include/asm/pgtable.h | 206 +++++++++-
arch/riscv/kernel/efi.c | 4 +-
arch/riscv/kernel/hibernate.c | 2 +-
arch/riscv/kvm/mmu.c | 26 +-
arch/riscv/mm/fault.c | 2 +-
arch/riscv/mm/init.c | 4 +-
arch/riscv/mm/kasan_init.c | 16 +-
arch/riscv/mm/pageattr.c | 8 +-
arch/riscv/mm/pgtable.c | 6 +-
include/linux/contpte.h | 37 ++
mm/Kconfig | 9 +
mm/contpte.c | 685 ++++++++++++++++++++++++++++++-
19 files changed, 1056 insertions(+), 697 deletions(-)
delete mode 100644 arch/arm64/mm/contpte.c
create mode 100644 include/linux/contpte.h

--
2.39.2



2024-05-08 19:28:21

by Alexandre Ghiti

Subject: [PATCH 07/12] mm, riscv, arm64: Use common ptep_get_and_clear() function

Make riscv use the contpte-aware ptep_get_and_clear() function from arm64.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 8 ++------
arch/riscv/include/asm/pgtable.h | 7 +++++--
mm/contpte.c | 8 ++++++++
3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 74e582f2884f..ff7fe1d9cabe 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1473,12 +1473,8 @@ static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
}

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
- return __ptep_get_and_clear(mm, addr, ptep);
-}
+extern pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 41534f4b8a6d..03cd640137ed 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -794,6 +794,9 @@ extern void set_pte(pte_t *ptep, pte_t pte);
#define set_pte set_pte
extern void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
#define pte_clear pte_clear
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+extern pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);

#else /* CONFIG_THP_CONTPTE */

@@ -801,11 +804,11 @@ extern void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
#define set_ptes __set_ptes
#define set_pte __set_pte
#define pte_clear __pte_clear
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+#define ptep_get_and_clear __ptep_get_and_clear

#endif /* CONFIG_THP_CONTPTE */

-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-#define ptep_get_and_clear __ptep_get_and_clear
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
diff --git a/mm/contpte.c b/mm/contpte.c
index c9eff6426ca0..5bf939639233 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -46,6 +46,7 @@
* - ptep_get_lockless()
* - set_pte()
* - pte_clear()
+ * - ptep_get_and_clear()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -682,4 +683,11 @@ void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
__pte_clear(mm, addr, ptep);
}
+
+pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ return __ptep_get_and_clear(mm, addr, ptep);
+}
#endif /* CONFIG_THP_CONTPTE */
--
2.39.2


2024-05-08 19:29:00

by Alexandre Ghiti

Subject: [PATCH 09/12] mm, riscv, arm64: Use common ptep_clear_flush_young() function

Make riscv use the contpte-aware ptep_clear_flush_young() function from
arm64.

Note that riscv previously did not flush the TLB after clearing the
accessed bit; it does now. This will be improved when we implement
svinval support.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 22 ++++++++----------
arch/arm64/mm/contpte.c | 21 -----------------
arch/riscv/include/asm/pgtable.h | 12 +++++++---
include/linux/contpte.h | 2 ++
mm/contpte.c | 40 ++++++++++++++++++++++++++++++++
5 files changed, 61 insertions(+), 36 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 9a8702d1ad00..92c12fb85cb4 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1389,8 +1389,6 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
unsigned long addr, pte_t *ptep,
unsigned int nr, int full);
-extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr);
extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
@@ -1479,16 +1477,8 @@ extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- pte_t orig_pte = __ptep_get(ptep);
-
- if (likely(!pte_valid_cont(orig_pte)))
- return __ptep_clear_flush_young(vma, addr, ptep);
-
- return contpte_ptep_clear_flush_young(vma, addr, ptep);
-}
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);

#define wrprotect_ptes wrprotect_ptes
static __always_inline void wrprotect_ptes(struct mm_struct *mm,
@@ -1616,6 +1606,14 @@ static inline void arch_contpte_flush_tlb_range(struct vm_area_struct *vma,
__flush_tlb_range(vma, start, end, stride, true, 3);
}

+static inline void arch_contpte_flush_tlb_range_nosync(struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long stride)
+{
+ __flush_tlb_range_nosync(vma, start, end, stride, true, 3);
+}
+
static inline int arch_contpte_get_first_ncontig(size_t *pgsize)
{
if (pgsize)
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 9bf471633ca4..16940511943c 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -45,27 +45,6 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);

-int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- int young;
-
- young = contpte_ptep_test_and_clear_young(vma, addr, ptep);
-
- if (young) {
- /*
- * See comment in __ptep_clear_flush_young(); same rationale for
- * eliding the trailing DSB applies here.
- */
- addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
- __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
- PAGE_SIZE, true, 3);
- }
-
- return young;
-}
-EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young);
-
void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index d39cb24c6c4a..42c7884b8d2e 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -615,6 +615,8 @@ static inline void arch_contpte_flush_tlb_range(struct vm_area_struct *vma,
flush_tlb_mm_range(vma->vm_mm, start, end, stride);
}

+#define arch_contpte_flush_tlb_range_nosync arch_contpte_flush_tlb_range
+
static inline int arch_contpte_get_first_ncontig(size_t *pgsize)
{
if (pgsize)
@@ -758,9 +760,8 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm,
atomic_long_and(~(unsigned long)_PAGE_WRITE, (atomic_long_t *)ptep);
}

-#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
- unsigned long address, pte_t *ptep)
+static inline int __ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
{
/*
* This comment is borrowed from x86, but applies equally to RISC-V:
@@ -799,6 +800,9 @@ extern pte_t ptep_get_and_clear(struct mm_struct *mm,
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);

#else /* CONFIG_THP_CONTPTE */

@@ -810,6 +814,8 @@ extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
#define ptep_get_and_clear __ptep_get_and_clear
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define ptep_test_and_clear_young __ptep_test_and_clear_young
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+#define ptep_clear_flush_young __ptep_clear_flush_young

#endif /* CONFIG_THP_CONTPTE */

diff --git a/include/linux/contpte.h b/include/linux/contpte.h
index 38092adbe0d4..76a49ac8b6f5 100644
--- a/include/linux/contpte.h
+++ b/include/linux/contpte.h
@@ -21,5 +21,7 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr);
int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
+int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);

#endif /* _LINUX_CONTPTE_H */
diff --git a/mm/contpte.c b/mm/contpte.c
index 220e9d81f401..600277b1196c 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -48,6 +48,7 @@
* - pte_clear()
* - ptep_get_and_clear()
* - ptep_test_and_clear_young()
+ * - ptep_clear_flush_young()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -729,4 +730,43 @@ __always_inline int ptep_test_and_clear_young(struct vm_area_struct *vma,

return contpte_ptep_test_and_clear_young(vma, addr, ptep);
}
+
+int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int young;
+
+ young = contpte_ptep_test_and_clear_young(vma, addr, ptep);
+
+ if (young) {
+ /*
+ * See comment in __ptep_clear_flush_young(); same rationale for
+ * eliding the trailing DSB applies here.
+ */
+ size_t pgsize;
+ int ncontig;
+
+ ncontig = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep,
+ 0, &pgsize);
+
+ addr = ALIGN_DOWN(addr, ncontig * pgsize);
+ arch_contpte_flush_tlb_range_nosync(vma, addr,
+ addr + ncontig * pgsize,
+ pgsize);
+ }
+
+ return young;
+}
+EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young);
+
+__always_inline int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (likely(!pte_valid_cont(orig_pte)))
+ return __ptep_clear_flush_young(vma, addr, ptep);
+
+ return contpte_ptep_clear_flush_young(vma, addr, ptep);
+}
#endif /* CONFIG_THP_CONTPTE */
--
2.39.2


2024-05-08 19:29:58

by Alexandre Ghiti

Subject: [PATCH 10/12] mm, riscv, arm64: Use common ptep_set_access_flags() function

Make riscv use the contpte-aware ptep_set_access_flags() function from
arm64.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 19 ++--------
arch/arm64/mm/contpte.c | 46 -----------------------
arch/riscv/include/asm/pgtable.h | 10 +++--
include/linux/contpte.h | 3 ++
mm/contpte.c | 63 ++++++++++++++++++++++++++++++++
5 files changed, 76 insertions(+), 65 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 92c12fb85cb4..6591aab11c67 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1391,9 +1391,6 @@ extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
unsigned int nr, int full);
extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr);
-extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t entry, int dirty);

#define pte_batch_hint pte_batch_hint
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
@@ -1512,19 +1509,9 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
}

#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-static inline int ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t entry, int dirty)
-{
- pte_t orig_pte = __ptep_get(ptep);
-
- entry = pte_mknoncont(entry);
-
- if (likely(!pte_valid_cont(orig_pte)))
- return __ptep_set_access_flags(vma, addr, ptep, entry, dirty);
-
- return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty);
-}
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty);

#else /* CONFIG_THP_CONTPTE */

diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 16940511943c..5675a61452ac 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -62,49 +62,3 @@ void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
__wrprotect_ptes(mm, addr, ptep, nr);
}
EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes);
-
-int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep,
- pte_t entry, int dirty)
-{
- unsigned long start_addr;
- pte_t orig_pte;
- int i;
-
- /*
- * Gather the access/dirty bits for the contiguous range. If nothing has
- * changed, its a noop.
- */
- orig_pte = pte_mknoncont(ptep_get(ptep));
- if (pte_val(orig_pte) == pte_val(entry))
- return 0;
-
- /*
- * We can fix up access/dirty bits without having to unfold the contig
- * range. But if the write bit is changing, we must unfold.
- */
- if (pte_write(orig_pte) == pte_write(entry)) {
- /*
- * For HW access management, we technically only need to update
- * the flag on a single pte in the range. But for SW access
- * management, we need to update all the ptes to prevent extra
- * faults. Avoid per-page tlb flush in __ptep_set_access_flags()
- * and instead flush the whole range at the end.
- */
- ptep = arch_contpte_align_down(ptep);
- start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
-
- for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
- __ptep_set_access_flags(vma, addr, ptep, entry, 0);
-
- if (dirty)
- __flush_tlb_range(vma, start_addr, addr,
- PAGE_SIZE, true, 3);
- } else {
- __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
- __ptep_set_access_flags(vma, addr, ptep, entry, dirty);
- }
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 42c7884b8d2e..b151a5aa4de8 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -803,6 +803,10 @@ extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
extern int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);

#else /* CONFIG_THP_CONTPTE */

@@ -816,11 +820,11 @@ extern int ptep_clear_flush_young(struct vm_area_struct *vma,
#define ptep_test_and_clear_young __ptep_test_and_clear_young
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
#define ptep_clear_flush_young __ptep_clear_flush_young
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+#define ptep_set_access_flags __ptep_set_access_flags

#endif /* CONFIG_THP_CONTPTE */

-#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags __ptep_set_access_flags
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define ptep_set_wrprotect __ptep_set_wrprotect

@@ -990,7 +994,7 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp,
pmd_t entry, int dirty)
{
- return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty);
+ return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty);
}

#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
diff --git a/include/linux/contpte.h b/include/linux/contpte.h
index 76a49ac8b6f5..76244b0c678a 100644
--- a/include/linux/contpte.h
+++ b/include/linux/contpte.h
@@ -23,5 +23,8 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
+int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty);

#endif /* _LINUX_CONTPTE_H */
diff --git a/mm/contpte.c b/mm/contpte.c
index 600277b1196c..9cbbff1f67ad 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -769,4 +769,67 @@ __always_inline int ptep_clear_flush_young(struct vm_area_struct *vma,

return contpte_ptep_clear_flush_young(vma, addr, ptep);
}
+
+int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ unsigned long start_addr;
+ pte_t orig_pte;
+ int i;
+
+ /*
+ * Gather the access/dirty bits for the contiguous range. If nothing has
+ * changed, its a noop.
+ */
+ orig_pte = pte_mknoncont(ptep_get(ptep));
+ if (pte_val(orig_pte) == pte_val(entry))
+ return 0;
+
+ /*
+ * We can fix up access/dirty bits without having to unfold the contig
+ * range. But if the write bit is changing, we must unfold.
+ */
+ if (pte_write(orig_pte) == pte_write(entry)) {
+ /*
+ * For HW access management, we technically only need to update
+ * the flag on a single pte in the range. But for SW access
+ * management, we need to update all the ptes to prevent extra
+ * faults. Avoid per-page tlb flush in __ptep_set_access_flags()
+ * and instead flush the whole range at the end.
+ */
+ size_t pgsize;
+ int ncontig;
+
+ ptep = arch_contpte_align_down(ptep);
+ ncontig = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, &pgsize);
+ start_addr = addr = ALIGN_DOWN(addr, ncontig * pgsize);
+
+ for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
+ __ptep_set_access_flags(vma, addr, ptep, entry, 0);
+
+ if (dirty)
+ arch_contpte_flush_tlb_range(vma, start_addr, addr, pgsize);
+ } else {
+ __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
+ __ptep_set_access_flags(vma, addr, ptep, entry, dirty);
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags);
+
+__always_inline int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ entry = pte_mknoncont(entry);
+
+ if (likely(!pte_valid_cont(orig_pte)))
+ return __ptep_set_access_flags(vma, addr, ptep, entry, dirty);
+
+ return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty);
+}
#endif /* CONFIG_THP_CONTPTE */
--
2.39.2


2024-05-08 19:31:51

by Alexandre Ghiti

Subject: [PATCH 11/12] mm, riscv, arm64: Use common ptep_set_wrprotect()/wrprotect_ptes() functions

Make riscv use the contpte-aware ptep_set_wrprotect()/wrprotect_ptes()
functions from arm64.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 56 ++++++------------------
arch/arm64/mm/contpte.c | 18 --------
arch/riscv/include/asm/pgtable.h | 25 +++++++++--
include/linux/contpte.h | 2 +
mm/contpte.c | 75 +++++++++++++++++++++++++++++++-
5 files changed, 110 insertions(+), 66 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 6591aab11c67..162efd9647dd 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1208,7 +1208,11 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

-static inline void ___ptep_set_wrprotect(struct mm_struct *mm,
+/*
+ * __ptep_set_wrprotect - mark read-only while trasferring potential hardware
+ * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
+ */
+static inline void __ptep_set_wrprotect(struct mm_struct *mm,
unsigned long address, pte_t *ptep,
pte_t pte)
{
@@ -1222,23 +1226,13 @@ static inline void ___ptep_set_wrprotect(struct mm_struct *mm,
} while (pte_val(pte) != pte_val(old_pte));
}

-/*
- * __ptep_set_wrprotect - mark read-only while trasferring potential hardware
- * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
- */
-static inline void __ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
-{
- ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep));
-}
-
static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address,
pte_t *ptep, unsigned int nr)
{
unsigned int i;

for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
- __ptep_set_wrprotect(mm, address, ptep);
+ __ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep));
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1246,7 +1240,7 @@ static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address,
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
unsigned long address, pmd_t *pmdp)
{
- __ptep_set_wrprotect(mm, address, (pte_t *)pmdp);
+ __ptep_set_wrprotect(mm, address, (pte_t *)pmdp, __ptep_get((pte_t *)pmdp));
}

#define pmdp_establish pmdp_establish
@@ -1389,8 +1383,6 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
unsigned long addr, pte_t *ptep,
unsigned int nr, int full);
-extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned int nr);

#define pte_batch_hint pte_batch_hint
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
@@ -1478,35 +1470,12 @@ extern int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);

#define wrprotect_ptes wrprotect_ptes
-static __always_inline void wrprotect_ptes(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep, unsigned int nr)
-{
- if (likely(nr == 1)) {
- /*
- * Optimization: wrprotect_ptes() can only be called for present
- * ptes so we only need to check contig bit as condition for
- * unfold, and we can remove the contig bit from the pte we read
- * to avoid re-reading. This speeds up fork() which is sensitive
- * for order-0 folios. Equivalent to contpte_try_unfold().
- */
- pte_t orig_pte = __ptep_get(ptep);
-
- if (unlikely(pte_cont(orig_pte))) {
- __contpte_try_unfold(mm, addr, ptep, orig_pte);
- orig_pte = pte_mknoncont(orig_pte);
- }
- ___ptep_set_wrprotect(mm, addr, ptep, orig_pte);
- } else {
- contpte_wrprotect_ptes(mm, addr, ptep, nr);
- }
-}
+extern void wrprotect_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, unsigned int nr);

#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-static inline void ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- wrprotect_ptes(mm, addr, ptep, 1);
-}
+extern void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);

#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
@@ -1528,7 +1497,8 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma,
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
#define ptep_clear_flush_young __ptep_clear_flush_young
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-#define ptep_set_wrprotect __ptep_set_wrprotect
+#define ptep_set_wrprotect(mm, addr, ptep) \
+ __ptep_set_wrprotect(mm, addr, ptep, __ptep_get(ptep))
#define wrprotect_ptes __wrprotect_ptes
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 5675a61452ac..1cef93b15d6e 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -44,21 +44,3 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
return __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
}
EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);
-
-void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned int nr)
-{
- /*
- * If wrprotecting an entire contig range, we can avoid unfolding. Just
- * set wrprotect and wait for the later mmu_gather flush to invalidate
- * the tlb. Until the flush, the page may or may not be wrprotected.
- * After the flush, it is guaranteed wrprotected. If it's a partial
- * range though, we must unfold, because we can't have a case where
- * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this
- * would cause it to continue to be unpredictable after the flush.
- */
-
- contpte_try_unfold_partial(mm, addr, ptep, nr);
- __wrprotect_ptes(mm, addr, ptep, nr);
-}
-EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index b151a5aa4de8..728f31da5e6a 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -755,11 +755,21 @@ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
}

static inline void __ptep_set_wrprotect(struct mm_struct *mm,
- unsigned long address, pte_t *ptep)
+ unsigned long address, pte_t *ptep,
+ pte_t pte)
{
atomic_long_and(~(unsigned long)_PAGE_WRITE, (atomic_long_t *)ptep);
}

+static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address,
+ pte_t *ptep, unsigned int nr)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
+ __ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep));
+}
+
static inline int __ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
@@ -807,6 +817,12 @@ extern int ptep_clear_flush_young(struct vm_area_struct *vma,
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty);
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+extern void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);
+extern void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr);
+#define wrprotect_ptes wrprotect_ptes

#else /* CONFIG_THP_CONTPTE */

@@ -822,12 +838,13 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma,
#define ptep_clear_flush_young __ptep_clear_flush_young
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+#define ptep_set_wrprotect(mm, addr, ptep) \
+ __ptep_set_wrprotect(mm, addr, ptep, __ptep_get(ptep))
+#define wrprotect_ptes __wrprotect_ptes

#endif /* CONFIG_THP_CONTPTE */

-#define __HAVE_ARCH_PTEP_SET_WRPROTECT
-#define ptep_set_wrprotect __ptep_set_wrprotect
-
#define pgprot_nx pgprot_nx
static inline pgprot_t pgprot_nx(pgprot_t _prot)
{
diff --git a/include/linux/contpte.h b/include/linux/contpte.h
index 76244b0c678a..d1439db1706c 100644
--- a/include/linux/contpte.h
+++ b/include/linux/contpte.h
@@ -26,5 +26,7 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty);
+void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr);

#endif /* _LINUX_CONTPTE_H */
diff --git a/mm/contpte.c b/mm/contpte.c
index 9cbbff1f67ad..fe36b6b1d20a 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -49,6 +49,8 @@
* - ptep_get_and_clear()
* - ptep_test_and_clear_young()
* - ptep_clear_flush_young()
+ * - wrprotect_ptes()
+ * - ptep_set_wrprotect()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -266,7 +268,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
pte_t pte;

if (!pte_cont(__ptep_get(ptep))) {
- __ptep_set_wrprotect(mm, addr, ptep);
+ __ptep_set_wrprotect(mm, addr, ptep, __ptep_get(ptep));
return;
}

@@ -832,4 +834,75 @@ __always_inline int ptep_set_access_flags(struct vm_area_struct *vma,

return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty);
}
+
+static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ /*
+ * Unfold any partially covered contpte block at the beginning and end
+ * of the range.
+ */
+ size_t pgsize;
+ int ncontig;
+
+ ncontig = arch_contpte_get_num_contig(mm, addr, ptep, 0, &pgsize);
+
+ if (ptep != arch_contpte_align_down(ptep) || nr < ncontig)
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+
+ if (ptep + nr != arch_contpte_align_down(ptep + nr)) {
+ unsigned long last_addr = addr + pgsize * (nr - 1);
+ pte_t *last_ptep = ptep + nr - 1;
+
+ contpte_try_unfold(mm, last_addr, last_ptep,
+ __ptep_get(last_ptep));
+ }
+}
+
+void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ /*
+ * If wrprotecting an entire contig range, we can avoid unfolding. Just
+ * set wrprotect and wait for the later mmu_gather flush to invalidate
+ * the tlb. Until the flush, the page may or may not be wrprotected.
+ * After the flush, it is guaranteed wrprotected. If it's a partial
+ * range though, we must unfold, because we can't have a case where
+ * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this
+ * would cause it to continue to be unpredictable after the flush.
+ */
+
+ contpte_try_unfold_partial(mm, addr, ptep, nr);
+ __wrprotect_ptes(mm, addr, ptep, nr);
+}
+EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes);
+
+__always_inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ if (likely(nr == 1)) {
+ /*
+ * Optimization: wrprotect_ptes() can only be called for present
+ * ptes so we only need to check contig bit as condition for
+ * unfold, and we can remove the contig bit from the pte we read
+ * to avoid re-reading. This speeds up fork() which is sensitive
+ * for order-0 folios. Equivalent to contpte_try_unfold().
+ */
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (unlikely(pte_cont(orig_pte))) {
+ __contpte_try_unfold(mm, addr, ptep, orig_pte);
+ orig_pte = pte_mknoncont(orig_pte);
+ }
+ __ptep_set_wrprotect(mm, addr, ptep, orig_pte);
+ } else {
+ contpte_wrprotect_ptes(mm, addr, ptep, nr);
+ }
+}
+
+__always_inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ wrprotect_ptes(mm, addr, ptep, 1);
+}
#endif /* CONFIG_THP_CONTPTE */
--
2.39.2


2024-05-08 19:32:23

by Alexandre Ghiti

Subject: [PATCH 12/12] mm, riscv, arm64: Use common get_and_clear_full_ptes()/clear_full_ptes() functions

Make riscv use the contpte-aware get_and_clear_full_ptes()/clear_full_ptes()
functions from arm64.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 41 ++++------------------------
arch/arm64/mm/Makefile | 1 -
arch/arm64/mm/contpte.c | 46 -------------------------------
arch/riscv/include/asm/pgtable.h | 39 ++++++++++++++++++++++++++
include/linux/contpte.h | 5 ++++
mm/contpte.c | 47 ++++++++++++++++++++++++++++++++
6 files changed, 96 insertions(+), 83 deletions(-)
delete mode 100644 arch/arm64/mm/contpte.c

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 162efd9647dd..f8a3159f9df0 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1373,17 +1373,6 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma,

#ifdef CONFIG_THP_CONTPTE

-/*
- * The contpte APIs are used to transparently manage the contiguous bit in ptes
- * where it is possible and makes sense to do so. The PTE_CONT bit is considered
- * a private implementation detail of the public ptep API (see below).
- */
-extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned int nr, int full);
-extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep,
- unsigned int nr, int full);
-
#define pte_batch_hint pte_batch_hint
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
{
@@ -1428,34 +1417,14 @@ extern void pte_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
#define pte_clear pte_clear

+extern void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full);
#define clear_full_ptes clear_full_ptes
-static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned int nr, int full)
-{
- if (likely(nr == 1)) {
- contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
- __clear_full_ptes(mm, addr, ptep, nr, full);
- } else {
- contpte_clear_full_ptes(mm, addr, ptep, nr, full);
- }
-}

+extern pte_t get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full);
#define get_and_clear_full_ptes get_and_clear_full_ptes
-static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep,
- unsigned int nr, int full)
-{
- pte_t pte;
-
- if (likely(nr == 1)) {
- contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
- pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
- } else {
- pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full);
- }
-
- return pte;
-}

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
extern pte_t ptep_get_and_clear(struct mm_struct *mm,
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index 52a1b2082627..dbd1bc95967d 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -3,7 +3,6 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
cache.o copypage.o flush.o \
ioremap.o mmap.o pgd.o mmu.o \
context.o proc.o pageattr.o fixmap.o
-obj-$(CONFIG_THP_CONTPTE) += contpte.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
deleted file mode 100644
index 1cef93b15d6e..000000000000
--- a/arch/arm64/mm/contpte.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (C) 2023 ARM Ltd.
- */
-
-#include <linux/mm.h>
-#include <linux/efi.h>
-#include <linux/export.h>
-#include <asm/tlbflush.h>
-
-static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned int nr)
-{
- /*
- * Unfold any partially covered contpte block at the beginning and end
- * of the range.
- */
-
- if (ptep != arch_contpte_align_down(ptep) || nr < CONT_PTES)
- contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
-
- if (ptep + nr != arch_contpte_align_down(ptep + nr)) {
- unsigned long last_addr = addr + PAGE_SIZE * (nr - 1);
- pte_t *last_ptep = ptep + nr - 1;
-
- contpte_try_unfold(mm, last_addr, last_ptep,
- __ptep_get(last_ptep));
- }
-}
-
-void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, unsigned int nr, int full)
-{
- contpte_try_unfold_partial(mm, addr, ptep, nr);
- __clear_full_ptes(mm, addr, ptep, nr, full);
-}
-EXPORT_SYMBOL_GPL(contpte_clear_full_ptes);
-
-pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep,
- unsigned int nr, int full)
-{
- contpte_try_unfold_partial(mm, addr, ptep, nr);
- return __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
-}
-EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 728f31da5e6a..a4843bdfdb37 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -754,6 +754,37 @@ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
return pte;
}

+static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ for (;;) {
+ __ptep_get_and_clear(mm, addr, ptep);
+ if (--nr == 0)
+ break;
+ ptep++;
+ addr += PAGE_SIZE;
+ }
+}
+
+static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full)
+{
+ pte_t pte, tmp_pte;
+
+ pte = __ptep_get_and_clear(mm, addr, ptep);
+ while (--nr) {
+ ptep++;
+ addr += PAGE_SIZE;
+ tmp_pte = __ptep_get_and_clear(mm, addr, ptep);
+ if (pte_dirty(tmp_pte))
+ pte = pte_mkdirty(pte);
+ if (pte_young(tmp_pte))
+ pte = pte_mkyoung(pte);
+ }
+ return pte;
+}
+
static inline void __ptep_set_wrprotect(struct mm_struct *mm,
unsigned long address, pte_t *ptep,
pte_t pte)
@@ -823,6 +854,13 @@ extern void ptep_set_wrprotect(struct mm_struct *mm,
extern void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr);
#define wrprotect_ptes wrprotect_ptes
+extern void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full);
+#define clear_full_ptes clear_full_ptes
+extern pte_t get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full);
+#define get_and_clear_full_ptes get_and_clear_full_ptes

#else /* CONFIG_THP_CONTPTE */

@@ -842,6 +880,7 @@ extern void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
#define ptep_set_wrprotect(mm, addr, ptep) \
__ptep_set_wrprotect(mm, addr, ptep, __ptep_get(ptep))
#define wrprotect_ptes __wrprotect_ptes
+#define clear_full_ptes __clear_full_ptes

#endif /* CONFIG_THP_CONTPTE */

diff --git a/include/linux/contpte.h b/include/linux/contpte.h
index d1439db1706c..b24554ebca41 100644
--- a/include/linux/contpte.h
+++ b/include/linux/contpte.h
@@ -28,5 +28,10 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
pte_t entry, int dirty);
void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned int nr);
+void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full);
+pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full);

#endif /* _LINUX_CONTPTE_H */
diff --git a/mm/contpte.c b/mm/contpte.c
index fe36b6b1d20a..677344e0e3c3 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -51,6 +51,8 @@
* - ptep_clear_flush_young()
* - wrprotect_ptes()
* - ptep_set_wrprotect()
+ * - clear_full_ptes()
+ * - get_and_clear_full_ptes()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -905,4 +907,49 @@ __always_inline void ptep_set_wrprotect(struct mm_struct *mm,
{
wrprotect_ptes(mm, addr, ptep, 1);
}
+
+void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ contpte_try_unfold_partial(mm, addr, ptep, nr);
+ __clear_full_ptes(mm, addr, ptep, nr, full);
+}
+EXPORT_SYMBOL_GPL(contpte_clear_full_ptes);
+
+pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full)
+{
+ contpte_try_unfold_partial(mm, addr, ptep, nr);
+ return __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
+}
+EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);
+
+__always_inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr, int full)
+{
+ if (likely(nr == 1)) {
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __clear_full_ptes(mm, addr, ptep, nr, full);
+ } else {
+ contpte_clear_full_ptes(mm, addr, ptep, nr, full);
+ }
+}
+
+__always_inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep,
+ unsigned int nr, int full)
+{
+ pte_t pte;
+
+ if (likely(nr == 1)) {
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
+ } else {
+ pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full);
+ }
+
+ return pte;
+}
+
#endif /* CONFIG_THP_CONTPTE */
--
2.39.2


2024-05-08 19:39:18

by Alexandre Ghiti

Subject: [PATCH 06/12] mm, riscv, arm64: Use common pte_clear() function

Make riscv use the contpte-aware pte_clear() function from arm64.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 9 +++------
arch/riscv/include/asm/pgtable.h | 4 +++-
arch/riscv/mm/init.c | 2 +-
mm/contpte.c | 6 ++++++
4 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index bb6210fb72c8..74e582f2884f 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1439,12 +1439,9 @@ extern void set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr);
#define set_ptes set_ptes

-static inline void pte_clear(struct mm_struct *mm,
- unsigned long addr, pte_t *ptep)
-{
- contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
- __pte_clear(mm, addr, ptep);
-}
+extern void pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep);
+#define pte_clear pte_clear

#define clear_full_ptes clear_full_ptes
static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 4f8f673787e7..41534f4b8a6d 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -792,18 +792,20 @@ extern void set_ptes(struct mm_struct *mm, unsigned long addr,
#define set_ptes set_ptes
extern void set_pte(pte_t *ptep, pte_t pte);
#define set_pte set_pte
+extern void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+#define pte_clear pte_clear

#else /* CONFIG_THP_CONTPTE */

#define ptep_get __ptep_get
#define set_ptes __set_ptes
#define set_pte __set_pte
+#define pte_clear __pte_clear

#endif /* CONFIG_THP_CONTPTE */

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define ptep_get_and_clear __ptep_get_and_clear
-#define pte_clear __pte_clear
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index bb5c6578204c..c82f17b3060b 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -327,7 +327,7 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot)
if (pgprot_val(prot))
__set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, prot));
else
- pte_clear(&init_mm, addr, ptep);
+ __pte_clear(&init_mm, addr, ptep);
local_flush_tlb_page(addr);
}

diff --git a/mm/contpte.c b/mm/contpte.c
index 543ae5b5a863..c9eff6426ca0 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -45,6 +45,7 @@
* - set_ptes()
* - ptep_get_lockless()
* - set_pte()
+ * - pte_clear()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -676,4 +677,9 @@ void set_pte(pte_t *ptep, pte_t pte)
__set_pte(ptep, pte_mknoncont(pte));
}

+void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __pte_clear(mm, addr, ptep);
+}
#endif /* CONFIG_THP_CONTPTE */
--
2.39.2


2024-05-08 19:45:27

by Alexandre Ghiti

Subject: [PATCH 08/12] mm, riscv, arm64: Use common ptep_test_and_clear_young() function

Make riscv use the contpte-aware ptep_test_and_clear_young() function from
arm64.

Signed-off-by: Alexandre Ghiti <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 14 ++----------
arch/arm64/mm/contpte.c | 25 --------------------
arch/riscv/include/asm/pgtable.h | 12 ++++++----
arch/riscv/kvm/mmu.c | 2 +-
arch/riscv/mm/pgtable.c | 2 +-
include/linux/contpte.h | 2 ++
mm/contpte.c | 39 ++++++++++++++++++++++++++++++++
7 files changed, 53 insertions(+), 43 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index ff7fe1d9cabe..9a8702d1ad00 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1389,8 +1389,6 @@ extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
unsigned long addr, pte_t *ptep,
unsigned int nr, int full);
-extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep);
extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
@@ -1477,16 +1475,8 @@ extern pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- pte_t orig_pte = __ptep_get(ptep);
-
- if (likely(!pte_valid_cont(orig_pte)))
- return __ptep_test_and_clear_young(vma, addr, ptep);
-
- return contpte_ptep_test_and_clear_young(vma, addr, ptep);
-}
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 5e9e40145085..9bf471633ca4 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -45,31 +45,6 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);

-int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- /*
- * ptep_clear_flush_young() technically requires us to clear the access
- * flag for a _single_ pte. However, the core-mm code actually tracks
- * access/dirty per folio, not per page. And since we only create a
- * contig range when the range is covered by a single folio, we can get
- * away with clearing young for the whole contig range here, so we avoid
- * having to unfold.
- */
-
- int young = 0;
- int i;
-
- ptep = arch_contpte_align_down(ptep);
- addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
-
- for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
- young |= __ptep_test_and_clear_young(vma, addr, ptep);
-
- return young;
-}
-EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);
-
int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 03cd640137ed..d39cb24c6c4a 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -739,8 +739,7 @@ static inline void __pte_clear(struct mm_struct *mm,

extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep, pte_t entry, int dirty);
-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG /* defined in mm/pgtable.c */
-extern int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address,
+extern int __ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep);

static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
@@ -778,7 +777,7 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
* shouldn't really matter because there's no real memory
* pressure for swapout to react to. ]
*/
- return ptep_test_and_clear_young(vma, address, ptep);
+ return __ptep_test_and_clear_young(vma, address, ptep);
}

#ifdef CONFIG_THP_CONTPTE
@@ -797,6 +796,9 @@ extern void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
extern pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);

#else /* CONFIG_THP_CONTPTE */

@@ -806,6 +808,8 @@ extern pte_t ptep_get_and_clear(struct mm_struct *mm,
#define pte_clear __pte_clear
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define ptep_get_and_clear __ptep_get_and_clear
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+#define ptep_test_and_clear_young __ptep_test_and_clear_young

#endif /* CONFIG_THP_CONTPTE */

@@ -987,7 +991,7 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp)
{
- return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp);
+ return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp);
}

#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 1ee6139d495f..554926e33760 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -585,7 +585,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
&ptep, &ptep_level))
return false;

- return ptep_test_and_clear_young(NULL, 0, ptep);
+ return __ptep_test_and_clear_young(NULL, 0, ptep);
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 5756bde9eb42..5f31d0594109 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -18,7 +18,7 @@ int __ptep_set_access_flags(struct vm_area_struct *vma,
return true;
}

-int ptep_test_and_clear_young(struct vm_area_struct *vma,
+int __ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pte_t *ptep)
{
diff --git a/include/linux/contpte.h b/include/linux/contpte.h
index 01da4bfc3af6..38092adbe0d4 100644
--- a/include/linux/contpte.h
+++ b/include/linux/contpte.h
@@ -19,5 +19,7 @@ void contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr);
+int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);

#endif /* _LINUX_CONTPTE_H */
diff --git a/mm/contpte.c b/mm/contpte.c
index 5bf939639233..220e9d81f401 100644
--- a/mm/contpte.c
+++ b/mm/contpte.c
@@ -47,6 +47,7 @@
* - set_pte()
* - pte_clear()
* - ptep_get_and_clear()
+ * - ptep_test_and_clear_young()
*/

pte_t huge_ptep_get(pte_t *ptep)
@@ -690,4 +691,42 @@ pte_t ptep_get_and_clear(struct mm_struct *mm,
contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
return __ptep_get_and_clear(mm, addr, ptep);
}
+
+int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ /*
+ * ptep_clear_flush_young() technically requires us to clear the access
+ * flag for a _single_ pte. However, the core-mm code actually tracks
+ * access/dirty per folio, not per page. And since we only create a
+ * contig range when the range is covered by a single folio, we can get
+ * away with clearing young for the whole contig range here, so we avoid
+ * having to unfold.
+ */
+
+ size_t pgsize;
+ int young = 0;
+ int i, ncontig;
+
+ ptep = arch_contpte_align_down(ptep);
+ ncontig = arch_contpte_get_num_contig(vma->vm_mm, addr, ptep, 0, &pgsize);
+ addr = ALIGN_DOWN(addr, ncontig * pgsize);
+
+ for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
+ young |= __ptep_test_and_clear_young(vma, addr, ptep);
+
+ return young;
+}
+EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);
+
+__always_inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (likely(!pte_valid_cont(orig_pte)))
+ return __ptep_test_and_clear_young(vma, addr, ptep);
+
+ return contpte_ptep_test_and_clear_young(vma, addr, ptep);
+}
#endif /* CONFIG_THP_CONTPTE */
--
2.39.2