Hi All,
This patchset adds support for lazyfreeing multi-size THP (mTHP) without
needing to first split the large folio via split_folio(). However, we
still need to split a large folio that is not fully mapped within the
target range.
If a large folio is locked or shared, or if we fail to split it, we just
leave it in place and advance to the next PTE in the range. But note that
the behavior is changed; previously, any failure of this sort would cause
the entire operation to give up. As large folios become more common,
sticking to the old way could result in wasted opportunities.
Performance Testing
===================
On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
the same size results in the following runtimes for madvise(MADV_FREE)
in seconds (shorter is better):
Folio Size | Old | New | Change
------------------------------------------
4KiB | 0.590251 | 0.590259 | 0%
16KiB | 2.990447 | 0.185655 | -94%
32KiB | 2.547831 | 0.104870 | -95%
64KiB | 2.457796 | 0.052812 | -97%
128KiB | 2.281034 | 0.032777 | -99%
256KiB | 2.230387 | 0.017496 | -99%
512KiB | 2.189106 | 0.010781 | -99%
1024KiB | 2.183949 | 0.007753 | -99%
2048KiB | 0.002799 | 0.002804 | 0%
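The measurement can be reproduced with a simple user-space program along the
lines of the sketch below (not the exact harness used for the table above; the
mTHP size is assumed to be selected beforehand via the per-size sysfs controls
under /sys/kernel/mm/transparent_hugepage/):

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <time.h>

	#define SIZE (1UL << 30)	/* 1GiB VMA */

	int main(void)
	{
		struct timespec t0, t1;

		/* Populate an anonymous mapping so it is backed by folios. */
		char *buf = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (buf == MAP_FAILED)
			return 1;
		memset(buf, 1, SIZE);

		clock_gettime(CLOCK_MONOTONIC, &t0);
		madvise(buf, SIZE, MADV_FREE);	/* the operation being timed */
		clock_gettime(CLOCK_MONOTONIC, &t1);

		printf("%f\n", (t1.tv_sec - t0.tv_sec) +
			       (t1.tv_nsec - t0.tv_nsec) / 1e9);
		return 0;
	}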
---
This patchset applies against mm-unstable (d4cd6840d1dc).
The performance numbers are from v2. I did a quick benchmark run of v4 and
nothing significantly changed.
Changes since v3 [3]
====================
- Rename refresh_full_ptes -> mkold_clean_ptes (per Ryan Roberts)
- Override mkold_clean_ptes() for arm64 to make it faster (per Ryan Roberts)
- Update the changelog
Changes since v2 [2]
====================
- Only skip all the PTEs for nr_pages when the number of batched PTEs matches
nr_pages (per Barry Song)
- Change folio_pte_batch() to consume optional *any_dirty and *any_young
  parameters (per David Hildenbrand)
- Move the ptep_get_and_clear_full() loop into refresh_full_ptes() (per
David Hildenbrand)
- Follow a similar pattern for madvise_free_pte_range() (per Ryan Roberts)
Changes since v1 [1]
====================
- Update the performance numbers
- Update the changelog (per Ryan Roberts)
- Check the COW folio (per Yin Fengwei)
- Check if we are mapping all subpages (per Barry Song, David Hildenbrand,
Ryan Roberts)
[1] https://lore.kernel.org/linux-mm/[email protected]
[2] https://lore.kernel.org/linux-mm/[email protected]
[3] https://lore.kernel.org/linux-mm/[email protected]
Thanks,
Lance
Lance Yang (2):
mm/madvise: introduce mkold_clean_ptes() batch helper
mm/madvise: optimize lazyfreeing with mTHP in madvise_free
arch/arm64/include/asm/pgtable.h | 36 +++++++++++++++++
arch/arm64/mm/contpte.c | 10 +++++
include/linux/pgtable.h | 30 ++++++++++++++
mm/internal.h | 12 +++++-
mm/madvise.c | 147 ++++++++++++++++++++++++++++++++++++--
mm/memory.c | 4 +-
6 files changed, 164 insertions(+), 75 deletions(-)
--
2.33.1
Change the code that clears young and dirty bits from the PTEs to use
ptep_get_and_clear_full() and set_pte_at(), via the new mkold_clean_ptes()
batch helper function.
Unfortunately, the per-pte get_and_clear/modify/set approach would result
in unfolding/refolding for contpte mappings on arm64. So we need to
override mkold_clean_ptes() for arm64 to avoid it.
Suggested-by: David Hildenbrand <[email protected]>
Suggested-by: Barry Song <[email protected]>
Suggested-by: Ryan Roberts <[email protected]>
Signed-off-by: Lance Yang <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 36 ++++++++++++++++++++++++++++++++
arch/arm64/mm/contpte.c | 10 +++++++++
include/linux/pgtable.h | 30 ++++++++++++++++++++++++++
3 files changed, 76 insertions(+)
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 9fd8613b2db2..b032c107090c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1086,6 +1086,27 @@ static inline bool pud_user_accessible_page(pud_t pud)
}
#endif
+static inline void __mkold_clean_pte(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t old_pte, pte;
+
+ pte = __ptep_get(ptep);
+ do {
+ old_pte = pte;
+ pte = pte_mkclean(pte_mkold(pte));
+ pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
+ pte_val(old_pte), pte_val(pte));
+ } while (pte_val(pte) != pte_val(old_pte));
+}
+
+static inline void mkold_clean_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+ for (; nr-- > 0; ptep++, addr += PAGE_SIZE)
+ __mkold_clean_pte(vma, addr, ptep);
+}
+
/*
* Atomic pte/pmd modifications.
*/
@@ -1379,6 +1400,8 @@ extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty);
+extern void contpte_ptep_mkold_clean(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
static __always_inline void contpte_try_fold(struct mm_struct *mm,
unsigned long addr, pte_t *ptep, pte_t pte)
@@ -1603,6 +1626,18 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty);
}
+#define mkold_clean_ptes mkold_clean_ptes
+static inline void mkold_clean_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (likely(!pte_valid_cont(orig_pte)))
+ return __mkold_clean_ptes(vma, addr, ptep, nr, full);
+
+ return contpte_ptep_mkold_clean(vma, addr, ptep);
+}
+
#else /* CONFIG_ARM64_CONTPTE */
#define ptep_get __ptep_get
@@ -1622,6 +1657,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
#define wrprotect_ptes __wrprotect_ptes
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags
+#define mkold_clean_ptes __mkold_clean_ptes
#endif /* CONFIG_ARM64_CONTPTE */
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 1b64b4c3f8bf..560622cfb2a9 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -322,6 +322,16 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
}
EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);
+void contpte_ptep_mkold_clean(struct vm_area_struct *vma, unsigned long addr,
+ pte_t *ptep)
+{
+ ptep = contpte_align_down(ptep);
+ addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+
+ __mkold_clean_ptes(vma, addr, ptep, CONT_PTES, 0);
+}
+EXPORT_SYMBOL_GPL(contpte_ptep_mkold_clean);
+
int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index fa8f92f6e2d7..fd30779fe487 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -391,6 +391,36 @@ static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr,
}
#endif
+#ifndef mkold_clean_ptes
+/**
+ * mkold_clean_ptes - Mark PTEs that map consecutive pages of the same folio
+ * as old and clean.
+ * @vma: VMA the pages are mapped into.
+ * @addr: Address the first page is mapped at.
+ * @ptep: Page table pointer for the first entry.
+ * @nr: Number of entries to mark old and clean.
+ *
+ * May be overridden by the architecture; otherwise, implemented as a simple
+ * loop over ptep_get_and_clear_full().
+ *
+ * Note that PTE bits in the PTE range besides the PFN can differ. For example,
+ * some PTEs might be write-protected.
+ *
+ * Context: The caller holds the page table lock. The PTEs map consecutive
+ * pages that belong to the same folio. The PTEs are all in the same PMD.
+ */
+static inline void mkold_clean_ptes(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep, unsigned int nr, int full)
+{
+ pte_t pte;
+
+ for (; nr-- > 0; ptep++, addr += PAGE_SIZE) {
+ pte = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
+ set_pte_at(vma->vm_mm, addr, ptep, pte_mkclean(pte_mkold(pte)));
+ }
+}
+#endif
+
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
--
2.33.1
This patch optimizes lazyfreeing with PTE-mapped mTHP[1]
(Inspired by David Hildenbrand[2]). We aim to avoid unnecessary folio
splitting if the large folio is fully mapped within the target range.
If a large folio is locked or shared, or if we fail to split it, we just
leave it in place and advance to the next PTE in the range. But note that
the behavior is changed; previously, any failure of this sort would cause
the entire operation to give up. As large folios become more common,
sticking to the old way could result in wasted opportunities.
On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
the same size results in the following runtimes for madvise(MADV_FREE) in
seconds (shorter is better):
Folio Size | Old | New | Change
------------------------------------------
4KiB | 0.590251 | 0.590259 | 0%
16KiB | 2.990447 | 0.185655 | -94%
32KiB | 2.547831 | 0.104870 | -95%
64KiB | 2.457796 | 0.052812 | -97%
128KiB | 2.281034 | 0.032777 | -99%
256KiB | 2.230387 | 0.017496 | -99%
512KiB | 2.189106 | 0.010781 | -99%
1024KiB | 2.183949 | 0.007753 | -99%
2048KiB | 0.002799 | 0.002804 | 0%
[1] https://lkml.kernel.org/r/[email protected]
[2] https://lore.kernel.org/linux-mm/[email protected]
Signed-off-by: Lance Yang <[email protected]>
---
mm/internal.h | 12 ++++-
mm/madvise.c | 147 ++++++++++++++++++++++++++------------------------
mm/memory.c | 4 +-
3 files changed, 88 insertions(+), 75 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 3df06a152ff0..cdc6e2162b30 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -132,6 +132,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
* first one is writable.
* @any_young: Optional pointer to indicate whether any entry except the
* first one is young.
+ * @any_dirty: Optional pointer to indicate whether any entry except the
+ * first one is dirty.
*
* Detect a PTE batch: consecutive (present) PTEs that map consecutive
* pages of the same large folio.
@@ -147,18 +149,20 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
*/
static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
- bool *any_writable, bool *any_young)
+ bool *any_writable, bool *any_young, bool *any_dirty)
{
unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
const pte_t *end_ptep = start_ptep + max_nr;
pte_t expected_pte, *ptep;
- bool writable, young;
+ bool writable, young, dirty;
int nr;
if (any_writable)
*any_writable = false;
if (any_young)
*any_young = false;
+ if (any_dirty)
+ *any_dirty = false;
VM_WARN_ON_FOLIO(!pte_present(pte), folio);
VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
@@ -174,6 +178,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
writable = !!pte_write(pte);
if (any_young)
young = !!pte_young(pte);
+ if (any_dirty)
+ dirty = !!pte_dirty(pte);
pte = __pte_batch_clear_ignored(pte, flags);
if (!pte_same(pte, expected_pte))
@@ -191,6 +197,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
*any_writable |= writable;
if (any_young)
*any_young |= young;
+ if (any_dirty)
+ *any_dirty |= dirty;
nr = pte_batch_hint(ptep, pte);
expected_pte = pte_advance_pfn(expected_pte, nr);
diff --git a/mm/madvise.c b/mm/madvise.c
index bd00b83e7c50..8197effd9f14 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -321,6 +321,38 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
file_permission(vma->vm_file, MAY_WRITE) == 0;
}
+static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
+ struct folio *folio, pte_t *pte,
+ bool *any_writable, bool *any_young, bool *any_dirty)
+{
+ int max_nr = (end - addr) / PAGE_SIZE;
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+
+ return folio_pte_batch(folio, addr, pte, ptep_get(pte), max_nr,
+ fpb_flags, any_writable, any_young, any_dirty);
+}
+
+static inline bool madvise_pte_split_folio(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long addr, struct folio *folio, pte_t **pte,
+ spinlock_t **ptl)
+{
+ int err;
+
+ if (!folio_trylock(folio))
+ return false;
+
+ folio_get(folio);
+ pte_unmap_unlock(*pte, *ptl);
+ *pte = NULL;
+ err = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+
+ *pte = pte_offset_map_lock(mm, pmd, addr, ptl);
+
+ return err == 0;
+}
+
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
@@ -456,40 +488,26 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
* next pte in the range.
*/
if (folio_test_large(folio)) {
- const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
- FPB_IGNORE_SOFT_DIRTY;
- int max_nr = (end - addr) / PAGE_SIZE;
bool any_young;
-
- nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
- fpb_flags, NULL, &any_young);
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ NULL, &any_young, NULL);
if (any_young)
ptent = pte_mkyoung(ptent);
if (nr < folio_nr_pages(folio)) {
- int err;
-
if (folio_likely_mapped_shared(folio))
continue;
if (pageout_anon_only_filter && !folio_test_anon(folio))
continue;
- if (!folio_trylock(folio))
- continue;
- folio_get(folio);
+
arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- continue;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
+ if (madvise_pte_split_folio(mm, pmd, addr,
+ folio, &start_pte, &ptl))
+ nr = 0;
if (!start_pte)
break;
+ pte = start_pte;
arch_enter_lazy_mmu_mode();
- nr = 0;
continue;
}
}
@@ -688,72 +706,59 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
/*
- * If pmd isn't transhuge but the folio is large and
- * is owned by only this process, split it and
- * deactivate all pages.
+ * If we encounter a large folio, only split it if it is not
+ * fully mapped within the range we are operating on. Otherwise
+ * leave it as is so that it can be marked as lazyfree. If we
+ * fail to split a folio, leave it in place and advance to the
+ * next pte in the range.
*/
if (folio_test_large(folio)) {
- int err;
+ bool any_young, any_dirty;
+ nr = madvise_folio_pte_batch(addr, end, folio, pte,
+ NULL, &any_young, &any_dirty);
+ if (any_young || any_dirty)
+ ptent = pte_mkdirty(pte_mkyoung(ptent));
- if (folio_likely_mapped_shared(folio))
- break;
- if (!folio_trylock(folio))
- break;
- folio_get(folio);
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(start_pte, ptl);
- start_pte = NULL;
- err = split_folio(folio);
- folio_unlock(folio);
- folio_put(folio);
- if (err)
- break;
- start_pte = pte =
- pte_offset_map_lock(mm, pmd, addr, &ptl);
- if (!start_pte)
- break;
- arch_enter_lazy_mmu_mode();
- pte--;
- addr -= PAGE_SIZE;
- continue;
- }
+ if (nr < folio_nr_pages(folio)) {
+ if (folio_likely_mapped_shared(folio))
+ continue;
- if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
- if (!folio_trylock(folio))
- continue;
- /*
- * If folio is shared with others, we mustn't clear
- * the folio's dirty flag.
- */
- if (folio_mapcount(folio) != 1) {
- folio_unlock(folio);
+ arch_leave_lazy_mmu_mode();
+ if (madvise_pte_split_folio(mm, pmd, addr,
+ folio, &start_pte, &ptl))
+ nr = 0;
+ if (!start_pte)
+ break;
+ pte = start_pte;
+ arch_enter_lazy_mmu_mode();
continue;
}
+ }
+ if (!folio_trylock(folio))
+ continue;
+ /*
+ * If we have a large folio at this point, we know it is fully mapped
+ * so if its mapcount is the same as its number of pages, it must be
+ * exclusive.
+ */
+ if (folio_mapcount(folio) != folio_nr_pages(folio)) {
+ folio_unlock(folio);
+ continue;
+ }
+ if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
if (folio_test_swapcache(folio) &&
!folio_free_swap(folio)) {
folio_unlock(folio);
continue;
}
-
folio_clear_dirty(folio);
- folio_unlock(folio);
}
+ folio_unlock(folio);
if (pte_young(ptent) || pte_dirty(ptent)) {
- /*
- * Some of architecture(ex, PPC) don't update TLB
- * with set_pte_at and tlb_remove_tlb_entry so for
- * the portability, remap the pte with old|clean
- * after pte clearing.
- */
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
-
- ptent = pte_mkold(ptent);
- ptent = pte_mkclean(ptent);
- set_pte_at(mm, addr, pte, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
+ mkold_clean_ptes(vma, addr, pte, nr, tlb->fullmm);
+ tlb_remove_tlb_entries(tlb, pte, nr, addr);
}
folio_mark_lazyfree(folio);
}
diff --git a/mm/memory.c b/mm/memory.c
index 912cd738ec03..24769ecb59e5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -989,7 +989,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
flags |= FPB_IGNORE_SOFT_DIRTY;
nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
- &any_writable, NULL);
+ &any_writable, NULL, NULL);
folio_ref_add(folio, nr);
if (folio_test_anon(folio)) {
if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
@@ -1559,7 +1559,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
*/
if (unlikely(folio_test_large(folio) && max_nr != 1)) {
nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
- NULL, NULL);
+ NULL, NULL, NULL);
zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
addr, details, rss, force_flush,
--
2.33.1
On 02/04/2024 13:40, Lance Yang wrote:
> Change the code that clears young and dirty bits from the PTEs to use
> ptep_get_and_clear_full() and set_pte_at(), via the new mkold_clean_ptes()
> batch helper function.
>
> Unfortunately, the per-pte get_and_clear/modify/set approach would result
> in unfolding/refolding for contpte mappings on arm64. So we need to
> override mkold_clean_ptes() for arm64 to avoid it.
>
> Suggested-by: David Hildenbrand <[email protected]>
> Suggested-by: Barry Song <[email protected]>
> Suggested-by: Ryan Roberts <[email protected]>
> Signed-off-by: Lance Yang <[email protected]>
> ---
> arch/arm64/include/asm/pgtable.h | 36 ++++++++++++++++++++++++++++++++
> arch/arm64/mm/contpte.c | 10 +++++++++
> include/linux/pgtable.h | 30 ++++++++++++++++++++++++++
> 3 files changed, 76 insertions(+)
When I suggested splitting out this patch, this wasn't quite what I meant.
Instead I intended for the first patch to make the MADV_FREE change and as part
of that introduce mkold_clean_ptes() with a generic implementation. Then a
second patch would do the arm64-specific specialization of mkold_clean_ptes()
for an arm64-specific performance boost. In general it is often useful to
separate core changes from arch changes where practical since they have
different target audiences.
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 9fd8613b2db2..b032c107090c 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -1086,6 +1086,27 @@ static inline bool pud_user_accessible_page(pud_t pud)
> }
> #endif
>
> +static inline void __mkold_clean_pte(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep)
> +{
> + pte_t old_pte, pte;
> +
> + pte = __ptep_get(ptep);
> + do {
> + old_pte = pte;
> + pte = pte_mkclean(pte_mkold(pte));
> + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
> + pte_val(old_pte), pte_val(pte));
> + } while (pte_val(pte) != pte_val(old_pte));
> +}
I'm not sure how valuable it is to split this out, given we are only exposing a
batched version to the core. Perhaps just include the body in mkold_clean_ptes()?
> +
> +static inline void mkold_clean_ptes(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep, unsigned int nr, int full)
> +{
> + for (; nr-- > 0; ptep++, addr += PAGE_SIZE)
> + __mkold_clean_pte(vma, addr, ptep);
> +}
> +
> /*
> * Atomic pte/pmd modifications.
> */
> @@ -1379,6 +1400,8 @@ extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
> extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep,
> pte_t entry, int dirty);
> +extern void contpte_ptep_mkold_clean(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep);
Let's have this follow the same naming pattern; contpte_mkold_clean_ptes()
>
> static __always_inline void contpte_try_fold(struct mm_struct *mm,
> unsigned long addr, pte_t *ptep, pte_t pte)
> @@ -1603,6 +1626,18 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
> return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty);
> }
>
> +#define mkold_clean_ptes mkold_clean_ptes
> +static inline void mkold_clean_ptes(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep, unsigned int nr, int full)
> +{
> + pte_t orig_pte = __ptep_get(ptep);
> +
> + if (likely(!pte_valid_cont(orig_pte)))
> + return __mkold_clean_ptes(vma, addr, ptep, nr, full);
> +
> +	return contpte_ptep_mkold_clean(vma, addr, ptep);
> +}
This function is totally broken as far as I can tell. You are assuming if the
first pte is not cont, then the whole range is not cont; you can't assume that.
Then if you decide it is cont, you ignore nr and only fixup a single contpte block.
Take a look at wrprotect_ptes() or another one of the batched helpers and follow
that pattern.
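For illustration only, here is a rough sketch of what following that pattern
might look like; __mkold_clean_ptes() is the per-pte fallback and
contpte_mkold_clean_ptes() is assumed to live in contpte.c and to handle
partial contpte blocks at either end of the range (neither is shown here, and
the "full" parameter is dropped as suggested further down):

	#define mkold_clean_ptes mkold_clean_ptes
	static inline void mkold_clean_ptes(struct vm_area_struct *vma,
			unsigned long addr, pte_t *ptep, unsigned int nr)
	{
		if (likely(nr == 1)) {
			/* Single entry: unfold if needed, then update one pte. */
			contpte_try_unfold(vma->vm_mm, addr, ptep, __ptep_get(ptep));
			__mkold_clean_ptes(vma, addr, ptep, 1);
		} else {
			/* Let the contpte-aware helper cover the whole range. */
			contpte_mkold_clean_ptes(vma, addr, ptep, nr);
		}
	}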
> +
> #else /* CONFIG_ARM64_CONTPTE */
>
> #define ptep_get __ptep_get
> @@ -1622,6 +1657,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
> #define wrprotect_ptes __wrprotect_ptes
> #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
> #define ptep_set_access_flags __ptep_set_access_flags
> +#define mkold_clean_ptes __mkold_clean_ptes
>
> #endif /* CONFIG_ARM64_CONTPTE */
>
> diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
> index 1b64b4c3f8bf..560622cfb2a9 100644
> --- a/arch/arm64/mm/contpte.c
> +++ b/arch/arm64/mm/contpte.c
> @@ -322,6 +322,16 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> }
> EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);
>
> +void contpte_ptep_mkold_clean(struct vm_area_struct *vma, unsigned long addr,
> + pte_t *ptep)
> +{
> + ptep = contpte_align_down(ptep);
> + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
> +
> + __mkold_clean_ptes(vma, addr, ptep, CONT_PTES, 0);
As above, this is broken as is.
> +}
> +EXPORT_SYMBOL_GPL(contpte_ptep_mkold_clean);
> +
> int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep)
> {
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index fa8f92f6e2d7..fd30779fe487 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -391,6 +391,36 @@ static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr,
> }
> #endif
>
> +#ifndef mkold_clean_ptes
> +/**
> + * mkold_clean_ptes - Mark PTEs that map consecutive pages of the same folio
> + * as old and clean.
> + * @vma: VMA the pages are mapped into.
> + * @addr: Address the first page is mapped at.
> + * @ptep: Page table pointer for the first entry.
> + * @nr: Number of entries to mark old and clean.
> + *
> + * May be overridden by the architecture; otherwise, implemented as a simple
> + * loop over ptep_get_and_clear_full().
How about "otherwise, implemented by get_and_clear/modify/set for each pte in
the range."?
> + *
> + * Note that PTE bits in the PTE range besides the PFN can differ. For example,
> + * some PTEs might be write-protected.
> + *
> + * Context: The caller holds the page table lock. The PTEs map consecutive
> + * pages that belong to the same folio. The PTEs are all in the same PMD.
> + */
> +static inline void mkold_clean_ptes(struct vm_area_struct *vma,
> + unsigned long addr, pte_t *ptep, unsigned int nr, int full)
You've included the "full" param in the function but not in the docs. I don't
think the full param is valuable though; it doesn't make sense to call this
function when tearing down a process. So drop the parameter and just call
ptep_get_and_clear().
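For example, the generic fallback might then look something like this (just a
sketch of the suggested shape, using the same loop pattern as the other batch
helpers):

	static inline void mkold_clean_ptes(struct vm_area_struct *vma,
			unsigned long addr, pte_t *ptep, unsigned int nr)
	{
		for (;;) {
			/* get_and_clear/modify/set for each pte in the range. */
			pte_t pte = ptep_get_and_clear(vma->vm_mm, addr, ptep);

			set_pte_at(vma->vm_mm, addr, ptep,
				   pte_mkclean(pte_mkold(pte)));
			if (--nr == 0)
				break;
			ptep++;
			addr += PAGE_SIZE;
		}
	}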
> +{
> + pte_t pte;
> +
> + for (; nr-- > 0; ptep++, addr += PAGE_SIZE) {
nit: This is using a different pattern to all the other batch helpers
(set_ptes(), wrprotect_ptes(), clear_full_ptes()).
> + pte = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
> + set_pte_at(vma->vm_mm, addr, ptep, pte_mkclean(pte_mkold(pte)));
> + }
> +}
> +#endif
> +
> #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
> #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
> static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
On 02/04/2024 13:40, Lance Yang wrote:
> This patch optimizes lazyfreeing with PTE-mapped mTHP[1]
> (Inspired by David Hildenbrand[2]). We aim to avoid unnecessary folio
> splitting if the large folio is fully mapped within the target range.
>
> If a large folio is locked or shared, or if we fail to split it, we just
> leave it in place and advance to the next PTE in the range. But note that
> the behavior is changed; previously, any failure of this sort would cause
> the entire operation to give up. As large folios become more common,
> sticking to the old way could result in wasted opportunities.
>
> On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
> the same size results in the following runtimes for madvise(MADV_FREE) in
> seconds (shorter is better):
>
> Folio Size | Old | New | Change
> ------------------------------------------
> 4KiB | 0.590251 | 0.590259 | 0%
> 16KiB | 2.990447 | 0.185655 | -94%
> 32KiB | 2.547831 | 0.104870 | -95%
> 64KiB | 2.457796 | 0.052812 | -97%
> 128KiB | 2.281034 | 0.032777 | -99%
> 256KiB | 2.230387 | 0.017496 | -99%
> 512KiB | 2.189106 | 0.010781 | -99%
> 1024KiB | 2.183949 | 0.007753 | -99%
> 2048KiB | 0.002799 | 0.002804 | 0%
I'm guessing the reason that 2M is not showing any change is because it's
PMD-mapped and splitting is already elided? If you were to force it to be
PTE-mapped then you'll see the very impressive speed-up there too. Don't worry
about doing that on my account though; these results are already sufficient IMHO.
>
> [1] https://lkml.kernel.org/r/[email protected]
> [2] https://lore.kernel.org/linux-mm/[email protected]
>
> Signed-off-by: Lance Yang <[email protected]>
> ---
> mm/internal.h | 12 ++++-
> mm/madvise.c | 147 ++++++++++++++++++++++++++------------------------
> mm/memory.c | 4 +-
> 3 files changed, 88 insertions(+), 75 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 3df06a152ff0..cdc6e2162b30 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -132,6 +132,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
> * first one is writable.
> * @any_young: Optional pointer to indicate whether any entry except the
> * first one is young.
> + * @any_dirty: Optional pointer to indicate whether any entry except the
> + * first one is dirty.
> *
> * Detect a PTE batch: consecutive (present) PTEs that map consecutive
> * pages of the same large folio.
> @@ -147,18 +149,20 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
> */
> static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
> - bool *any_writable, bool *any_young)
> + bool *any_writable, bool *any_young, bool *any_dirty)
> {
> unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
> const pte_t *end_ptep = start_ptep + max_nr;
> pte_t expected_pte, *ptep;
> - bool writable, young;
> + bool writable, young, dirty;
> int nr;
>
> if (any_writable)
> *any_writable = false;
> if (any_young)
> *any_young = false;
> + if (any_dirty)
> + *any_dirty = false;
>
> VM_WARN_ON_FOLIO(!pte_present(pte), folio);
> VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
> @@ -174,6 +178,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> writable = !!pte_write(pte);
> if (any_young)
> young = !!pte_young(pte);
> + if (any_dirty)
> + dirty = !!pte_dirty(pte);
> pte = __pte_batch_clear_ignored(pte, flags);
>
> if (!pte_same(pte, expected_pte))
> @@ -191,6 +197,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> *any_writable |= writable;
> if (any_young)
> *any_young |= young;
> + if (any_dirty)
> + *any_dirty |= dirty;
>
> nr = pte_batch_hint(ptep, pte);
> expected_pte = pte_advance_pfn(expected_pte, nr);
> diff --git a/mm/madvise.c b/mm/madvise.c
> index bd00b83e7c50..8197effd9f14 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -321,6 +321,38 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
> file_permission(vma->vm_file, MAY_WRITE) == 0;
> }
>
> +static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
> + struct folio *folio, pte_t *pte,
> + bool *any_writable, bool *any_young, bool *any_dirty)
any_writable is always NULL. Do you need it?
> +{
> + int max_nr = (end - addr) / PAGE_SIZE;
> + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> +
> + return folio_pte_batch(folio, addr, pte, ptep_get(pte), max_nr,
ptep_get() was problematic for performance of the order-0 folio case when we
batched fork(). So we are deliberately passing around the value we already read
in the main loop. Granted this case is not so performance critical because we
only end up here for large folios. But I would still prefer to just pass the
data we have already read into this function rather than reading it again.
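For instance (a sketch only, also dropping the unused any_writable parameter
mentioned above):

	static inline int madvise_folio_pte_batch(unsigned long addr,
			unsigned long end, struct folio *folio, pte_t *ptep,
			pte_t pte, bool *any_young, bool *any_dirty)
	{
		const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
		int max_nr = (end - addr) / PAGE_SIZE;

		/* Reuse the pte value the caller already read in the main loop. */
		return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags,
				       NULL, any_young, any_dirty);
	}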
> + fpb_flags, any_writable, any_young, any_dirty);
> +}
> +
> +static inline bool madvise_pte_split_folio(struct mm_struct *mm, pmd_t *pmd,
> + unsigned long addr, struct folio *folio, pte_t **pte,
nit: I know 80 chars is a soft limit now (and I think 100 is a hard limit), but
try to be consistent. You could move the addr param to the previous line and be
within the 100 char limit. Personally I would just make the prototype fit in 80
chars (same goes for madvise_folio_pte_batch).
> + spinlock_t **ptl)
> +{
> + int err;
> +
> + if (!folio_trylock(folio))
> + return false;
> +
> + folio_get(folio);
> + pte_unmap_unlock(*pte, *ptl);
> + *pte = NULL;
nit: you don't need this since you are later unconditionally setting it again.
> + err = split_folio(folio);
> + folio_unlock(folio);
> + folio_put(folio);
> +
> + *pte = pte_offset_map_lock(mm, pmd, addr, ptl);
> +
> + return err == 0;
> +}
> +
> static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> unsigned long addr, unsigned long end,
> struct mm_walk *walk)
> @@ -456,40 +488,26 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> * next pte in the range.
> */
> if (folio_test_large(folio)) {
> - const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
> - FPB_IGNORE_SOFT_DIRTY;
> - int max_nr = (end - addr) / PAGE_SIZE;
> bool any_young;
> -
> - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
> - fpb_flags, NULL, &any_young);
> + nr = madvise_folio_pte_batch(addr, end, folio, pte,
> + NULL, &any_young, NULL);
> if (any_young)
> ptent = pte_mkyoung(ptent);
>
> if (nr < folio_nr_pages(folio)) {
> - int err;
> -
> if (folio_likely_mapped_shared(folio))
> continue;
> if (pageout_anon_only_filter && !folio_test_anon(folio))
> continue;
> - if (!folio_trylock(folio))
> - continue;
> - folio_get(folio);
> +
> arch_leave_lazy_mmu_mode();
> - pte_unmap_unlock(start_pte, ptl);
> - start_pte = NULL;
> - err = split_folio(folio);
> - folio_unlock(folio);
> - folio_put(folio);
> - if (err)
> - continue;
> - start_pte = pte =
> - pte_offset_map_lock(mm, pmd, addr, &ptl);
> + if (madvise_pte_split_folio(mm, pmd, addr,
> + folio, &start_pte, &ptl))
> + nr = 0;
> if (!start_pte)
> break;
> + pte = start_pte;
> arch_enter_lazy_mmu_mode();
> - nr = 0;
> continue;
This change fixes a bug I've introduced in my swap-out series. Nice. I tried to
fix it in v6, but looking at this, I've realised it's still broken. I've replied
against that series with the fix.
> }
> }
> @@ -688,72 +706,59 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> continue;
>
> /*
> - * If pmd isn't transhuge but the folio is large and
> - * is owned by only this process, split it and
> - * deactivate all pages.
> + * If we encounter a large folio, only split it if it is not
> + * fully mapped within the range we are operating on. Otherwise
> + * leave it as is so that it can be marked as lazyfree. If we
> + * fail to split a folio, leave it in place and advance to the
> + * next pte in the range.
> */
> if (folio_test_large(folio)) {
> - int err;
> + bool any_young, any_dirty;
> + nr = madvise_folio_pte_batch(addr, end, folio, pte,
> + NULL, &any_young, &any_dirty);
> + if (any_young || any_dirty)
> + ptent = pte_mkdirty(pte_mkyoung(ptent));
I don't think it makes any difference to how ptent is consumed below, but it's
probably more intuitive to separate these two operations:

	if (any_young)
		ptent = pte_mkyoung(ptent);
	if (any_dirty)
		ptent = pte_mkdirty(ptent);
>
> - if (folio_likely_mapped_shared(folio))
> - break;
> - if (!folio_trylock(folio))
> - break;
> - folio_get(folio);
> - arch_leave_lazy_mmu_mode();
> - pte_unmap_unlock(start_pte, ptl);
> - start_pte = NULL;
> - err = split_folio(folio);
> - folio_unlock(folio);
> - folio_put(folio);
> - if (err)
> - break;
> - start_pte = pte =
> - pte_offset_map_lock(mm, pmd, addr, &ptl);
> - if (!start_pte)
> - break;
> - arch_enter_lazy_mmu_mode();
> - pte--;
> - addr -= PAGE_SIZE;
> - continue;
> - }
> + if (nr < folio_nr_pages(folio)) {
> + if (folio_likely_mapped_shared(folio))
> + continue;
>
> - if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
> - if (!folio_trylock(folio))
> - continue;
> - /*
> - * If folio is shared with others, we mustn't clear
> - * the folio's dirty flag.
> - */
> - if (folio_mapcount(folio) != 1) {
> - folio_unlock(folio);
> + arch_leave_lazy_mmu_mode();
> + if (madvise_pte_split_folio(mm, pmd, addr,
> + folio, &start_pte, &ptl))
> + nr = 0;
> + if (!start_pte)
> + break;
> + pte = start_pte;
> + arch_enter_lazy_mmu_mode();
> continue;
> }
> + }
>
> + if (!folio_trylock(folio))
> + continue;
> + /*
> + * If we have a large folio at this point, we know it is fully mapped
> + * so if its mapcount is the same as its number of pages, it must be
> + * exclusive.
> + */
> + if (folio_mapcount(folio) != folio_nr_pages(folio)) {
> + folio_unlock(folio);
> + continue;
> + }
> + if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
I don't understand the rationale for reducing the scope of this conditional?
Previously it was used to avoid having to lock the folio if it wasn't in the
swapcache or if it wasn't dirty. So now you will be locking much more often.
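One possible shape that keeps the old lock-avoidance while staying batch-aware
(a sketch only):

	if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
		if (!folio_trylock(folio))
			continue;
		/*
		 * If we have a large folio at this point, we know it is
		 * fully mapped, so if its mapcount is the same as its
		 * number of pages, it must be exclusive.
		 */
		if (folio_mapcount(folio) != folio_nr_pages(folio)) {
			folio_unlock(folio);
			continue;
		}

		if (folio_test_swapcache(folio) &&
		    !folio_free_swap(folio)) {
			folio_unlock(folio);
			continue;
		}

		folio_clear_dirty(folio);
		folio_unlock(folio);
	}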
Thanks,
Ryan
> if (folio_test_swapcache(folio) &&
> !folio_free_swap(folio)) {
> folio_unlock(folio);
> continue;
> }
> -
> folio_clear_dirty(folio);
> - folio_unlock(folio);
> }
> + folio_unlock(folio);
>
> if (pte_young(ptent) || pte_dirty(ptent)) {
> - /*
> - * Some of architecture(ex, PPC) don't update TLB
> - * with set_pte_at and tlb_remove_tlb_entry so for
> - * the portability, remap the pte with old|clean
> - * after pte clearing.
> - */
> - ptent = ptep_get_and_clear_full(mm, addr, pte,
> - tlb->fullmm);
> -
> - ptent = pte_mkold(ptent);
> - ptent = pte_mkclean(ptent);
> - set_pte_at(mm, addr, pte, ptent);
> - tlb_remove_tlb_entry(tlb, pte, addr);
> + mkold_clean_ptes(vma, addr, pte, nr, tlb->fullmm);
> + tlb_remove_tlb_entries(tlb, pte, nr, addr);
> }
> folio_mark_lazyfree(folio);
> }
> diff --git a/mm/memory.c b/mm/memory.c
> index 912cd738ec03..24769ecb59e5 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -989,7 +989,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
> flags |= FPB_IGNORE_SOFT_DIRTY;
>
> nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
> - &any_writable, NULL);
> + &any_writable, NULL, NULL);
> folio_ref_add(folio, nr);
> if (folio_test_anon(folio)) {
> if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
> @@ -1559,7 +1559,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
> */
> if (unlikely(folio_test_large(folio) && max_nr != 1)) {
> nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
> - NULL, NULL);
> + NULL, NULL, NULL);
>
> zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
> addr, details, rss, force_flush,
Hey Ryan,
Thanks for taking the time to review!
On Thu, Apr 4, 2024 at 12:38 AM Ryan Roberts <[email protected]> wrote:
>
> On 02/04/2024 13:40, Lance Yang wrote:
> > Change the code that clears young and dirty bits from the PTEs to use
> > ptep_get_and_clear_full() and set_pte_at(), via the new mkold_clean_ptes()
> > batch helper function.
> >
> > Unfortunately, the per-pte get_and_clear/modify/set approach would result
> > in unfolding/refolding for contpte mappings on arm64. So we need to
> > override mkold_clean_ptes() for arm64 to avoid it.
> >
> > Suggested-by: David Hildenbrand <[email protected]>
> > Suggested-by: Barry Song <[email protected]>
> > Suggested-by: Ryan Roberts <[email protected]>
> > Signed-off-by: Lance Yang <[email protected]>
> > ---
> > arch/arm64/include/asm/pgtable.h | 36 ++++++++++++++++++++++++++++++++
> > arch/arm64/mm/contpte.c | 10 +++++++++
> > include/linux/pgtable.h | 30 ++++++++++++++++++++++++++
> > 3 files changed, 76 insertions(+)
>
> When I suggested splitting out this patch, this wasn't quite what I meant.
> Instead I intended for the first patch to make the MADV_FREE change and as part
> of that introduce mkold_clean_ptes() with a generic implementation. Then a
> second patch would do the arm64-specific specialization of mkold_clean_ptes()
> for an arm64-specific performance boost. In general it is often useful to
> separate core changes from arch changes where practical since they have
> different target audiences.
Thanks for clearing that up! I'll make sure to implement it as discussed.
>
>
> >
> > diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> > index 9fd8613b2db2..b032c107090c 100644
> > --- a/arch/arm64/include/asm/pgtable.h
> > +++ b/arch/arm64/include/asm/pgtable.h
> > @@ -1086,6 +1086,27 @@ static inline bool pud_user_accessible_page(pud_t pud)
> > }
> > #endif
> >
> > +static inline void __mkold_clean_pte(struct vm_area_struct *vma,
> > + unsigned long addr, pte_t *ptep)
> > +{
> > + pte_t old_pte, pte;
> > +
> > + pte = __ptep_get(ptep);
> > + do {
> > + old_pte = pte;
> > + pte = pte_mkclean(pte_mkold(pte));
> > + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
> > + pte_val(old_pte), pte_val(pte));
> > + } while (pte_val(pte) != pte_val(old_pte));
> > +}
>
> I'm not sure how valuable it is to split this out, given we are only exposing a
> batched version to the core. Perhaps just include the body in mkold_clean_ptes()?
I agree that including the body in mkold_clean_ptes() would be better.
>
> > +
> > +static inline void mkold_clean_ptes(struct vm_area_struct *vma,
> > + unsigned long addr, pte_t *ptep, unsigned int nr, int full)
> > +{
> > + for (; nr-- > 0; ptep++, addr += PAGE_SIZE)
> > + __mkold_clean_pte(vma, addr, ptep);
> > +}
> > +
> > /*
> > * Atomic pte/pmd modifications.
> > */
> > @@ -1379,6 +1400,8 @@ extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
> > extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
> > unsigned long addr, pte_t *ptep,
> > pte_t entry, int dirty);
> > +extern void contpte_ptep_mkold_clean(struct vm_area_struct *vma,
> > + unsigned long addr, pte_t *ptep);
>
> Let's have this follow the same naming pattern; contpte_mkold_clean_ptes()
Got it. Let’s keep the naming consistent with contpte_mkold_clean_ptes().
>
> >
> > static __always_inline void contpte_try_fold(struct mm_struct *mm,
> > unsigned long addr, pte_t *ptep, pte_t pte)
> > @@ -1603,6 +1626,18 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
> > return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty);
> > }
> >
> > +#define mkold_clean_ptes mkold_clean_ptes
> > +static inline void mkold_clean_ptes(struct vm_area_struct *vma,
> > + unsigned long addr, pte_t *ptep, unsigned int nr, int full)
> > +{
> > + pte_t orig_pte = __ptep_get(ptep);
> > +
> > + if (likely(!pte_valid_cont(orig_pte)))
> > + return __mkold_clean_ptes(vma, addr, ptep, nr, full);
> > +
> > +	return contpte_ptep_mkold_clean(vma, addr, ptep);
> > +}
>
> This function is totally broken as far as I can tell. You are assuming if the
> first pte is not cont, then the whole range is not cont; you can't assume that.
> Then if you decide it is cont, you ignore nr and only fixup a single contpte block.
>
> Take a look at wrprotect_ptes() or another one of the batched helpers and follow
> that pattern.
Sorry for the mistake. I'll take a closer look at wrprotect_ptes() and make
sure to follow that pattern.
>
> > +
> > #else /* CONFIG_ARM64_CONTPTE */
> >
> > #define ptep_get __ptep_get
> > @@ -1622,6 +1657,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
> > #define wrprotect_ptes __wrprotect_ptes
> > #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
> > #define ptep_set_access_flags __ptep_set_access_flags
> > +#define mkold_clean_ptes __mkold_clean_ptes
> >
> > #endif /* CONFIG_ARM64_CONTPTE */
> >
> > diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
> > index 1b64b4c3f8bf..560622cfb2a9 100644
> > --- a/arch/arm64/mm/contpte.c
> > +++ b/arch/arm64/mm/contpte.c
> > @@ -322,6 +322,16 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> > }
> > EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);
> >
> > +void contpte_ptep_mkold_clean(struct vm_area_struct *vma, unsigned long addr,
> > + pte_t *ptep)
> > +{
> > + ptep = contpte_align_down(ptep);
> > + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
> > +
> > + __mkold_clean_ptes(vma, addr, ptep, CONT_PTES, 0);
>
> As above, this is broken as is.
>
> > +}
> > +EXPORT_SYMBOL_GPL(contpte_ptep_mkold_clean);
> > +
> > int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
> > unsigned long addr, pte_t *ptep)
> > {
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index fa8f92f6e2d7..fd30779fe487 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -391,6 +391,36 @@ static inline void mkold_ptes(struct vm_area_struct *vma, unsigned long addr,
> > }
> > #endif
> >
> > +#ifndef mkold_clean_ptes
> > +/**
> > + * mkold_clean_ptes - Mark PTEs that map consecutive pages of the same folio
> > + * as old and clean.
> > + * @vma: VMA the pages are mapped into.
> > + * @addr: Address the first page is mapped at.
> > + * @ptep: Page table pointer for the first entry.
> > + * @nr: Number of entries to mark old and clean.
> > + *
> > + * May be overridden by the architecture; otherwise, implemented as a simple
> > + * loop over ptep_get_and_clear_full().
>
> How about "otherwise, implemented by get_and_clear/modify/set for each pte in
> the range."?
Nice, this is clearer and more accurate than before.
>
> > + *
> > + * Note that PTE bits in the PTE range besides the PFN can differ. For example,
> > + * some PTEs might be write-protected.
> > + *
> > + * Context: The caller holds the page table lock. The PTEs map consecutive
> > + * pages that belong to the same folio. The PTEs are all in the same PMD.
> > + */
> > +static inline void mkold_clean_ptes(struct vm_area_struct *vma,
> > + unsigned long addr, pte_t *ptep, unsigned int nr, int full)
>
> You've included the "full" param in the function but not in the docs. I don't
> think the full param is valuable though; it doesn't make sense to call this
> function when tearing down a process. So drop the parameter and just call
> ptep_get_and_clear().
Got it. I'll drop the "full" parameter.
>
> > +{
> > + pte_t pte;
> > +
> > + for (; nr-- > 0; ptep++, addr += PAGE_SIZE) {
>
> nit: This is using a different pattern to all the other batch helpers
> (set_ptes(), wrprotect_ptes(), clear_full_ptes()).
Thanks for pointing that out.
Thanks,
Lance
>
> > + pte = ptep_get_and_clear_full(vma->vm_mm, addr, ptep, full);
> > + set_pte_at(vma->vm_mm, addr, ptep, pte_mkclean(pte_mkold(pte)));
> > + }
> > +}
> > +#endif
> > +
> > #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
> > #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
> > static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
>
Hey Ryan,
Thanks for taking the time to review!
On Thu, Apr 4, 2024 at 1:35 AM Ryan Roberts <[email protected]> wrote:
>
> On 02/04/2024 13:40, Lance Yang wrote:
> > This patch optimizes lazyfreeing with PTE-mapped mTHP[1]
> > (Inspired by David Hildenbrand[2]). We aim to avoid unnecessary folio
> > splitting if the large folio is fully mapped within the target range.
> >
> > If a large folio is locked or shared, or if we fail to split it, we just
> > leave it in place and advance to the next PTE in the range. But note that
> > the behavior is changed; previously, any failure of this sort would cause
> > the entire operation to give up. As large folios become more common,
> > sticking to the old way could result in wasted opportunities.
> >
> > On an Intel I5 CPU, lazyfreeing a 1GiB VMA backed by PTE-mapped folios of
> > the same size results in the following runtimes for madvise(MADV_FREE) in
> > seconds (shorter is better):
> >
> > Folio Size | Old | New | Change
> > ------------------------------------------
> > 4KiB | 0.590251 | 0.590259 | 0%
> > 16KiB | 2.990447 | 0.185655 | -94%
> > 32KiB | 2.547831 | 0.104870 | -95%
> > 64KiB | 2.457796 | 0.052812 | -97%
> > 128KiB | 2.281034 | 0.032777 | -99%
> > 256KiB | 2.230387 | 0.017496 | -99%
> > 512KiB | 2.189106 | 0.010781 | -99%
> > 1024KiB | 2.183949 | 0.007753 | -99%
> > 2048KiB | 0.002799 | 0.002804 | 0%
>
> I'm guessing the reason that 2M is not showing any change is because it's
> PMD-mapped and splitting is already elided? If you were to force it to be
Your guess is correct. The lack of change in 2M is because it's PMD-mapped.
> PTE-mapped then you'll see the very impressive speed-up there too. Don't worry
> about doing that on my account though; these results are already sufficient IMHO.
>
> >
> > [1] https://lkml.kernel.org/r/[email protected]
> > [2] https://lore.kernel.org/linux-mm/[email protected]
> >
> > Signed-off-by: Lance Yang <[email protected]>
> > ---
> > mm/internal.h | 12 ++++-
> > mm/madvise.c | 147 ++++++++++++++++++++++++++------------------------
> > mm/memory.c | 4 +-
> > 3 files changed, 88 insertions(+), 75 deletions(-)
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 3df06a152ff0..cdc6e2162b30 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -132,6 +132,8 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
> > * first one is writable.
> > * @any_young: Optional pointer to indicate whether any entry except the
> > * first one is young.
> > + * @any_dirty: Optional pointer to indicate whether any entry except the
> > + * first one is dirty.
> > *
> > * Detect a PTE batch: consecutive (present) PTEs that map consecutive
> > * pages of the same large folio.
> > @@ -147,18 +149,20 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
> > */
> > static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> > pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
> > - bool *any_writable, bool *any_young)
> > + bool *any_writable, bool *any_young, bool *any_dirty)
> > {
> > unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
> > const pte_t *end_ptep = start_ptep + max_nr;
> > pte_t expected_pte, *ptep;
> > - bool writable, young;
> > + bool writable, young, dirty;
> > int nr;
> >
> > if (any_writable)
> > *any_writable = false;
> > if (any_young)
> > *any_young = false;
> > + if (any_dirty)
> > + *any_dirty = false;
> >
> > VM_WARN_ON_FOLIO(!pte_present(pte), folio);
> > VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
> > @@ -174,6 +178,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> > writable = !!pte_write(pte);
> > if (any_young)
> > young = !!pte_young(pte);
> > + if (any_dirty)
> > + dirty = !!pte_dirty(pte);
> > pte = __pte_batch_clear_ignored(pte, flags);
> >
> > if (!pte_same(pte, expected_pte))
> > @@ -191,6 +197,8 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
> > *any_writable |= writable;
> > if (any_young)
> > *any_young |= young;
> > + if (any_dirty)
> > + *any_dirty |= dirty;
> >
> > nr = pte_batch_hint(ptep, pte);
> > expected_pte = pte_advance_pfn(expected_pte, nr);
> > diff --git a/mm/madvise.c b/mm/madvise.c
> > index bd00b83e7c50..8197effd9f14 100644
> > --- a/mm/madvise.c
> > +++ b/mm/madvise.c
> > @@ -321,6 +321,38 @@ static inline bool can_do_file_pageout(struct vm_area_struct *vma)
> > file_permission(vma->vm_file, MAY_WRITE) == 0;
> > }
> >
> > +static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
> > + struct folio *folio, pte_t *pte,
> > + bool *any_writable, bool *any_young, bool *any_dirty)
>
> any_writable is always NULL. Do you need it?
Thanks for pointing that out.
It seems that the any_writable parameter is redundant here, so I'll drop it.
>
> > +{
> > + int max_nr = (end - addr) / PAGE_SIZE;
> > + const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
> > +
> > + return folio_pte_batch(folio, addr, pte, ptep_get(pte), max_nr,
>
> ptep_get() was problematic for performance of the order-0 folio case when we
> batched fork(). So we are deliberately passing around the value we already read
> in the main loop. Granted this case is not so performance critical because we
> only end up here for large folios. But I would still prefer to just pass the
> data we have already read into this function rather than reading it again.
Thanks for the explanation! I completely agree.
I'll pass the data we've already read into this function.
>
> > + fpb_flags, any_writable, any_young, any_dirty);
> > +}
> > +
> > +static inline bool madvise_pte_split_folio(struct mm_struct *mm, pmd_t *pmd,
> > + unsigned long addr, struct folio *folio, pte_t **pte,
>
> nit: I know 80 chars is a soft limit now (and I think 100 is a hard limit), but
> try to be consistent. You could move the addr param to the previous line and be
> within the 100 char limit. Personally I would just make the prototype fit in 80
> chars (same goes for madvise_folio_pte_batch).
Got it. Thanks.
>
> > + spinlock_t **ptl)
> > +{
> > + int err;
> > +
> > + if (!folio_trylock(folio))
> > + return false;
> > +
> > + folio_get(folio);
> > + pte_unmap_unlock(*pte, *ptl);
> > + *pte = NULL;
>
> nit: you don't need this since you are later unconditionally setting it again.
Nice. I'll remove it.
>
> > + err = split_folio(folio);
> > + folio_unlock(folio);
> > + folio_put(folio);
> > +
> > + *pte = pte_offset_map_lock(mm, pmd, addr, ptl);
> > +
> > + return err == 0;
> > +}
> > +
> > static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> > unsigned long addr, unsigned long end,
> > struct mm_walk *walk)
> > @@ -456,40 +488,26 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> > * next pte in the range.
> > */
> > if (folio_test_large(folio)) {
> > - const fpb_t fpb_flags = FPB_IGNORE_DIRTY |
> > - FPB_IGNORE_SOFT_DIRTY;
> > - int max_nr = (end - addr) / PAGE_SIZE;
> > bool any_young;
> > -
> > - nr = folio_pte_batch(folio, addr, pte, ptent, max_nr,
> > - fpb_flags, NULL, &any_young);
> > + nr = madvise_folio_pte_batch(addr, end, folio, pte,
> > + NULL, &any_young, NULL);
> > if (any_young)
> > ptent = pte_mkyoung(ptent);
> >
> > if (nr < folio_nr_pages(folio)) {
> > - int err;
> > -
> > if (folio_likely_mapped_shared(folio))
> > continue;
> > if (pageout_anon_only_filter && !folio_test_anon(folio))
> > continue;
> > - if (!folio_trylock(folio))
> > - continue;
> > - folio_get(folio);
> > +
> > arch_leave_lazy_mmu_mode();
> > - pte_unmap_unlock(start_pte, ptl);
> > - start_pte = NULL;
> > - err = split_folio(folio);
> > - folio_unlock(folio);
> > - folio_put(folio);
> > - if (err)
> > - continue;
> > - start_pte = pte =
> > - pte_offset_map_lock(mm, pmd, addr, &ptl);
> > + if (madvise_pte_split_folio(mm, pmd, addr,
> > + folio, &start_pte, &ptl))
> > + nr = 0;
> > if (!start_pte)
> > break;
> > + pte = start_pte;
> > arch_enter_lazy_mmu_mode();
> > - nr = 0;
> > continue;
>
> This change fixes a bug I've introduced in my swap-out series. Nice. I tried to
> fix it in v6, but looking at this, I've realised it's still broken. I've replied
> against that series with the fix.
>
>
> > }
> > }
> > @@ -688,72 +706,59 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
> > continue;
> >
> > /*
> > - * If pmd isn't transhuge but the folio is large and
> > - * is owned by only this process, split it and
> > - * deactivate all pages.
> > + * If we encounter a large folio, only split it if it is not
> > + * fully mapped within the range we are operating on. Otherwise
> > + * leave it as is so that it can be marked as lazyfree. If we
> > + * fail to split a folio, leave it in place and advance to the
> > + * next pte in the range.
> > */
> > if (folio_test_large(folio)) {
> > - int err;
> > + bool any_young, any_dirty;
> > + nr = madvise_folio_pte_batch(addr, end, folio, pte,
> > + NULL, &any_young, &any_dirty);
> > + if (any_young || any_dirty)
> > + ptent = pte_mkdirty(pte_mkyoung(ptent));
>
> I don't think it makes any difference to how ptent is consumed below, but it's
> probably more intuitive to separate these two operations:
>
> 	if (any_young)
> 		ptent = pte_mkyoung(ptent);
> 	if (any_dirty)
> 		ptent = pte_mkdirty(ptent);
I agree that it's more intuitive to separate these two operations. Thanks.
>
> >
> > - if (folio_likely_mapped_shared(folio))
> > - break;
> > - if (!folio_trylock(folio))
> > - break;
> > - folio_get(folio);
> > - arch_leave_lazy_mmu_mode();
> > - pte_unmap_unlock(start_pte, ptl);
> > - start_pte = NULL;
> > - err = split_folio(folio);
> > - folio_unlock(folio);
> > - folio_put(folio);
> > - if (err)
> > - break;
> > - start_pte = pte =
> > - pte_offset_map_lock(mm, pmd, addr, &ptl);
> > - if (!start_pte)
> > - break;
> > - arch_enter_lazy_mmu_mode();
> > - pte--;
> > - addr -= PAGE_SIZE;
> > - continue;
> > - }
> > + if (nr < folio_nr_pages(folio)) {
> > + if (folio_likely_mapped_shared(folio))
> > + continue;
> >
> > - if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
> > - if (!folio_trylock(folio))
> > - continue;
> > - /*
> > - * If folio is shared with others, we mustn't clear
> > - * the folio's dirty flag.
> > - */
> > - if (folio_mapcount(folio) != 1) {
> > - folio_unlock(folio);
> > + arch_leave_lazy_mmu_mode();
> > + if (madvise_pte_split_folio(mm, pmd, addr,
> > + folio, &start_pte, &ptl))
> > + nr = 0;
> > + if (!start_pte)
> > + break;
> > + pte = start_pte;
> > + arch_enter_lazy_mmu_mode();
> > continue;
> > }
> > + }
> >
> > + if (!folio_trylock(folio))
> > + continue;
> > + /*
> > + * If we have a large folio at this point, we know it is fully mapped
> > + * so if its mapcount is the same as its number of pages, it must be
> > + * exclusive.
> > + */
> > + if (folio_mapcount(folio) != folio_nr_pages(folio)) {
> > + folio_unlock(folio);
> > + continue;
> > + }
> > + if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
>
> I don't understand the rationale for reducing the scope of this conditional?
> Previously it was used to avoid having to lock the folio if it wasn't in the
> swapcache or if it wasn't dirty. So now you will be locking much more often.
You're right. I should keep the previous behavior of avoiding locking the folio
if it wasn't in the swapcache or if it wasn't dirty.
Thanks again for your time!
Lance
>
> Thanks,
> Ryan
>
> > if (folio_test_swapcache(folio) &&
> > !folio_free_swap(folio)) {
> > folio_unlock(folio);
> > continue;
> > }
> > -
> > folio_clear_dirty(folio);
> > - folio_unlock(folio);
> > }
> > + folio_unlock(folio);
> >
> > if (pte_young(ptent) || pte_dirty(ptent)) {
> > - /*
> > - * Some of architecture(ex, PPC) don't update TLB
> > - * with set_pte_at and tlb_remove_tlb_entry so for
> > - * the portability, remap the pte with old|clean
> > - * after pte clearing.
> > - */
> > - ptent = ptep_get_and_clear_full(mm, addr, pte,
> > - tlb->fullmm);
> > -
> > - ptent = pte_mkold(ptent);
> > - ptent = pte_mkclean(ptent);
> > - set_pte_at(mm, addr, pte, ptent);
> > - tlb_remove_tlb_entry(tlb, pte, addr);
> > + mkold_clean_ptes(vma, addr, pte, nr, tlb->fullmm);
> > + tlb_remove_tlb_entries(tlb, pte, nr, addr);
> > }
> > folio_mark_lazyfree(folio);
> > }
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 912cd738ec03..24769ecb59e5 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -989,7 +989,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
> > flags |= FPB_IGNORE_SOFT_DIRTY;
> >
> > nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
> > - &any_writable, NULL);
> > + &any_writable, NULL, NULL);
> > folio_ref_add(folio, nr);
> > if (folio_test_anon(folio)) {
> > if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
> > @@ -1559,7 +1559,7 @@ static inline int zap_present_ptes(struct mmu_gather *tlb,
> > */
> > if (unlikely(folio_test_large(folio) && max_nr != 1)) {
> > nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
> > - NULL, NULL);
> > + NULL, NULL, NULL);
> >
> > zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
> > addr, details, rss, force_flush,
>