LinuxLists.cc - [PATCH v3 00/15] Transparent Contiguous PTEs for User Mappings

[permalink] [raw]

Subject: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

Convert copy_pte_range() to copy a set of ptes in a batch. A given batch
maps a physically contiguous block of memory, all belonging to the same
folio. This will likely improve performance by a tiny amount due to
batching the folio reference count management and calling set_ptes()
rather than making individual calls to set_pte_at().

However, the primary motivation for this change is to reduce the number
of tlb maintenance operations that the arm64 backend has to perform
during fork, as it is about to add transparent support for the
"contiguous bit" in its ptes. By write-protecting the parent using the
new ptep_set_wrprotects() (note the 's' at the end) function, the
backend can avoid having to unfold contig ranges of PTEs, which is
expensive, when all ptes in the range are being write-protected.
Similarly, by using set_ptes() rather than set_pte_at() to set up ptes
in the child, the backend does not need to fold a contiguous range once
they are all populated - they can be initially populated as a contiguous
range in the first place.

This change addresses the core-mm refactoring only, and introduces
ptep_set_wrprotects() with a default implementation that calls
ptep_set_wrprotect() for each pte in the range. A separate change will
implement ptep_set_wrprotects() in the arm64 backend to realize the
performance improvement as part of the work to enable contpte mappings.

Signed-off-by: Ryan Roberts <[email protected]>
---
include/linux/pgtable.h | 13 +++
mm/memory.c | 195 ++++++++++++++++++++++++++++++----------
2 files changed, 162 insertions(+), 46 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index af7639c3b0a3..1c50f8a0fdde 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -622,6 +622,19 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
}
#endif

+#ifndef ptep_set_wrprotects
+struct mm_struct;
+static inline void ptep_set_wrprotects(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep,
+ unsigned int nr)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
+ ptep_set_wrprotect(mm, address, ptep);
+}
+#endif
+
/*
* On some architectures hardware does not set page access bit when accessing
* memory page, it is responsibility of software setting this bit. It brings
diff --git a/mm/memory.c b/mm/memory.c
index 1f18ed4a5497..8a87a488950c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -924,68 +924,162 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
return 0;
}

+static int folio_nr_pages_cont_mapped(struct folio *folio,
+ struct page *page, pte_t *pte,
+ unsigned long addr, unsigned long end,
+ pte_t ptent, bool enforce_uffd_wp,
+ int *dirty_nr, int *writable_nr)
+{
+ int floops;
+ int i;
+ unsigned long pfn;
+ bool prot_none;
+ bool uffd_wp;
+
+ if (!folio_test_large(folio))
+ return 1;
+
+ /*
+ * Loop either to `end` or to end of folio if its contiguously mapped,
+ * whichever is smaller.
+ */
+ floops = (end - addr) >> PAGE_SHIFT;
+ floops = min_t(int, floops,
+ folio_pfn(folio_next(folio)) - page_to_pfn(page));
+
+ pfn = page_to_pfn(page);
+ prot_none = pte_protnone(ptent);
+ uffd_wp = pte_uffd_wp(ptent);
+
+ *dirty_nr = !!pte_dirty(ptent);
+ *writable_nr = !!pte_write(ptent);
+
+ pfn++;
+ pte++;
+
+ for (i = 1; i < floops; i++) {
+ ptent = ptep_get(pte);
+
+ if (!pte_present(ptent) || pte_pfn(ptent) != pfn ||
+ prot_none != pte_protnone(ptent) ||
+ (enforce_uffd_wp && uffd_wp != pte_uffd_wp(ptent)))
+ break;
+
+ if (pte_dirty(ptent))
+ (*dirty_nr)++;
+ if (pte_write(ptent))
+ (*writable_nr)++;
+
+ pfn++;
+ pte++;
+ }
+
+ return i;
+}
+
/*
- * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
- * is required to copy this pte.
+ * Copy set of contiguous ptes. Returns number of ptes copied if succeeded
+ * (always gte 1), or -EAGAIN if one preallocated page is required to copy the
+ * first pte.
*/
static inline int
-copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
- pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
- struct folio **prealloc)
+copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+ pte_t *dst_pte, pte_t *src_pte,
+ unsigned long addr, unsigned long end,
+ int *rss, struct folio **prealloc)
{
struct mm_struct *src_mm = src_vma->vm_mm;
unsigned long vm_flags = src_vma->vm_flags;
pte_t pte = ptep_get(src_pte);
struct page *page;
struct folio *folio;
+ int nr = 1;
+ bool anon = false;
+ bool enforce_uffd_wp = userfaultfd_wp(dst_vma);
+ int nr_dirty = !!pte_dirty(pte);
+ int nr_writable = !!pte_write(pte);
+ int i, ret;

page = vm_normal_page(src_vma, addr, pte);
- if (page)
+ if (page) {
folio = page_folio(page);
- if (page && folio_test_anon(folio)) {
- /*
- * If this page may have been pinned by the parent process,
- * copy the page immediately for the child so that we'll always
- * guarantee the pinned page won't be randomly replaced in the
- * future.
- */
- folio_get(folio);
- if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
- /* Page may be pinned, we have to copy. */
- folio_put(folio);
- return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, prealloc, page);
+ anon = folio_test_anon(folio);
+ nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
+ pte, enforce_uffd_wp, &nr_dirty,
+ &nr_writable);
+ folio_ref_add(folio, nr);
+
+ for (i = 0; i < nr; i++, page++) {
+ if (anon) {
+ /*
+ * If this page may have been pinned by the
+ * parent process, copy the page immediately for
+ * the child so that we'll always guarantee the
+ * pinned page won't be randomly replaced in the
+ * future.
+ */
+ if (unlikely(page_try_dup_anon_rmap(
+ page, false, src_vma))) {
+ if (i != 0)
+ break;
+ /* Page may be pinned, we have to copy. */
+ folio_ref_sub(folio, nr);
+ ret = copy_present_page(
+ dst_vma, src_vma, dst_pte,
+ src_pte, addr, rss, prealloc,
+ page);
+ return ret == 0 ? 1 : ret;
+ }
+ rss[MM_ANONPAGES]++;
+ VM_BUG_ON(PageAnonExclusive(page));
+ } else {
+ page_dup_file_rmap(page, false);
+ rss[mm_counter_file(page)]++;
+ }
}
- rss[MM_ANONPAGES]++;
- } else if (page) {
- folio_get(folio);
- page_dup_file_rmap(page, false);
- rss[mm_counter_file(page)]++;
- }

- /*
- * If it's a COW mapping, write protect it both
- * in the parent and the child
- */
- if (is_cow_mapping(vm_flags) && pte_write(pte)) {
- ptep_set_wrprotect(src_mm, addr, src_pte);
- pte = pte_wrprotect(pte);
+ if (i < nr) {
+ folio_ref_sub(folio, nr - i);
+ nr = i;
+ }
}
- VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));

/*
- * If it's a shared mapping, mark it clean in
- * the child
+ * If it's a shared mapping, mark it clean and write protected in the
+ * child, and rely on a write fault to fix up the permissions. This
+ * allows determining batch size without having to consider RO/RW
+ * permissions. As an optimization, skip wrprotect if all ptes in the
+ * batch have the same permissions.
+ *
+ * If its a private (CoW) mapping, mark it dirty in the child if _any_
+ * of the parent mappings in the block were marked dirty. The contiguous
+ * block of mappings are all backed by the same folio, so if any are
+ * dirty then the whole folio is dirty. This allows determining batch
+ * size without having to consider the dirty bit. Further, write protect
+ * it both in the parent and the child so that a future write will cause
+ * a CoW. As as an optimization, skip the wrprotect if all the ptes in
+ * the batch are already readonly.
*/
- if (vm_flags & VM_SHARED)
+ if (vm_flags & VM_SHARED) {
pte = pte_mkclean(pte);
- pte = pte_mkold(pte);
+ if (nr_writable > 0 && nr_writable < nr)
+ pte = pte_wrprotect(pte);
+ } else {
+ if (nr_dirty)
+ pte = pte_mkdirty(pte);
+ if (nr_writable) {
+ ptep_set_wrprotects(src_mm, addr, src_pte, nr);
+ pte = pte_wrprotect(pte);
+ }
+ }

- if (!userfaultfd_wp(dst_vma))
+ pte = pte_mkold(pte);
+ pte = pte_clear_soft_dirty(pte);
+ if (!enforce_uffd_wp)
pte = pte_clear_uffd_wp(pte);

- set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
- return 0;
+ set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
+ return nr;
}

static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
@@ -1021,6 +1115,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
struct folio *prealloc = NULL;
+ int nr_ptes;

again:
progress = 0;
@@ -1051,6 +1146,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
arch_enter_lazy_mmu_mode();

do {
+ nr_ptes = 1;
+
/*
* We are holding two locks at this point - either of them
* could generate latencies in another task on another CPU.
@@ -1086,16 +1183,21 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
* the now present pte.
*/
WARN_ON_ONCE(ret != -ENOENT);
+ ret = 0;
}
- /* copy_present_pte() will clear `*prealloc' if consumed */
- ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, &prealloc);
+ /* copy_present_ptes() will clear `*prealloc' if consumed */
+ nr_ptes = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
+ addr, end, rss, &prealloc);
+
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
*/
- if (unlikely(ret == -EAGAIN))
+ if (unlikely(nr_ptes == -EAGAIN)) {
+ ret = -EAGAIN;
break;
+ }
+
if (unlikely(prealloc)) {
/*
* pre-alloc page cannot be reused by next time so as
@@ -1106,8 +1208,9 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
folio_put(prealloc);
prealloc = NULL;
}
- progress += 8;
- } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+ progress += 8 * nr_ptes;
+ } while (dst_pte += nr_ptes, src_pte += nr_ptes,
+ addr += PAGE_SIZE * nr_ptes, addr != end);

arch_leave_lazy_mmu_mode();
pte_unmap_unlock(orig_src_pte, src_ptl);
--
2.25.1

2023-12-04 10:56:04

[permalink] [raw]

Subject: [PATCH v3 03/15] arm64/mm: set_pte(): New layer to manage contig bit

2023-12-04 10:56:20

[permalink] [raw]

Subject: [PATCH v3 02/15] mm: Batch-clear PTE ranges during zap_pte_range()

Convert zap_pte_range() to clear a set of ptes in a batch. A given batch
maps a physically contiguous block of memory, all belonging to the same
folio. This will likely improve performance by a tiny amount due to
removing duplicate calls to mark the folio dirty and accessed. And also
provides us with a future opportunity to batch the rmap removal.

However, the primary motivation for this change is to reduce the number
of tlb maintenance operations that the arm64 backend has to perform
during exit and other syscalls that cause zap_pte_range() (e.g. munmap,
madvise(DONTNEED), etc.), as it is about to add transparent support for
the "contiguous bit" in its ptes. By clearing ptes using the new
clear_ptes() API, the backend doesn't have to perform an expensive
unfold operation when a PTE being cleared is part of a contpte block.
Instead it can just clear the whole block immediately.

This change addresses the core-mm refactoring only, and introduces
clear_ptes() with a default implementation that calls
ptep_get_and_clear_full() for each pte in the range. Note that this API
returns the pte at the beginning of the batch, but with the dirty and
young bits set if ANY of the ptes in the cleared batch had those bits
set; this information is applied to the folio by the core-mm. Given the
batch is garranteed to cover only a single folio, collapsing this state
does not lose any useful information.

A separate change will implement clear_ptes() in the arm64 backend to
realize the performance improvement as part of the work to enable
contpte mappings.

Signed-off-by: Ryan Roberts <[email protected]>
---
include/asm-generic/tlb.h | 9 ++++++
include/linux/pgtable.h | 26 ++++++++++++++++
mm/memory.c | 63 ++++++++++++++++++++++++++-------------
mm/mmu_gather.c | 14 +++++++++
4 files changed, 92 insertions(+), 20 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 129a3a759976..b84ba3aa1f6e 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -75,6 +75,9 @@
* boolean indicating if the queue is (now) full and a call to
* tlb_flush_mmu() is required.
*
+ * tlb_get_guaranteed_space() returns the minimum garanteed number of pages
+ * that can be queued without overflow.
+ *
* tlb_remove_page() and tlb_remove_page_size() imply the call to
* tlb_flush_mmu() when required and has no return value.
*
@@ -263,6 +266,7 @@ struct mmu_gather_batch {
extern bool __tlb_remove_page_size(struct mmu_gather *tlb,
struct encoded_page *page,
int page_size);
+extern unsigned int tlb_get_guaranteed_space(struct mmu_gather *tlb);

#ifdef CONFIG_SMP
/*
@@ -273,6 +277,11 @@ extern bool __tlb_remove_page_size(struct mmu_gather *tlb,
extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma);
#endif

+#else
+static inline unsigned int tlb_get_guaranteed_space(struct mmu_gather *tlb)
+{
+ return 1;
+}
#endif

/*
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 1c50f8a0fdde..e998080eb7ae 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -635,6 +635,32 @@ static inline void ptep_set_wrprotects(struct mm_struct *mm,
}
#endif

+#ifndef clear_ptes
+struct mm_struct;
+static inline pte_t clear_ptes(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep,
+ int full, unsigned int nr)
+{
+ unsigned int i;
+ pte_t pte;
+ pte_t orig_pte = ptep_get_and_clear_full(mm, address, ptep, full);
+
+ for (i = 1; i < nr; i++) {
+ address += PAGE_SIZE;
+ ptep++;
+ pte = ptep_get_and_clear_full(mm, address, ptep, full);
+
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+
+ return orig_pte;
+}
+#endif
+
/*
* On some architectures hardware does not set page access bit when accessing
* memory page, it is responsibility of software setting this bit. It brings
diff --git a/mm/memory.c b/mm/memory.c
index 8a87a488950c..60f030700a3f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1515,6 +1515,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
pte_t *start_pte;
pte_t *pte;
swp_entry_t entry;
+ int nr;

tlb_change_page_size(tlb, PAGE_SIZE);
init_rss_vec(rss);
@@ -1527,6 +1528,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
do {
pte_t ptent = ptep_get(pte);
struct page *page;
+ int i;
+
+ nr = 1;

if (pte_none(ptent))
continue;
@@ -1535,45 +1539,64 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
break;

if (pte_present(ptent)) {
- unsigned int delay_rmap;
+ unsigned int delay_rmap = 0;
+ bool tlb_full = false;
+ struct folio *folio = NULL;

page = vm_normal_page(vma, addr, ptent);
if (unlikely(!should_zap_page(details, page)))
continue;
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
+
+ if (likely(page)) {
+ folio = page_folio(page);
+ nr = folio_nr_pages_cont_mapped(folio, page,
+ pte, addr, end,
+ ptent, true, &i, &i);
+ nr = min_t(int, nr, tlb_get_guaranteed_space(tlb));
+ }
+
+ ptent = clear_ptes(mm, addr, pte, tlb->fullmm, nr);
arch_check_zapped_pte(vma, ptent);
- tlb_remove_tlb_entry(tlb, pte, addr);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details,
- ptent);
+
+ for (i = 0; i < nr; i++) {
+ unsigned long subaddr = addr + PAGE_SIZE * i;
+
+ tlb_remove_tlb_entry(tlb, &pte[i], subaddr);
+ zap_install_uffd_wp_if_needed(vma, subaddr,
+ &pte[i], details, ptent);
+ }
if (unlikely(!page)) {
ksm_might_unmap_zero_page(mm, ptent);
continue;
}

- delay_rmap = 0;
- if (!PageAnon(page)) {
+ if (!folio_test_anon(folio)) {
if (pte_dirty(ptent)) {
- set_page_dirty(page);
+ folio_mark_dirty(folio);
if (tlb_delay_rmap(tlb)) {
delay_rmap = 1;
force_flush = 1;
}
}
if (pte_young(ptent) && likely(vma_has_recency(vma)))
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
}
- rss[mm_counter(page)]--;
- if (!delay_rmap) {
- page_remove_rmap(page, vma, false);
- if (unlikely(page_mapcount(page) < 0))
- print_bad_pte(vma, addr, ptent, page);
+ for (i = 0; i < nr; i++, page++) {
+ rss[mm_counter(page)]--;
+ if (!delay_rmap) {
+ page_remove_rmap(page, vma, false);
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ }
+ if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
+ tlb_full = true;
+ force_flush = 1;
+ addr += PAGE_SIZE * (i + 1);
+ break;
+ }
}
- if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
- force_flush = 1;
- addr += PAGE_SIZE;
+ if (unlikely(tlb_full))
break;
- }
continue;
}

@@ -1624,7 +1647,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
}
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
- } while (pte++, addr += PAGE_SIZE, addr != end);
+ } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);

add_mm_rss_vec(mm, rss);
arch_leave_lazy_mmu_mode();
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 4f559f4ddd21..57b4d5f0dfa4 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -47,6 +47,20 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
return true;
}

+unsigned int tlb_get_guaranteed_space(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch = tlb->active;
+ unsigned int nr_next = 0;
+
+ /* Allocate next batch so we can guarrantee at least one batch. */
+ if (tlb_next_batch(tlb)) {
+ tlb->active = batch;
+ nr_next = batch->next->max;
+ }
+
+ return batch->max - batch->nr + nr_next;
+}
+
#ifdef CONFIG_SMP
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
--
2.25.1

2023-12-04 10:56:29

[permalink] [raw]

Subject: [PATCH v3 05/15] arm64/mm: pte_clear(): New layer to manage contig bit

2023-12-04 10:56:43

[permalink] [raw]

Subject: [PATCH v3 12/15] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

Split __flush_tlb_range() into __flush_tlb_range_nosync() +
__flush_tlb_range(), in the same way as the existing flush_tlb_page()
arrangement. This allows calling __flush_tlb_range_nosync() to elide the
trailing DSB. Forthcoming "contpte" code will take advantage of this
when clearing the young bit from a contiguous range of ptes.

Signed-off-by: Ryan Roberts <[email protected]>
---
arch/arm64/include/asm/tlbflush.h | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index bb2c2833a987..925ef3bdf9ed 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -399,7 +399,7 @@ do { \
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
__flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false)

-static inline void __flush_tlb_range(struct vm_area_struct *vma,
+static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long stride, bool last_level,
int tlb_level)
@@ -431,10 +431,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
else
__flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true);

- dsb(ish);
mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
}

+static inline void __flush_tlb_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long stride, bool last_level,
+ int tlb_level)
+{
+ __flush_tlb_range_nosync(vma, start, end, stride,
+ last_level, tlb_level);
+ dsb(ish);
+}
+
static inline void flush_tlb_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end)
{
--
2.25.1

2023-12-04 10:56:47

[permalink] [raw]

Subject: [PATCH v3 11/15] arm64/mm: ptep_get(): New layer to manage contig bit

Create a new layer for the in-table PTE manipulation APIs. For now, The
existing API is prefixed with double underscore to become the
arch-private API and the public API is just a simple wrapper that calls
the private API.

The public API implementation will subsequently be used to transparently
manipulate the contiguous bit where appropriate. But since there are
already some contig-aware users (e.g. hugetlb, kernel mapper), we must
first ensure those users use the private API directly so that the future
contig-bit manipulations in the public API do not interfere with those
existing uses.

arm64 did not previously define an arch-specific ptep_get(), so override
the default version in the arch code, and also define the private
__ptep_get() version. Currently they both do the same thing that the
default version does (READ_ONCE()). Some arch users (hugetlb) were
already using ptep_get() so convert those to the private API. While
other callsites were doing direct READ_ONCE(), so convert those to use
the appropriate (public/private) API too.

Signed-off-by: Ryan Roberts <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 12 +++++++++---
arch/arm64/kernel/efi.c | 2 +-
arch/arm64/mm/fault.c | 4 ++--
arch/arm64/mm/hugetlbpage.c | 18 +++++++++---------
arch/arm64/mm/kasan_init.c | 2 +-
arch/arm64/mm/mmu.c | 12 ++++++------
arch/arm64/mm/pageattr.c | 4 ++--
arch/arm64/mm/trans_pgd.c | 2 +-
8 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 85010c2d4dfa..6930c14f062f 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -276,6 +276,11 @@ static inline void __set_pte(pte_t *ptep, pte_t pte)
}
}

+static inline pte_t __ptep_get(pte_t *ptep)
+{
+ return READ_ONCE(*ptep);
+}
+
extern void __sync_icache_dcache(pte_t pteval);
bool pgattr_change_is_safe(u64 old, u64 new);

@@ -303,7 +308,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
if (!IS_ENABLED(CONFIG_DEBUG_VM))
return;

- old_pte = READ_ONCE(*ptep);
+ old_pte = __ptep_get(ptep);

if (!pte_valid(old_pte) || !pte_valid(pte))
return;
@@ -893,7 +898,7 @@ static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma,
{
pte_t old_pte, pte;

- pte = READ_ONCE(*ptep);
+ pte = __ptep_get(ptep);
do {
old_pte = pte;
pte = pte_mkold(pte);
@@ -966,7 +971,7 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm,
{
pte_t old_pte, pte;

- pte = READ_ONCE(*ptep);
+ pte = __ptep_get(ptep);
do {
old_pte = pte;
pte = pte_wrprotect(pte);
@@ -1111,6 +1116,7 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t old_pte, pte_t new_pte);

+#define ptep_get __ptep_get
#define set_pte __set_pte
#define set_ptes __set_ptes
#define pte_clear __pte_clear
diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c
index 44288a12fc6c..9afcc690fe73 100644
--- a/arch/arm64/kernel/efi.c
+++ b/arch/arm64/kernel/efi.c
@@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data)
{
struct set_perm_data *spd = data;
const efi_memory_desc_t *md = spd->md;
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = __ptep_get(ptep);

if (md->attribute & EFI_MEMORY_RO)
pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 7cebd9847aae..d63f3a0a7251 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -191,7 +191,7 @@ static void show_pte(unsigned long addr)
if (!ptep)
break;

- pte = READ_ONCE(*ptep);
+ pte = __ptep_get(ptep);
pr_cont(", pte=%016llx", pte_val(pte));
pte_unmap(ptep);
} while(0);
@@ -214,7 +214,7 @@ int __ptep_set_access_flags(struct vm_area_struct *vma,
pte_t entry, int dirty)
{
pteval_t old_pteval, pteval;
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = __ptep_get(ptep);

if (pte_same(pte, entry))
return 0;
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 627a9717e98c..52fb767607e0 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -152,14 +152,14 @@ pte_t huge_ptep_get(pte_t *ptep)
{
int ncontig, i;
size_t pgsize;
- pte_t orig_pte = ptep_get(ptep);
+ pte_t orig_pte = __ptep_get(ptep);

if (!pte_present(orig_pte) || !pte_cont(orig_pte))
return orig_pte;

ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize);
for (i = 0; i < ncontig; i++, ptep++) {
- pte_t pte = ptep_get(ptep);
+ pte_t pte = __ptep_get(ptep);

if (pte_dirty(pte))
orig_pte = pte_mkdirty(orig_pte);
@@ -184,7 +184,7 @@ static pte_t get_clear_contig(struct mm_struct *mm,
unsigned long pgsize,
unsigned long ncontig)
{
- pte_t orig_pte = ptep_get(ptep);
+ pte_t orig_pte = __ptep_get(ptep);
unsigned long i;

for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) {
@@ -408,7 +408,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
{
int ncontig;
size_t pgsize;
- pte_t orig_pte = ptep_get(ptep);
+ pte_t orig_pte = __ptep_get(ptep);

if (!pte_cont(orig_pte))
return __ptep_get_and_clear(mm, addr, ptep);
@@ -431,11 +431,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
{
int i;

- if (pte_write(pte) != pte_write(ptep_get(ptep)))
+ if (pte_write(pte) != pte_write(__ptep_get(ptep)))
return 1;

for (i = 0; i < ncontig; i++) {
- pte_t orig_pte = ptep_get(ptep + i);
+ pte_t orig_pte = __ptep_get(ptep + i);

if (pte_dirty(pte) != pte_dirty(orig_pte))
return 1;
@@ -492,7 +492,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
size_t pgsize;
pte_t pte;

- if (!pte_cont(READ_ONCE(*ptep))) {
+ if (!pte_cont(__ptep_get(ptep))) {
__ptep_set_wrprotect(mm, addr, ptep);
return;
}
@@ -517,7 +517,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
size_t pgsize;
int ncontig;

- if (!pte_cont(READ_ONCE(*ptep)))
+ if (!pte_cont(__ptep_get(ptep)))
return ptep_clear_flush(vma, addr, ptep);

ncontig = find_num_contig(mm, addr, ptep, &pgsize);
@@ -550,7 +550,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr
* when the permission changes from executable to non-executable
* in cases where cpu is affected with errata #2645198.
*/
- if (pte_user_exec(READ_ONCE(*ptep)))
+ if (pte_user_exec(__ptep_get(ptep)))
return huge_ptep_clear_flush(vma, addr, ptep);
}
return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index 5eade712e9e5..5274c317d775 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -113,7 +113,7 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE);
next = addr + PAGE_SIZE;
__set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
- } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep)));
+ } while (ptep++, addr = next, addr != end && pte_none(__ptep_get(ptep)));
}

static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr,
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 080e9b50f595..784f1e312447 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -176,7 +176,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,

ptep = pte_set_fixmap_offset(pmdp, addr);
do {
- pte_t old_pte = READ_ONCE(*ptep);
+ pte_t old_pte = __ptep_get(ptep);

__set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot));

@@ -185,7 +185,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end,
* only allow updates to the permission attributes.
*/
BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
- READ_ONCE(pte_val(*ptep))));
+ pte_val(__ptep_get(ptep))));

phys += PAGE_SIZE;
} while (ptep++, addr += PAGE_SIZE, addr != end);
@@ -854,7 +854,7 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,

do {
ptep = pte_offset_kernel(pmdp, addr);
- pte = READ_ONCE(*ptep);
+ pte = __ptep_get(ptep);
if (pte_none(pte))
continue;

@@ -987,7 +987,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,

do {
ptep = pte_offset_kernel(pmdp, addr);
- pte = READ_ONCE(*ptep);
+ pte = __ptep_get(ptep);

/*
* This is just a sanity check here which verifies that
@@ -1006,7 +1006,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
*/
ptep = pte_offset_kernel(pmdp, 0UL);
for (i = 0; i < PTRS_PER_PTE; i++) {
- if (!pte_none(READ_ONCE(ptep[i])))
+ if (!pte_none(__ptep_get(&ptep[i])))
return;
}

@@ -1475,7 +1475,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte
* when the permission changes from executable to non-executable
* in cases where cpu is affected with errata #2645198.
*/
- if (pte_user_exec(READ_ONCE(*ptep)))
+ if (pte_user_exec(ptep_get(ptep)))
return ptep_clear_flush(vma, addr, ptep);
}
return ptep_get_and_clear(vma->vm_mm, addr, ptep);
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 057097acf9e0..624b0b0982e3 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -36,7 +36,7 @@ bool can_set_direct_map(void)
static int change_page_range(pte_t *ptep, unsigned long addr, void *data)
{
struct page_change_data *cdata = data;
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = __ptep_get(ptep);

pte = clear_pte_bit(pte, cdata->clear_mask);
pte = set_pte_bit(pte, cdata->set_mask);
@@ -246,5 +246,5 @@ bool kernel_page_present(struct page *page)
return true;

ptep = pte_offset_kernel(pmdp, addr);
- return pte_valid(READ_ONCE(*ptep));
+ return pte_valid(__ptep_get(ptep));
}
diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c
index 230b607cf881..5139a28130c0 100644
--- a/arch/arm64/mm/trans_pgd.c
+++ b/arch/arm64/mm/trans_pgd.c
@@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info)

static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr)
{
- pte_t pte = READ_ONCE(*src_ptep);
+ pte_t pte = __ptep_get(src_ptep);

if (pte_valid(pte)) {
/*
--
2.25.1

2023-12-04 10:56:56

[permalink] [raw]

Subject: [PATCH v3 13/15] arm64/mm: Wire up PTE_CONT for user mappings

With the ptep API sufficiently refactored, we can now introduce a new
"contpte" API layer, which transparently manages the PTE_CONT bit for
user mappings. Whenever it detects a set of PTEs that meet the
requirements for a contiguous range, the PTEs are re-painted with the
PTE_CONT bit. Use of contpte mappings is intended to be transparent to
the core-mm, which continues to interact with individual ptes.

Since a contpte block only has a single access and dirty bit, the
semantic here changes slightly; when getting a pte (e.g. ptep_get())
that is part of a contpte mapping, the access and dirty information are
pulled from the block (so all ptes in the block return the same
access/dirty info). When changing the access/dirty info on a pte (e.g.
ptep_set_access_flags()) that is part of a contpte mapping, this change
will affect the whole contpte block. This is works fine in practice
since we guarrantee that only a single folio is mapped by a contpte
block, and the core-mm tracks access/dirty information per folio.

This initial change provides a baseline that can be optimized in future
commits. That said, fold/unfold operations (which imply tlb
invalidation) are avoided where possible with a few tricks for
access/dirty bit management. Write-protect modifications for contpte
mappings are currently non-optimal, and incure a regression in fork()
performance. This will be addressed in follow-up changes.

In order for the public functions, which used to be pure inline, to
continue to be callable by modules, export all the contpte_* symbols
that are now called by those public inline functions.

The feature is enabled/disabled with the ARM64_CONTPTE Kconfig parameter
at build time. It defaults to enabled as long as its dependency,
TRANSPARENT_HUGEPAGE is also enabled. The core-mm depends upon
TRANSPARENT_HUGEPAGE to be able to allocate large folios, so if its not
enabled, then there is no chance of meeting the physical contiguity
requirement for contpte mappings.

Signed-off-by: Ryan Roberts <[email protected]>
---
arch/arm64/Kconfig | 10 +-
arch/arm64/include/asm/pgtable.h | 202 ++++++++++++++++++
arch/arm64/mm/Makefile | 1 +
arch/arm64/mm/contpte.c | 352 +++++++++++++++++++++++++++++++
4 files changed, 564 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/mm/contpte.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b071a00425d..de76e484ff3a 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2209,6 +2209,15 @@ config UNWIND_PATCH_PAC_INTO_SCS
select UNWIND_TABLES
select DYNAMIC_SCS

+config ARM64_CONTPTE
+ bool "Contiguous PTE mappings for user memory" if EXPERT
+ depends on TRANSPARENT_HUGEPAGE
+ default y
+ help
+ When enabled, user mappings are configured using the PTE contiguous
+ bit, for any mappings that meet the size and alignment requirements.
+ This reduces TLB pressure and improves performance.
+
endmenu # "Kernel Features"

menu "Boot options"
@@ -2318,4 +2327,3 @@ endmenu # "CPU Power Management"
source "drivers/acpi/Kconfig"

source "arch/arm64/kvm/Kconfig"
-
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 6930c14f062f..15bc9cf1eef4 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -133,6 +133,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
*/
#define pte_valid_not_user(pte) \
((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN))
+/*
+ * Returns true if the pte is valid and has the contiguous bit set.
+ */
+#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte))
/*
* Could the pte be present in the TLB? We must check mm_tlb_flush_pending
* so that we don't erroneously return false for pages that have been
@@ -1116,6 +1120,202 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t old_pte, pte_t new_pte);

+#ifdef CONFIG_ARM64_CONTPTE
+
+/*
+ * The contpte APIs are used to transparently manage the contiguous bit in ptes
+ * where it is possible and makes sense to do so. The PTE_CONT bit is considered
+ * a private implementation detail of the public ptep API (see below).
+ */
+extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte);
+extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte);
+extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep);
+extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr);
+extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
+extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty);
+
+static inline pte_t *contpte_align_down(pte_t *ptep)
+{
+ return (pte_t *)(ALIGN_DOWN((unsigned long)ptep >> 3, CONT_PTES) << 3);
+}
+
+static inline bool contpte_is_enabled(struct mm_struct *mm)
+{
+ /*
+ * Don't attempt to apply the contig bit to kernel mappings, because
+ * dynamically adding/removing the contig bit can cause page faults.
+ * These racing faults are ok for user space, since they get serialized
+ * on the PTL. But kernel mappings can't tolerate faults.
+ */
+
+ return mm != &init_mm;
+}
+
+static inline void contpte_try_fold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ /*
+ * Only bother trying if both the virtual and physical addresses are
+ * aligned and correspond to the last entry in a contig range. The core
+ * code mostly modifies ranges from low to high, so this is the likely
+ * the last modification in the contig range, so a good time to fold.
+ * We can't fold special mappings, because there is no associated folio.
+ */
+
+ bool valign = ((unsigned long)ptep >> 3) % CONT_PTES == CONT_PTES - 1;
+ bool palign = pte_pfn(pte) % CONT_PTES == CONT_PTES - 1;
+
+ if (contpte_is_enabled(mm) && valign && palign &&
+ pte_valid(pte) && !pte_cont(pte) && !pte_special(pte))
+ __contpte_try_fold(mm, addr, ptep, pte);
+}
+
+static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ if (contpte_is_enabled(mm) && pte_valid_cont(pte))
+ __contpte_try_unfold(mm, addr, ptep, pte);
+}
+
+/*
+ * The below functions constitute the public API that arm64 presents to the
+ * core-mm to manipulate PTE entries within the their page tables (or at least
+ * this is the subset of the API that arm64 needs to implement). These public
+ * versions will automatically and transparently apply the contiguous bit where
+ * it makes sense to do so. Therefore any users that are contig-aware (e.g.
+ * hugetlb, kernel mapper) should NOT use these APIs, but instead use the
+ * private versions, which are prefixed with double underscore. All of these
+ * APIs except for ptep_get_lockless() are expected to be called with the PTL
+ * held.
+ */
+
+#define ptep_get ptep_get
+static inline pte_t ptep_get(pte_t *ptep)
+{
+ pte_t pte = __ptep_get(ptep);
+
+ if (!pte_valid_cont(pte))
+ return pte;
+
+ return contpte_ptep_get(ptep, pte);
+}
+
+#define ptep_get_lockless ptep_get_lockless
+static inline pte_t ptep_get_lockless(pte_t *ptep)
+{
+ pte_t pte = __ptep_get(ptep);
+
+ if (!pte_valid_cont(pte))
+ return pte;
+
+ return contpte_ptep_get_lockless(ptep);
+}
+
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+ /*
+ * We don't have the mm or vaddr so cannot unfold or fold contig entries
+ * (since it requires tlb maintenance). set_pte() is not used in core
+ * code, so this should never even be called. Regardless do our best to
+ * service any call and emit a warning if there is any attempt to set a
+ * pte on top of an existing contig range.
+ */
+ pte_t orig_pte = __ptep_get(ptep);
+
+ WARN_ON_ONCE(pte_valid_cont(orig_pte));
+ __set_pte(ptep, pte_mknoncont(pte));
+}
+
+#define set_ptes set_ptes
+static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr)
+{
+ pte = pte_mknoncont(pte);
+
+ if (!contpte_is_enabled(mm))
+ __set_ptes(mm, addr, ptep, pte, nr);
+ else if (nr == 1) {
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __set_ptes(mm, addr, ptep, pte, nr);
+ contpte_try_fold(mm, addr, ptep, pte);
+ } else
+ contpte_set_ptes(mm, addr, ptep, pte, nr);
+}
+
+static inline void pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __pte_clear(mm, addr, ptep);
+}
+
+#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ return __ptep_get_and_clear(mm, addr, ptep);
+}
+
+#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (!pte_valid_cont(orig_pte))
+ return __ptep_test_and_clear_young(vma, addr, ptep);
+
+ return contpte_ptep_test_and_clear_young(vma, addr, ptep);
+}
+
+#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
+static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ if (!pte_valid_cont(orig_pte))
+ return __ptep_clear_flush_young(vma, addr, ptep);
+
+ return contpte_ptep_clear_flush_young(vma, addr, ptep);
+}
+
+#define __HAVE_ARCH_PTEP_SET_WRPROTECT
+static inline void ptep_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __ptep_set_wrprotect(mm, addr, ptep);
+ contpte_try_fold(mm, addr, ptep, __ptep_get(ptep));
+}
+
+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+static inline int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ pte_t orig_pte = __ptep_get(ptep);
+
+ entry = pte_mknoncont(entry);
+
+ if (!pte_valid_cont(orig_pte))
+ return __ptep_set_access_flags(vma, addr, ptep, entry, dirty);
+
+ return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty);
+}
+
+#else /* CONFIG_ARM64_CONTPTE */
+
#define ptep_get __ptep_get
#define set_pte __set_pte
#define set_ptes __set_ptes
@@ -1131,6 +1331,8 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags __ptep_set_access_flags

+#endif /* CONFIG_ARM64_CONTPTE */
+
#endif /* !__ASSEMBLY__ */

#endif /* __ASM_PGTABLE_H */
diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile
index dbd1bc95967d..60454256945b 100644
--- a/arch/arm64/mm/Makefile
+++ b/arch/arm64/mm/Makefile
@@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \
cache.o copypage.o flush.o \
ioremap.o mmap.o pgd.o mmu.o \
context.o proc.o pageattr.o fixmap.o
+obj-$(CONFIG_ARM64_CONTPTE) += contpte.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
new file mode 100644
index 000000000000..e079ec61d7d1
--- /dev/null
+++ b/arch/arm64/mm/contpte.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 ARM Ltd.
+ */
+
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <asm/tlbflush.h>
+
+static void ptep_clear_flush_range(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, int nr)
+{
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+ unsigned long start_addr = addr;
+ int i;
+
+ for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE)
+ __pte_clear(mm, addr, ptep);
+
+ __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
+}
+
+static bool ptep_any_valid(pte_t *ptep, int nr)
+{
+ int i;
+
+ for (i = 0; i < nr; i++, ptep++) {
+ if (pte_valid(__ptep_get(ptep)))
+ return true;
+ }
+
+ return false;
+}
+
+static void contpte_convert(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+ unsigned long start_addr;
+ pte_t *start_ptep;
+ int i;
+
+ start_ptep = ptep = contpte_align_down(ptep);
+ start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+ pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte));
+
+ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) {
+ pte_t ptent = __ptep_get_and_clear(mm, addr, ptep);
+
+ if (pte_dirty(ptent))
+ pte = pte_mkdirty(pte);
+
+ if (pte_young(ptent))
+ pte = pte_mkyoung(pte);
+ }
+
+ __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);
+
+ __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
+}
+
+void __contpte_try_fold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ /*
+ * We have already checked that the virtual and pysical addresses are
+ * correctly aligned for a contpte mapping in contpte_try_fold() so the
+ * remaining checks are to ensure that the contpte range is fully
+ * covered by a single folio, and ensure that all the ptes are valid
+ * with contiguous PFNs and matching prots. We ignore the state of the
+ * access and dirty bits for the purpose of deciding if its a contiguous
+ * range; the folding process will generate a single contpte entry which
+ * has a single access and dirty bit. Those 2 bits are the logical OR of
+ * their respective bits in the constituent pte entries. In order to
+ * ensure the contpte range is covered by a single folio, we must
+ * recover the folio from the pfn, but special mappings don't have a
+ * folio backing them. Fortunately contpte_try_fold() already checked
+ * that the pte is not special - we never try to fold special mappings.
+ * Note we can't use vm_normal_page() for this since we don't have the
+ * vma.
+ */
+
+ struct page *page = pte_page(pte);
+ struct folio *folio = page_folio(page);
+ unsigned long folio_saddr = addr - (page - &folio->page) * PAGE_SIZE;
+ unsigned long folio_eaddr = folio_saddr + folio_nr_pages(folio) * PAGE_SIZE;
+ unsigned long cont_saddr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+ unsigned long cont_eaddr = cont_saddr + CONT_PTE_SIZE;
+ unsigned long pfn;
+ pgprot_t prot;
+ pte_t subpte;
+ pte_t *orig_ptep;
+ int i;
+
+ if (folio_saddr > cont_saddr || folio_eaddr < cont_eaddr)
+ return;
+
+ pfn = pte_pfn(pte) - ((addr - cont_saddr) >> PAGE_SHIFT);
+ prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
+ orig_ptep = ptep;
+ ptep = contpte_align_down(ptep);
+
+ for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
+ subpte = __ptep_get(ptep);
+ subpte = pte_mkold(pte_mkclean(subpte));
+
+ if (!pte_valid(subpte) ||
+ pte_pfn(subpte) != pfn ||
+ pgprot_val(pte_pgprot(subpte)) != pgprot_val(prot))
+ return;
+ }
+
+ pte = pte_mkcont(pte);
+ contpte_convert(mm, addr, orig_ptep, pte);
+}
+EXPORT_SYMBOL(__contpte_try_fold);
+
+void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ /*
+ * We have already checked that the ptes are contiguous in
+ * contpte_try_unfold(), so we can unfold unconditionally here.
+ */
+
+ pte = pte_mknoncont(pte);
+ contpte_convert(mm, addr, ptep, pte);
+}
+EXPORT_SYMBOL(__contpte_try_unfold);
+
+pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
+{
+ /*
+ * Gather access/dirty bits, which may be populated in any of the ptes
+ * of the contig range. We are guarranteed to be holding the PTL, so any
+ * contiguous range cannot be unfolded or otherwise modified under our
+ * feet.
+ */
+
+ pte_t pte;
+ int i;
+
+ ptep = contpte_align_down(ptep);
+
+ for (i = 0; i < CONT_PTES; i++, ptep++) {
+ pte = __ptep_get(ptep);
+
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+
+ return orig_pte;
+}
+EXPORT_SYMBOL(contpte_ptep_get);
+
+pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
+{
+ /*
+ * Gather access/dirty bits, which may be populated in any of the ptes
+ * of the contig range. We may not be holding the PTL, so any contiguous
+ * range may be unfolded/modified/refolded under our feet. Therefore we
+ * ensure we read a _consistent_ contpte range by checking that all ptes
+ * in the range are valid and have CONT_PTE set, that all pfns are
+ * contiguous and that all pgprots are the same (ignoring access/dirty).
+ * If we find a pte that is not consistent, then we must be racing with
+ * an update so start again. If the target pte does not have CONT_PTE
+ * set then that is considered consistent on its own because it is not
+ * part of a contpte range.
+ */
+
+ pte_t orig_pte;
+ pgprot_t orig_prot;
+ pte_t *ptep;
+ unsigned long pfn;
+ pte_t pte;
+ pgprot_t prot;
+ int i;
+
+retry:
+ orig_pte = __ptep_get(orig_ptep);
+
+ if (!pte_valid_cont(orig_pte))
+ return orig_pte;
+
+ orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte)));
+ ptep = contpte_align_down(orig_ptep);
+ pfn = pte_pfn(orig_pte) - (orig_ptep - ptep);
+
+ for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
+ pte = __ptep_get(ptep);
+ prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
+
+ if (!pte_valid_cont(pte) ||
+ pte_pfn(pte) != pfn ||
+ pgprot_val(prot) != pgprot_val(orig_prot))
+ goto retry;
+
+ if (pte_dirty(pte))
+ orig_pte = pte_mkdirty(orig_pte);
+
+ if (pte_young(pte))
+ orig_pte = pte_mkyoung(orig_pte);
+ }
+
+ return orig_pte;
+}
+EXPORT_SYMBOL(contpte_ptep_get_lockless);
+
+void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte, unsigned int nr)
+{
+ unsigned long next;
+ unsigned long end = addr + (nr << PAGE_SHIFT);
+ unsigned long pfn = pte_pfn(pte);
+ pgprot_t prot = pte_pgprot(pte);
+ pte_t orig_pte;
+
+ do {
+ next = pte_cont_addr_end(addr, end);
+ nr = (next - addr) >> PAGE_SHIFT;
+ pte = pfn_pte(pfn, prot);
+
+ if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0)
+ pte = pte_mkcont(pte);
+ else
+ pte = pte_mknoncont(pte);
+
+ /*
+ * If operating on a partial contiguous range then we must first
+ * unfold the contiguous range if it was previously folded.
+ * Otherwise we could end up with overlapping tlb entries.
+ */
+ if (nr != CONT_PTES)
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+
+ /*
+ * If we are replacing ptes that were contiguous or if the new
+ * ptes are contiguous and any of the ptes being replaced are
+ * valid, we need to clear and flush the range to prevent
+ * overlapping tlb entries.
+ */
+ orig_pte = __ptep_get(ptep);
+ if (pte_valid_cont(orig_pte) ||
+ (pte_cont(pte) && ptep_any_valid(ptep, nr)))
+ ptep_clear_flush_range(mm, addr, ptep, nr);
+
+ __set_ptes(mm, addr, ptep, pte, nr);
+
+ addr = next;
+ ptep += nr;
+ pfn += nr;
+
+ } while (addr != end);
+}
+EXPORT_SYMBOL(contpte_set_ptes);
+
+int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ /*
+ * ptep_clear_flush_young() technically requires us to clear the access
+ * flag for a _single_ pte. However, the core-mm code actually tracks
+ * access/dirty per folio, not per page. And since we only create a
+ * contig range when the range is covered by a single folio, we can get
+ * away with clearing young for the whole contig range here, so we avoid
+ * having to unfold.
+ */
+
+ int i;
+ int young = 0;
+
+ ptep = contpte_align_down(ptep);
+ addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+
+ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
+ young |= __ptep_test_and_clear_young(vma, addr, ptep);
+
+ return young;
+}
+EXPORT_SYMBOL(contpte_ptep_test_and_clear_young);
+
+int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int young;
+
+ young = contpte_ptep_test_and_clear_young(vma, addr, ptep);
+
+ if (young) {
+ /*
+ * See comment in __ptep_clear_flush_young(); same rationale for
+ * eliding the trailing DSB applies here.
+ */
+ addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+ __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE,
+ PAGE_SIZE, true, 3);
+ }
+
+ return young;
+}
+EXPORT_SYMBOL(contpte_ptep_clear_flush_young);
+
+int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ pte_t orig_pte;
+ int i;
+ unsigned long start_addr;
+
+ /*
+ * Gather the access/dirty bits for the contiguous range. If nothing has
+ * changed, its a noop.
+ */
+ orig_pte = ptep_get(ptep);
+ if (pte_val(orig_pte) == pte_val(entry))
+ return 0;
+
+ /*
+ * We can fix up access/dirty bits without having to unfold/fold the
+ * contig range. But if the write bit is changing, we need to go through
+ * the full unfold/fold cycle.
+ */
+ if (pte_write(orig_pte) == pte_write(entry)) {
+ /*
+ * For HW access management, we technically only need to update
+ * the flag on a single pte in the range. But for SW access
+ * management, we need to update all the ptes to prevent extra
+ * faults. Avoid per-page tlb flush in __ptep_set_access_flags()
+ * and instead flush the whole range at the end.
+ */
+ ptep = contpte_align_down(ptep);
+ start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
+
+ for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
+ __ptep_set_access_flags(vma, addr, ptep, entry, 0);
+
+ if (dirty)
+ __flush_tlb_range(vma, start_addr, addr,
+ PAGE_SIZE, true, 3);
+ } else {
+ __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
+ __ptep_set_access_flags(vma, addr, ptep, entry, dirty);
+ contpte_try_fold(vma->vm_mm, addr, ptep, entry);
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(contpte_ptep_set_access_flags);
--
2.25.1

2023-12-04 10:57:00

[permalink] [raw]

Subject: [PATCH v3 04/15] arm64/mm: set_ptes()/set_pte_at(): New layer to manage contig bit

2023-12-04 10:57:05

[permalink] [raw]

Subject: [PATCH v3 06/15] arm64/mm: ptep_get_and_clear(): New layer to manage contig bit

2023-12-04 10:57:05

[permalink] [raw]

Subject: [PATCH v3 15/15] arm64/mm: Implement clear_ptes() to optimize exit()

With the core-mm changes in place to batch-clear ptes during
zap_pte_range(), we can take advantage of this in arm64 to greatly
reduce the number of tlbis we have to issue, and recover the lost exit
performance incured when adding support for transparent contiguous ptes.

If we are clearing a whole contpte range, we can elide first unfolding
that range and save the tlbis. We just clear the whole range.

The following shows the results of running a kernel compilation workload
and measuring the cost of arm64_sys_exit_group() (which at ~1.5% is a
very small part of the overall workload).

Benchmarks were run on Ampere Altra in 2 configs; single numa node and 2
numa nodes (tlbis are more expensive in 2 node config).

- baseline: v6.7-rc1 + anonfolio-v7
- no-opt: contpte series without any attempt to optimize exit()
- simple-ptep_get_clear_full: simple optimization to exploit full=1.
ptep_get_clear_full() does not fully conform to its intended semantic
- robust-ptep_get_clear_full: similar to previous but
ptep_get_clear_full() fully conforms to its intended semantic
- clear_ptes: optimization implemented by this patch

| config | numa=1 | numa=2 |
|----------------------------|--------|--------|
| baseline | 0% | 0% |
| no-opt | 190% | 768% |
| simple-ptep_get_clear_full | 8% | 29% |
| robust-ptep_get_clear_full | 21% | 19% |
| clear_ptes | 13% | 9% |

In all cases, the cost of arm64_sys_exit_group() increases; this is
anticipated because there is more work to do to tear down the page
tables. But clear_ptes() gives the smallest increase overall.

Signed-off-by: Ryan Roberts <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 32 ++++++++++++++++++++++++
arch/arm64/mm/contpte.c | 42 ++++++++++++++++++++++++++++++++
2 files changed, 74 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 9bd2f57a9e11..ff6b3cc9e819 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1145,6 +1145,8 @@ extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte);
extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep);
extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte, unsigned int nr);
+extern pte_t contpte_clear_ptes(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr);
extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
@@ -1270,6 +1272,36 @@ static inline void pte_clear(struct mm_struct *mm,
__pte_clear(mm, addr, ptep);
}

+#define clear_ptes clear_ptes
+static inline pte_t clear_ptes(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep, int full,
+ unsigned int nr)
+{
+ pte_t pte;
+
+ if (!contpte_is_enabled(mm)) {
+ unsigned int i;
+ pte_t tail;
+
+ pte = __ptep_get_and_clear(mm, addr, ptep);
+ for (i = 1; i < nr; i++) {
+ addr += PAGE_SIZE;
+ ptep++;
+ tail = __ptep_get_and_clear(mm, addr, ptep);
+ if (pte_dirty(tail))
+ pte = pte_mkdirty(pte);
+ if (pte_young(tail))
+ pte = pte_mkyoung(pte);
+ }
+ } else if (nr == 1) {
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ pte = __ptep_get_and_clear(mm, addr, ptep);
+ } else
+ pte = contpte_clear_ptes(mm, addr, ptep, nr);
+
+ return pte;
+}
+
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index 2a57df16bf58..34b43bde3fcd 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -257,6 +257,48 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
}
EXPORT_SYMBOL(contpte_set_ptes);

+pte_t contpte_clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+ unsigned int nr)
+{
+ /*
+ * If we cover a partial contpte block at the beginning or end of the
+ * batch, unfold if currently folded. This makes it safe to clear some
+ * of the entries while keeping others. contpte blocks in the middle of
+ * the range, which are fully covered don't need to be unfolded because
+ * we will clear the full block.
+ */
+
+ unsigned int i;
+ pte_t pte;
+ pte_t tail;
+
+ if (ptep != contpte_align_down(ptep) || nr < CONT_PTES)
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+
+ if (ptep + nr != contpte_align_down(ptep + nr))
+ contpte_try_unfold(mm, addr + PAGE_SIZE * (nr - 1),
+ ptep + nr - 1,
+ __ptep_get(ptep + nr - 1));
+
+ pte = __ptep_get_and_clear(mm, addr, ptep);
+
+ for (i = 1; i < nr; i++) {
+ addr += PAGE_SIZE;
+ ptep++;
+
+ tail = __ptep_get_and_clear(mm, addr, ptep);
+
+ if (pte_dirty(tail))
+ pte = pte_mkdirty(pte);
+
+ if (pte_young(tail))
+ pte = pte_mkyoung(pte);
+ }
+
+ return pte;
+}
+EXPORT_SYMBOL(contpte_clear_ptes);
+
int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
--
2.25.1

2023-12-04 11:24:57

[permalink] [raw]

Subject: [PATCH v3 10/15] arm64/mm: ptep_set_access_flags(): New layer to manage contig bit

2023-12-04 11:25:22

[permalink] [raw]

Subject: [PATCH v3 07/15] arm64/mm: ptep_test_and_clear_young(): New layer to manage contig bit

2023-12-04 11:25:35

[permalink] [raw]

Subject: [PATCH v3 14/15] arm64/mm: Implement ptep_set_wrprotects() to optimize fork()

With the core-mm changes in place to batch-copy ptes during fork, we can
take advantage of this in arm64 to greatly reduce the number of tlbis we
have to issue, and recover the lost fork performance incured when adding
support for transparent contiguous ptes.

If we are write-protecting a whole contig range, we can apply the
write-protection to the whole range and know that it won't change
whether the range should have the contiguous bit set or not. For ranges
smaller than the contig range, we will still have to unfold, apply the
write-protection, then fold if the change now means the range is
foldable.

This optimization is possible thanks to the tightening of the Arm ARM in
respect to the definition and behaviour when 'Misprogramming the
Contiguous bit'. See section D21194 at
https://developer.arm.com/documentation/102105/latest/

Performance tested with the following test written for the will-it-scale
framework:

-------

char *testcase_description = "fork and exit";

void testcase(unsigned long long *iterations, unsigned long nr)
{
int pid;
char *mem;

mem = malloc(SZ_128M);
assert(mem);
memset(mem, 1, SZ_128M);

while (1) {
pid = fork();
assert(pid >= 0);

if (!pid)
exit(0);

waitpid(pid, NULL, 0);

(*iterations)++;
}
}

-------

I see huge performance regression when PTE_CONT support was added, then
the regression is mostly fixed with the addition of this change. The
following shows regression relative to before PTE_CONT was enabled
(bigger negative value is bigger regression):

| cpus | before opt | after opt |
|-------:|-------------:|------------:|
| 1 | -10.4% | -5.2% |
| 8 | -15.4% | -3.5% |
| 16 | -38.7% | -3.7% |
| 24 | -57.0% | -4.4% |
| 32 | -65.8% | -5.4% |

Signed-off-by: Ryan Roberts <[email protected]>
---
arch/arm64/include/asm/pgtable.h | 30 ++++++++++++++++++++---
arch/arm64/mm/contpte.c | 42 ++++++++++++++++++++++++++++++++
2 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 15bc9cf1eef4..9bd2f57a9e11 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -984,6 +984,16 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm,
} while (pte_val(pte) != pte_val(old_pte));
}

+static inline void __ptep_set_wrprotects(struct mm_struct *mm,
+ unsigned long address, pte_t *ptep,
+ unsigned int nr)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
+ __ptep_set_wrprotect(mm, address, ptep);
+}
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
@@ -1139,6 +1149,8 @@ extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep);
+extern void contpte_set_wrprotects(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr);
extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty);
@@ -1290,13 +1302,25 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
return contpte_ptep_clear_flush_young(vma, addr, ptep);
}

+#define ptep_set_wrprotects ptep_set_wrprotects
+static inline void ptep_set_wrprotects(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ if (!contpte_is_enabled(mm))
+ __ptep_set_wrprotects(mm, addr, ptep, nr);
+ else if (nr == 1) {
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+ __ptep_set_wrprotects(mm, addr, ptep, 1);
+ contpte_try_fold(mm, addr, ptep, __ptep_get(ptep));
+ } else
+ contpte_set_wrprotects(mm, addr, ptep, nr);
+}
+
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
static inline void ptep_set_wrprotect(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
- __ptep_set_wrprotect(mm, addr, ptep);
- contpte_try_fold(mm, addr, ptep, __ptep_get(ptep));
+ ptep_set_wrprotects(mm, addr, ptep, 1);
}

#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
index e079ec61d7d1..2a57df16bf58 100644
--- a/arch/arm64/mm/contpte.c
+++ b/arch/arm64/mm/contpte.c
@@ -303,6 +303,48 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
}
EXPORT_SYMBOL(contpte_ptep_clear_flush_young);

+void contpte_set_wrprotects(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned int nr)
+{
+ unsigned long next;
+ unsigned long end = addr + (nr << PAGE_SHIFT);
+
+ do {
+ next = pte_cont_addr_end(addr, end);
+ nr = (next - addr) >> PAGE_SHIFT;
+
+ /*
+ * If wrprotecting an entire contig range, we can avoid
+ * unfolding. Just set wrprotect and wait for the later
+ * mmu_gather flush to invalidate the tlb. Until the flush, the
+ * page may or may not be wrprotected. After the flush, it is
+ * guarranteed wrprotected. If its a partial range though, we
+ * must unfold, because we can't have a case where CONT_PTE is
+ * set but wrprotect applies to a subset of the PTEs; this would
+ * cause it to continue to be unpredictable after the flush.
+ */
+ if (nr != CONT_PTES)
+ contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
+
+ __ptep_set_wrprotects(mm, addr, ptep, nr);
+
+ addr = next;
+ ptep += nr;
+
+ /*
+ * If applying to a partial contig range, the change could have
+ * made the range foldable. Use the last pte in the range we
+ * just set for comparison, since contpte_try_fold() only
+ * triggers when acting on the last pte in the contig range.
+ */
+ if (nr != CONT_PTES)
+ contpte_try_fold(mm, addr - PAGE_SIZE, ptep - 1,
+ __ptep_get(ptep - 1));
+
+ } while (addr != end);
+}
+EXPORT_SYMBOL(contpte_set_wrprotects);
+
int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep,
pte_t entry, int dirty)
--
2.25.1

2023-12-04 11:25:46

[permalink] [raw]

Subject: [PATCH v3 09/15] arm64/mm: ptep_set_wrprotect(): New layer to manage contig bit

2023-12-04 11:25:52

[permalink] [raw]

Subject: [PATCH v3 08/15] arm64/mm: ptep_clear_flush_young(): New layer to manage contig bit

2023-12-04 15:47:42

[permalink] [raw]

Subject: Re: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

On 04.12.23 11:54, Ryan Roberts wrote:
> Convert copy_pte_range() to copy a set of ptes in a batch. A given batch
> maps a physically contiguous block of memory, all belonging to the same
> folio. This will likely improve performance by a tiny amount due to
> batching the folio reference count management and calling set_ptes()
> rather than making individual calls to set_pte_at().
>
> However, the primary motivation for this change is to reduce the number
> of tlb maintenance operations that the arm64 backend has to perform
> during fork, as it is about to add transparent support for the
> "contiguous bit" in its ptes. By write-protecting the parent using the
> new ptep_set_wrprotects() (note the 's' at the end) function, the
> backend can avoid having to unfold contig ranges of PTEs, which is
> expensive, when all ptes in the range are being write-protected.
> Similarly, by using set_ptes() rather than set_pte_at() to set up ptes
> in the child, the backend does not need to fold a contiguous range once
> they are all populated - they can be initially populated as a contiguous
> range in the first place.
>
> This change addresses the core-mm refactoring only, and introduces
> ptep_set_wrprotects() with a default implementation that calls
> ptep_set_wrprotect() for each pte in the range. A separate change will
> implement ptep_set_wrprotects() in the arm64 backend to realize the
> performance improvement as part of the work to enable contpte mappings.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> include/linux/pgtable.h | 13 +++
> mm/memory.c | 195 ++++++++++++++++++++++++++++++----------
> 2 files changed, 162 insertions(+), 46 deletions(-)
>
> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> index af7639c3b0a3..1c50f8a0fdde 100644
> --- a/include/linux/pgtable.h
> +++ b/include/linux/pgtable.h
> @@ -622,6 +622,19 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
> }
> #endif
>
> +#ifndef ptep_set_wrprotects
> +struct mm_struct;
> +static inline void ptep_set_wrprotects(struct mm_struct *mm,
> + unsigned long address, pte_t *ptep,
> + unsigned int nr)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
> + ptep_set_wrprotect(mm, address, ptep);
> +}
> +#endif
> +
> /*
> * On some architectures hardware does not set page access bit when accessing
> * memory page, it is responsibility of software setting this bit. It brings
> diff --git a/mm/memory.c b/mm/memory.c
> index 1f18ed4a5497..8a87a488950c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -924,68 +924,162 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
> return 0;
> }
>
> +static int folio_nr_pages_cont_mapped(struct folio *folio,
> + struct page *page, pte_t *pte,
> + unsigned long addr, unsigned long end,
> + pte_t ptent, bool enforce_uffd_wp,
> + int *dirty_nr, int *writable_nr)
> +{
> + int floops;
> + int i;
> + unsigned long pfn;
> + bool prot_none;
> + bool uffd_wp;
> +
> + if (!folio_test_large(folio))
> + return 1;
> +
> + /*
> + * Loop either to `end` or to end of folio if its contiguously mapped,
> + * whichever is smaller.
> + */
> + floops = (end - addr) >> PAGE_SHIFT;
> + floops = min_t(int, floops,
> + folio_pfn(folio_next(folio)) - page_to_pfn(page));
> +
> + pfn = page_to_pfn(page);
> + prot_none = pte_protnone(ptent);
> + uffd_wp = pte_uffd_wp(ptent);
> +
> + *dirty_nr = !!pte_dirty(ptent);
> + *writable_nr = !!pte_write(ptent);
> +
> + pfn++;
> + pte++;
> +
> + for (i = 1; i < floops; i++) {
> + ptent = ptep_get(pte);
> +
> + if (!pte_present(ptent) || pte_pfn(ptent) != pfn ||
> + prot_none != pte_protnone(ptent) ||
> + (enforce_uffd_wp && uffd_wp != pte_uffd_wp(ptent)))
> + break;
> +
> + if (pte_dirty(ptent))
> + (*dirty_nr)++;
> + if (pte_write(ptent))
> + (*writable_nr)++;
> +
> + pfn++;
> + pte++;
> + }
> +
> + return i;
> +}
> +
> /*
> - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
> - * is required to copy this pte.
> + * Copy set of contiguous ptes. Returns number of ptes copied if succeeded
> + * (always gte 1), or -EAGAIN if one preallocated page is required to copy the
> + * first pte.
> */
> static inline int
> -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> - pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
> - struct folio **prealloc)
> +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> + pte_t *dst_pte, pte_t *src_pte,
> + unsigned long addr, unsigned long end,
> + int *rss, struct folio **prealloc)
> {
> struct mm_struct *src_mm = src_vma->vm_mm;
> unsigned long vm_flags = src_vma->vm_flags;
> pte_t pte = ptep_get(src_pte);
> struct page *page;
> struct folio *folio;
> + int nr = 1;
> + bool anon = false;
> + bool enforce_uffd_wp = userfaultfd_wp(dst_vma);
> + int nr_dirty = !!pte_dirty(pte);
> + int nr_writable = !!pte_write(pte);
> + int i, ret;
>
> page = vm_normal_page(src_vma, addr, pte);
> - if (page)
> + if (page) {
> folio = page_folio(page);
> - if (page && folio_test_anon(folio)) {
> - /*
> - * If this page may have been pinned by the parent process,
> - * copy the page immediately for the child so that we'll always
> - * guarantee the pinned page won't be randomly replaced in the
> - * future.
> - */
> - folio_get(folio);
> - if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
> - /* Page may be pinned, we have to copy. */
> - folio_put(folio);
> - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
> - addr, rss, prealloc, page);
> + anon = folio_test_anon(folio);
> + nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
> + pte, enforce_uffd_wp, &nr_dirty,
> + &nr_writable);
> + folio_ref_add(folio, nr);
> +
> + for (i = 0; i < nr; i++, page++) {
> + if (anon) {
> + /*
> + * If this page may have been pinned by the
> + * parent process, copy the page immediately for
> + * the child so that we'll always guarantee the
> + * pinned page won't be randomly replaced in the
> + * future.
> + */
> + if (unlikely(page_try_dup_anon_rmap(
> + page, false, src_vma))) {
> + if (i != 0)
> + break;
> + /* Page may be pinned, we have to copy. */
> + folio_ref_sub(folio, nr);
> + ret = copy_present_page(
> + dst_vma, src_vma, dst_pte,
> + src_pte, addr, rss, prealloc,
> + page);
> + return ret == 0 ? 1 : ret;
> + }
> + rss[MM_ANONPAGES]++;
> + VM_BUG_ON(PageAnonExclusive(page));
> + } else {
> + page_dup_file_rmap(page, false);
> + rss[mm_counter_file(page)]++;
> + }
> }
> - rss[MM_ANONPAGES]++;
> - } else if (page) {
> - folio_get(folio);
> - page_dup_file_rmap(page, false);
> - rss[mm_counter_file(page)]++;
> - }

This likely looks a lot neater if you keep the existing structure.

For example, you can simply have on the !anon path

} else if (page) {
folio = page_folio(page);
nr = folio_nr_pages_cont_mapped ...
folio_ref_add(folio, nr);
for (i = 0; i < nr; i++, page++)
page_dup_file_rmap(page, false);
rss[mm_counter_file(&folio->page)] += nr;
}

--
Cheers,

David / dhildenb

2023-12-04 16:01:27

[permalink] [raw]

Subject: Re: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

On 04.12.23 16:47, David Hildenbrand wrote:
> On 04.12.23 11:54, Ryan Roberts wrote:
>> Convert copy_pte_range() to copy a set of ptes in a batch. A given batch
>> maps a physically contiguous block of memory, all belonging to the same
>> folio. This will likely improve performance by a tiny amount due to
>> batching the folio reference count management and calling set_ptes()
>> rather than making individual calls to set_pte_at().
>>
>> However, the primary motivation for this change is to reduce the number
>> of tlb maintenance operations that the arm64 backend has to perform
>> during fork, as it is about to add transparent support for the
>> "contiguous bit" in its ptes. By write-protecting the parent using the
>> new ptep_set_wrprotects() (note the 's' at the end) function, the
>> backend can avoid having to unfold contig ranges of PTEs, which is
>> expensive, when all ptes in the range are being write-protected.
>> Similarly, by using set_ptes() rather than set_pte_at() to set up ptes
>> in the child, the backend does not need to fold a contiguous range once
>> they are all populated - they can be initially populated as a contiguous
>> range in the first place.
>>
>> This change addresses the core-mm refactoring only, and introduces
>> ptep_set_wrprotects() with a default implementation that calls
>> ptep_set_wrprotect() for each pte in the range. A separate change will
>> implement ptep_set_wrprotects() in the arm64 backend to realize the
>> performance improvement as part of the work to enable contpte mappings.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> include/linux/pgtable.h | 13 +++
>> mm/memory.c | 195 ++++++++++++++++++++++++++++++----------
>> 2 files changed, 162 insertions(+), 46 deletions(-)
>>
>> diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
>> index af7639c3b0a3..1c50f8a0fdde 100644
>> --- a/include/linux/pgtable.h
>> +++ b/include/linux/pgtable.h
>> @@ -622,6 +622,19 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
>> }
>> #endif
>>
>> +#ifndef ptep_set_wrprotects
>> +struct mm_struct;
>> +static inline void ptep_set_wrprotects(struct mm_struct *mm,
>> + unsigned long address, pte_t *ptep,
>> + unsigned int nr)
>> +{
>> + unsigned int i;
>> +
>> + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
>> + ptep_set_wrprotect(mm, address, ptep);
>> +}
>> +#endif
>> +
>> /*
>> * On some architectures hardware does not set page access bit when accessing
>> * memory page, it is responsibility of software setting this bit. It brings
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 1f18ed4a5497..8a87a488950c 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -924,68 +924,162 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
>> return 0;
>> }
>>
>> +static int folio_nr_pages_cont_mapped(struct folio *folio,
>> + struct page *page, pte_t *pte,
>> + unsigned long addr, unsigned long end,
>> + pte_t ptent, bool enforce_uffd_wp,
>> + int *dirty_nr, int *writable_nr)
>> +{
>> + int floops;
>> + int i;
>> + unsigned long pfn;
>> + bool prot_none;
>> + bool uffd_wp;
>> +
>> + if (!folio_test_large(folio))
>> + return 1;
>> +
>> + /*
>> + * Loop either to `end` or to end of folio if its contiguously mapped,
>> + * whichever is smaller.
>> + */
>> + floops = (end - addr) >> PAGE_SHIFT;
>> + floops = min_t(int, floops,
>> + folio_pfn(folio_next(folio)) - page_to_pfn(page));
>> +
>> + pfn = page_to_pfn(page);
>> + prot_none = pte_protnone(ptent);
>> + uffd_wp = pte_uffd_wp(ptent);
>> +
>> + *dirty_nr = !!pte_dirty(ptent);
>> + *writable_nr = !!pte_write(ptent);
>> +
>> + pfn++;
>> + pte++;
>> +
>> + for (i = 1; i < floops; i++) {
>> + ptent = ptep_get(pte);
>> +
>> + if (!pte_present(ptent) || pte_pfn(ptent) != pfn ||
>> + prot_none != pte_protnone(ptent) ||
>> + (enforce_uffd_wp && uffd_wp != pte_uffd_wp(ptent)))
>> + break;
>> +
>> + if (pte_dirty(ptent))
>> + (*dirty_nr)++;
>> + if (pte_write(ptent))
>> + (*writable_nr)++;
>> +
>> + pfn++;
>> + pte++;
>> + }
>> +
>> + return i;
>> +}
>> +
>> /*
>> - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
>> - * is required to copy this pte.
>> + * Copy set of contiguous ptes. Returns number of ptes copied if succeeded
>> + * (always gte 1), or -EAGAIN if one preallocated page is required to copy the
>> + * first pte.
>> */
>> static inline int
>> -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>> - pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
>> - struct folio **prealloc)
>> +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>> + pte_t *dst_pte, pte_t *src_pte,
>> + unsigned long addr, unsigned long end,
>> + int *rss, struct folio **prealloc)
>> {
>> struct mm_struct *src_mm = src_vma->vm_mm;
>> unsigned long vm_flags = src_vma->vm_flags;
>> pte_t pte = ptep_get(src_pte);
>> struct page *page;
>> struct folio *folio;
>> + int nr = 1;
>> + bool anon = false;
>> + bool enforce_uffd_wp = userfaultfd_wp(dst_vma);
>> + int nr_dirty = !!pte_dirty(pte);
>> + int nr_writable = !!pte_write(pte);
>> + int i, ret;
>>
>> page = vm_normal_page(src_vma, addr, pte);
>> - if (page)
>> + if (page) {
>> folio = page_folio(page);
>> - if (page && folio_test_anon(folio)) {
>> - /*
>> - * If this page may have been pinned by the parent process,
>> - * copy the page immediately for the child so that we'll always
>> - * guarantee the pinned page won't be randomly replaced in the
>> - * future.
>> - */
>> - folio_get(folio);
>> - if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
>> - /* Page may be pinned, we have to copy. */
>> - folio_put(folio);
>> - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
>> - addr, rss, prealloc, page);
>> + anon = folio_test_anon(folio);
>> + nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
>> + pte, enforce_uffd_wp, &nr_dirty,
>> + &nr_writable);
>> + folio_ref_add(folio, nr);
>> +
>> + for (i = 0; i < nr; i++, page++) {
>> + if (anon) {
>> + /*
>> + * If this page may have been pinned by the
>> + * parent process, copy the page immediately for
>> + * the child so that we'll always guarantee the
>> + * pinned page won't be randomly replaced in the
>> + * future.
>> + */
>> + if (unlikely(page_try_dup_anon_rmap(
>> + page, false, src_vma))) {
>> + if (i != 0)
>> + break;
>> + /* Page may be pinned, we have to copy. */
>> + folio_ref_sub(folio, nr);
>> + ret = copy_present_page(
>> + dst_vma, src_vma, dst_pte,
>> + src_pte, addr, rss, prealloc,
>> + page);
>> + return ret == 0 ? 1 : ret;
>> + }
>> + rss[MM_ANONPAGES]++;
>> + VM_BUG_ON(PageAnonExclusive(page));
>> + } else {
>> + page_dup_file_rmap(page, false);
>> + rss[mm_counter_file(page)]++;
>> + }
>> }
>> - rss[MM_ANONPAGES]++;
>> - } else if (page) {
>> - folio_get(folio);
>> - page_dup_file_rmap(page, false);
>> - rss[mm_counter_file(page)]++;
>> - }
>
> This likely looks a lot neater if you keep the existing structure.
>
> For example, you can simply have on the !anon path
>
> } else if (page) {
> folio = page_folio(page);
> nr = folio_nr_pages_cont_mapped ...
> folio_ref_add(folio, nr);
> for (i = 0; i < nr; i++, page++)
> page_dup_file_rmap(page, false);
> rss[mm_counter_file(&folio->page)] += nr;
> }
>

With rmap batching from [1] -- rebased+changed on top of that -- we could turn
that into an effective (untested):

if (page && folio_test_anon(folio)) {
+ nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
+ pte, enforce_uffd_wp, &nr_dirty,
+ &nr_writable);
/*
* If this page may have been pinned by the parent process,
* copy the page immediately for the child so that we'll always
* guarantee the pinned page won't be randomly replaced in the
* future.
*/
- folio_get(folio);
- if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
+ folio_ref_add(folio, nr);
+ if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr, src_vma))) {
/* Page may be pinned, we have to copy. */
- folio_put(folio);
- return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
- addr, rss, prealloc, page);
+ folio_ref_sub(folio, nr);
+ ret = copy_present_page(dst_vma, src_vma, dst_pte,
+ src_pte, addr, rss, prealloc,
+ page);
+ return ret == 0 ? 1 : ret;
}
- rss[MM_ANONPAGES]++;
+ rss[MM_ANONPAGES] += nr;
} else if (page) {
- folio_get(folio);
- folio_dup_file_rmap_pte(folio, page);
- rss[mm_counter_file(page)]++;
+ nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
+ pte, enforce_uffd_wp, &nr_dirty,
+ &nr_writable);
+ folio_ref_add(folio, nr);
+ folio_dup_file_rmap_ptes(folio, page, nr);
+ rss[mm_counter_file(page)] += nr;
}

We'll have to test performance, but it could be that we want to specialize
more on !folio_test_large(). That code is very performance-sensitive.

[1] https://lkml.kernel.org/r/[email protected]

--
Cheers,

David / dhildenb

2023-12-04 17:27:30

[permalink] [raw]

Subject: Re: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

>
> With rmap batching from [1] -- rebased+changed on top of that -- we could turn
> that into an effective (untested):
>
> if (page && folio_test_anon(folio)) {
> + nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
> + pte, enforce_uffd_wp, &nr_dirty,
> + &nr_writable);
> /*
> * If this page may have been pinned by the parent process,
> * copy the page immediately for the child so that we'll always
> * guarantee the pinned page won't be randomly replaced in the
> * future.
> */
> - folio_get(folio);
> - if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
> + folio_ref_add(folio, nr);
> + if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr, src_vma))) {
> /* Page may be pinned, we have to copy. */
> - folio_put(folio);
> - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
> - addr, rss, prealloc, page);
> + folio_ref_sub(folio, nr);
> + ret = copy_present_page(dst_vma, src_vma, dst_pte,
> + src_pte, addr, rss, prealloc,
> + page);
> + return ret == 0 ? 1 : ret;
> }
> - rss[MM_ANONPAGES]++;
> + rss[MM_ANONPAGES] += nr;
> } else if (page) {
> - folio_get(folio);
> - folio_dup_file_rmap_pte(folio, page);
> - rss[mm_counter_file(page)]++;
> + nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
> + pte, enforce_uffd_wp, &nr_dirty,
> + &nr_writable);
> + folio_ref_add(folio, nr);
> + folio_dup_file_rmap_ptes(folio, page, nr);
> + rss[mm_counter_file(page)] += nr;
> }
>
>
> We'll have to test performance, but it could be that we want to specialize
> more on !folio_test_large(). That code is very performance-sensitive.
>
>
> [1] https://lkml.kernel.org/r/[email protected]

So, on top of [1] without rmap batching but with a slightly modified
version of yours (that keeps the existing code structure as pointed out
and e.g., updates counter updates), running my fork() microbenchmark
with a 1 GiB of memory:

Compared to [1], with all order-0 pages it gets 13--14% _slower_ and
with all PTE-mapped THP (order-9) it gets ~29--30% _faster_.

So looks like we really want to have a completely seprate code path for
"!folio_test_large()" to keep that case as fast as possible. And
"Likely" we want to use "likely(!folio_test_large()". ;)

Performing rmap batching on top of that code only slightly (another 1%
or so) improves performance in the PTE-mapped THP (order-9) case right
now, in contrast to other rmap batching. Reason is as all rmap code gets
inlined here and we're only doing subpage mapcount updates + PAE handling.

--
Cheers,

David / dhildenb

2023-12-05 03:41:42

by John Hubbard

[permalink] [raw]

Subject: Re: [PATCH v3 00/15] Transparent Contiguous PTEs for User Mappings

On 12/4/23 02:54, Ryan Roberts wrote:
> Hi All,
>
> This is v3 of a series to opportunistically and transparently use contpte
> mappings (set the contiguous bit in ptes) for user memory when those mappings
> meet the requirements. It is part of a wider effort to improve performance by
> allocating and mapping variable-sized blocks of memory (folios). One aim is for
> the 4K kernel to approach the performance of the 16K kernel, but without
> breaking compatibility and without the associated increase in memory. Another
> aim is to benefit the 16K and 64K kernels by enabling 2M THP, since this is the
> contpte size for those kernels. We have good performance data that demonstrates
> both aims are being met (see below).
>
> Of course this is only one half of the change. We require the mapped physical
> memory to be the correct size and alignment for this to actually be useful (i.e.
> 64K for 4K pages, or 2M for 16K/64K pages). Fortunately folios are solving this
> problem for us. Filesystems that support it (XFS, AFS, EROFS, tmpfs, ...) will
> allocate large folios up to the PMD size today, and more filesystems are coming.
> And the other half of my work, to enable "multi-size THP" (large folios) for
> anonymous memory, makes contpte sized folios prevalent for anonymous memory too
> [3].
>

Hi Ryan,

Using a couple of Armv8 systems, I've tested this patchset. Details are in my
reply to the mTHP patchset [1].

So for this patchset, please feel free to add:

Tested-by: John Hubbard <[email protected]>

[1] https://lore.kernel.org/all/[email protected]/

thanks,
--
John Hubbard
NVIDIA

> Optimistically, I would really like to get this series merged for v6.8; there is
> a chance that the multi-size THP series will also get merged for that version
> (although at this point pretty small). But even if it doesn't, this series still
> benefits file-backed memory from the file systems that support large folios so
> shouldn't be held up for it. Additionally I've got data that shows this series
> adds no regression when the system has no appropriate large folios.
>
> All dependecies listed against v1 are now resolved; This series applies cleanly
> against v6.7-rc1.
>
> Note that the first two patchs are for core-mm and provides the refactoring to
> make some crucial optimizations possible - which are then implemented in patches
> 14 and 15. The remaining patches are arm64-specific.
>
> Testing
> =======
>
> I've tested this series together with multi-size THP [3] on both Ampere Altra
> (bare metal) and Apple M2 (VM):
> - mm selftests (inc new tests written for multi-size THP); no regressions
> - Speedometer Java script benchmark in Chromium web browser; no issues
> - Kernel compilation; no issues
> - Various tests under high memory pressure with swap enabled; no issues
>
>
> Performance
> ===========
>
> John Hubbard at Nvidia has indicated dramatic 10x performance improvements for
> some workloads at [4], when using 64K base page kernel.
>
> You can also see the original performance results I posted against v1 [1] which
> are still valid.
>
> I've additionally run the kernel compilation and speedometer benchmarks on a
> system with multi-size THP disabled and large folio support for file-backed
> memory intentionally disabled; I see no change in performance in this case (i.e.
> no regression when this change is "present but not useful").
>
>
> Changes since v2 [2]
> ====================
>
> - Removed contpte_ptep_get_and_clear_full() optimisation for exit() (v2#14),
> and replaced with a batch-clearing approach using a new arch helper,
> clear_ptes() (v3#2 and v3#15) (Alistair and Barry)
> - (v2#1 / v3#1)
> - Fixed folio refcounting so that refcount >= mapcount always (DavidH)
> - Reworked batch demarcation to avoid pte_pgprot() (DavidH)
> - Reverted return semantic of copy_present_page() and instead fix it up in
> copy_present_ptes() (Alistair)
> - Removed page_cont_mapped_vaddr() and replaced with simpler logic
> (Alistair)
> - Made batch accounting clearer in copy_pte_range() (Alistair)
> - (v2#12 / v3#13)
> - Renamed contpte_fold() -> contpte_convert() and hoisted setting/
> clearing CONT_PTE bit to higher level (Alistair)
>
>
> Changes since v1 [1]
> ====================
>
> - Export contpte_* symbols so that modules can continue to call inline
> functions (e.g. ptep_get) which may now call the contpte_* functions (thanks
> to JohnH)
> - Use pte_valid() instead of pte_present() where sensible (thanks to Catalin)
> - Factor out (pte_valid() && pte_cont()) into new pte_valid_cont() helper
> (thanks to Catalin)
> - Fixed bug in contpte_ptep_set_access_flags() where TLBIs were missed (thanks
> to Catalin)
> - Added ARM64_CONTPTE expert Kconfig (enabled by default) (thanks to Anshuman)
> - Simplified contpte_ptep_get_and_clear_full()
> - Improved various code comments
>
>
> [1] https://lore.kernel.org/linux-arm-kernel/[email protected]/
> [2] https://lore.kernel.org/linux-arm-kernel/[email protected]/
> [3] https://lore.kernel.org/linux-arm-kernel/[email protected]/
> [4] https://lore.kernel.org/linux-mm/[email protected]/
>
>
> Thanks,
> Ryan
>
> Ryan Roberts (15):
> mm: Batch-copy PTE ranges during fork()
> mm: Batch-clear PTE ranges during zap_pte_range()
> arm64/mm: set_pte(): New layer to manage contig bit
> arm64/mm: set_ptes()/set_pte_at(): New layer to manage contig bit
> arm64/mm: pte_clear(): New layer to manage contig bit
> arm64/mm: ptep_get_and_clear(): New layer to manage contig bit
> arm64/mm: ptep_test_and_clear_young(): New layer to manage contig bit
> arm64/mm: ptep_clear_flush_young(): New layer to manage contig bit
> arm64/mm: ptep_set_wrprotect(): New layer to manage contig bit
> arm64/mm: ptep_set_access_flags(): New layer to manage contig bit
> arm64/mm: ptep_get(): New layer to manage contig bit
> arm64/mm: Split __flush_tlb_range() to elide trailing DSB
> arm64/mm: Wire up PTE_CONT for user mappings
> arm64/mm: Implement ptep_set_wrprotects() to optimize fork()
> arm64/mm: Implement clear_ptes() to optimize exit()
>
> arch/arm64/Kconfig | 10 +-
> arch/arm64/include/asm/pgtable.h | 343 ++++++++++++++++++++---
> arch/arm64/include/asm/tlbflush.h | 13 +-
> arch/arm64/kernel/efi.c | 4 +-
> arch/arm64/kernel/mte.c | 2 +-
> arch/arm64/kvm/guest.c | 2 +-
> arch/arm64/mm/Makefile | 1 +
> arch/arm64/mm/contpte.c | 436 ++++++++++++++++++++++++++++++
> arch/arm64/mm/fault.c | 12 +-
> arch/arm64/mm/fixmap.c | 4 +-
> arch/arm64/mm/hugetlbpage.c | 40 +--
> arch/arm64/mm/kasan_init.c | 6 +-
> arch/arm64/mm/mmu.c | 16 +-
> arch/arm64/mm/pageattr.c | 6 +-
> arch/arm64/mm/trans_pgd.c | 6 +-
> include/asm-generic/tlb.h | 9 +
> include/linux/pgtable.h | 39 +++
> mm/memory.c | 258 +++++++++++++-----
> mm/mmu_gather.c | 14 +
> 19 files changed, 1067 insertions(+), 154 deletions(-)
> create mode 100644 arch/arm64/mm/contpte.c
>
> --
> 2.25.1
>

2023-12-05 11:31:35

[permalink] [raw]

Subject: Re: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

On 04/12/2023 17:27, David Hildenbrand wrote:
>>
>> With rmap batching from [1] -- rebased+changed on top of that -- we could turn
>> that into an effective (untested):
>>
>>           if (page && folio_test_anon(folio)) {
>> +               nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
>> +                                               pte, enforce_uffd_wp, &nr_dirty,
>> +                                               &nr_writable);
>>                   /*
>>                    * If this page may have been pinned by the parent process,
>>                    * copy the page immediately for the child so that we'll always
>>                    * guarantee the pinned page won't be randomly replaced in the
>>                    * future.
>>                    */
>> -               folio_get(folio);
>> -               if (unlikely(folio_try_dup_anon_rmap_pte(folio, page,
>> src_vma))) {
>> +               folio_ref_add(folio, nr);
>> +               if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr,
>> src_vma))) {
>>                           /* Page may be pinned, we have to copy. */
>> -                       folio_put(folio);
>> -                       return copy_present_page(dst_vma, src_vma, dst_pte,
>> src_pte,
>> -                                                addr, rss, prealloc, page);
>> +                       folio_ref_sub(folio, nr);
>> +                       ret = copy_present_page(dst_vma, src_vma, dst_pte,
>> +                                               src_pte, addr, rss, prealloc,
>> +                                               page);
>> +                       return ret == 0 ? 1 : ret;
>>                   }
>> -               rss[MM_ANONPAGES]++;
>> +               rss[MM_ANONPAGES] += nr;
>>           } else if (page) {
>> -               folio_get(folio);
>> -               folio_dup_file_rmap_pte(folio, page);
>> -               rss[mm_counter_file(page)]++;
>> +               nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
>> +                                               pte, enforce_uffd_wp, &nr_dirty,
>> +                                               &nr_writable);
>> +               folio_ref_add(folio, nr);
>> +               folio_dup_file_rmap_ptes(folio, page, nr);
>> +               rss[mm_counter_file(page)] += nr;
>>           }
>>
>>
>> We'll have to test performance, but it could be that we want to specialize
>> more on !folio_test_large(). That code is very performance-sensitive.
>>
>>
>> [1] https://lkml.kernel.org/r/[email protected]
>
> So, on top of [1] without rmap batching but with a slightly modified version of

Can you clarify what you mean by "without rmap batching"? I thought [1]
implicitly adds rmap batching? (e.g. folio_dup_file_rmap_ptes(), which you've
added in the code snippet above).

> yours (that keeps the existing code structure as pointed out and e.g., updates
> counter updates), running my fork() microbenchmark with a 1 GiB of memory:
>
> Compared to [1], with all order-0 pages it gets 13--14% _slower_ and with all
> PTE-mapped THP (order-9) it gets ~29--30% _faster_.

What test are you running - I'd like to reproduce if possible, since it sounds
like I've got some work to do to remove the order-0 regression.

>
> So looks like we really want to have a completely seprate code path for
> "!folio_test_large()" to keep that case as fast as possible. And "Likely" we
> want to use "likely(!folio_test_large()". ;)

Yuk, but fair enough. If I can repro the perf numbers, I'll have a go a
reworking this.

I think you're also implicitly suggesting that this change needs to depend on
[1]? Which is a shame...

I guess I should also go through a similar exercise for patch 2 in this series.

>
> Performing rmap batching on top of that code only slightly (another 1% or so)
> improves performance in the PTE-mapped THP (order-9) case right now, in contrast
> to other rmap batching. Reason is as all rmap code gets inlined here and we're
> only doing subpage mapcount updates + PAE handling.
>

2023-12-05 12:04:37

[permalink] [raw]

Subject: Re: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

On 05.12.23 12:30, Ryan Roberts wrote:
> On 04/12/2023 17:27, David Hildenbrand wrote:
>>>
>>> With rmap batching from [1] -- rebased+changed on top of that -- we could turn
>>> that into an effective (untested):
>>>
>>>           if (page && folio_test_anon(folio)) {
>>> +               nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
>>> +                                               pte, enforce_uffd_wp, &nr_dirty,
>>> +                                               &nr_writable);
>>>                   /*
>>>                    * If this page may have been pinned by the parent process,
>>>                    * copy the page immediately for the child so that we'll always
>>>                    * guarantee the pinned page won't be randomly replaced in the
>>>                    * future.
>>>                    */
>>> -               folio_get(folio);
>>> -               if (unlikely(folio_try_dup_anon_rmap_pte(folio, page,
>>> src_vma))) {
>>> +               folio_ref_add(folio, nr);
>>> +               if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr,
>>> src_vma))) {
>>>                           /* Page may be pinned, we have to copy. */
>>> -                       folio_put(folio);
>>> -                       return copy_present_page(dst_vma, src_vma, dst_pte,
>>> src_pte,
>>> -                                                addr, rss, prealloc, page);
>>> +                       folio_ref_sub(folio, nr);
>>> +                       ret = copy_present_page(dst_vma, src_vma, dst_pte,
>>> +                                               src_pte, addr, rss, prealloc,
>>> +                                               page);
>>> +                       return ret == 0 ? 1 : ret;
>>>                   }
>>> -               rss[MM_ANONPAGES]++;
>>> +               rss[MM_ANONPAGES] += nr;
>>>           } else if (page) {
>>> -               folio_get(folio);
>>> -               folio_dup_file_rmap_pte(folio, page);
>>> -               rss[mm_counter_file(page)]++;
>>> +               nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
>>> +                                               pte, enforce_uffd_wp, &nr_dirty,
>>> +                                               &nr_writable);
>>> +               folio_ref_add(folio, nr);
>>> +               folio_dup_file_rmap_ptes(folio, page, nr);
>>> +               rss[mm_counter_file(page)] += nr;
>>>           }
>>>
>>>
>>> We'll have to test performance, but it could be that we want to specialize
>>> more on !folio_test_large(). That code is very performance-sensitive.
>>>
>>>
>>> [1] https://lkml.kernel.org/r/[email protected]
>>
>> So, on top of [1] without rmap batching but with a slightly modified version of
>
> Can you clarify what you mean by "without rmap batching"? I thought [1]
> implicitly adds rmap batching? (e.g. folio_dup_file_rmap_ptes(), which you've
> added in the code snippet above).

Not calling the batched variants but essentially doing what your code
does (with some minor improvements, like updating the rss counters only
once).

The snipped above is only linked below. I had the performance numbers
for [1] ready, so I gave it a test on top of that.

To keep it simple, you might just benchmark w and w/o your patches.

>
>> yours (that keeps the existing code structure as pointed out and e.g., updates
>> counter updates), running my fork() microbenchmark with a 1 GiB of memory:
>>
>> Compared to [1], with all order-0 pages it gets 13--14% _slower_ and with all
>> PTE-mapped THP (order-9) it gets ~29--30% _faster_.
>
> What test are you running - I'd like to reproduce if possible, since it sounds
> like I've got some work to do to remove the order-0 regression.

Essentially just allocating 1 GiB of memory an measuring how long it
takes to call fork().

order-0 benchmarks:

https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/order-0-benchmarks.c?ref_type=heads

e.g.,: $ ./order-0-benchmarks fork 100

pte-mapped-thp benchmarks:

https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/pte-mapped-thp-benchmarks.c?ref_type=heads

e.g.,: $ ./pte-mapped-thp-benchmarks fork 100

Ideally, pin to one CPU and get stable performance numbers by disabling
SMT+turbo etc.

>
>>
>> So looks like we really want to have a completely seprate code path for
>> "!folio_test_large()" to keep that case as fast as possible. And "Likely" we
>> want to use "likely(!folio_test_large()". ;)
>
> Yuk, but fair enough. If I can repro the perf numbers, I'll have a go a
> reworking this.
>
> I think you're also implicitly suggesting that this change needs to depend on
> [1]? Which is a shame...

Not necessarily. It certainly cleans up the code, but we can do that in
any order reasonable.

>
> I guess I should also go through a similar exercise for patch 2 in this series.

Yes. There are "unmap" and "pte-dontneed" benchmarks contained in both
files above.

--
Cheers,

David / dhildenb

2023-12-05 14:17:29

[permalink] [raw]

Subject: Re: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

On 05/12/2023 12:04, David Hildenbrand wrote:
> On 05.12.23 12:30, Ryan Roberts wrote:
>> On 04/12/2023 17:27, David Hildenbrand wrote:
>>>>
>>>> With rmap batching from [1] -- rebased+changed on top of that -- we could turn
>>>> that into an effective (untested):
>>>>
>>>>            if (page && folio_test_anon(folio)) {
>>>> +               nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr,
>>>> end,
>>>> +                                               pte, enforce_uffd_wp,
>>>> &nr_dirty,
>>>> +                                               &nr_writable);
>>>>                    /*
>>>>                     * If this page may have been pinned by the parent process,
>>>>                     * copy the page immediately for the child so that we'll
>>>> always
>>>>                     * guarantee the pinned page won't be randomly replaced
>>>> in the
>>>>                     * future.
>>>>                     */
>>>> -               folio_get(folio);
>>>> -               if (unlikely(folio_try_dup_anon_rmap_pte(folio, page,
>>>> src_vma))) {
>>>> +               folio_ref_add(folio, nr);
>>>> +               if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, nr,
>>>> src_vma))) {
>>>>                            /* Page may be pinned, we have to copy. */
>>>> -                       folio_put(folio);
>>>> -                       return copy_present_page(dst_vma, src_vma, dst_pte,
>>>> src_pte,
>>>> -                                                addr, rss, prealloc, page);
>>>> +                       folio_ref_sub(folio, nr);
>>>> +                       ret = copy_present_page(dst_vma, src_vma, dst_pte,
>>>> +                                               src_pte, addr, rss, prealloc,
>>>> +                                               page);
>>>> +                       return ret == 0 ? 1 : ret;
>>>>                    }
>>>> -               rss[MM_ANONPAGES]++;
>>>> +               rss[MM_ANONPAGES] += nr;
>>>>            } else if (page) {
>>>> -               folio_get(folio);
>>>> -               folio_dup_file_rmap_pte(folio, page);
>>>> -               rss[mm_counter_file(page)]++;
>>>> +               nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr,
>>>> end,
>>>> +                                               pte, enforce_uffd_wp,
>>>> &nr_dirty,
>>>> +                                               &nr_writable);
>>>> +               folio_ref_add(folio, nr);
>>>> +               folio_dup_file_rmap_ptes(folio, page, nr);
>>>> +               rss[mm_counter_file(page)] += nr;
>>>>            }
>>>>
>>>>
>>>> We'll have to test performance, but it could be that we want to specialize
>>>> more on !folio_test_large(). That code is very performance-sensitive.
>>>>
>>>>
>>>> [1] https://lkml.kernel.org/r/[email protected]
>>>
>>> So, on top of [1] without rmap batching but with a slightly modified version of
>>
>> Can you clarify what you mean by "without rmap batching"? I thought [1]
>> implicitly adds rmap batching? (e.g. folio_dup_file_rmap_ptes(), which you've
>> added in the code snippet above).
>
> Not calling the batched variants but essentially doing what your code does (with
> some minor improvements, like updating the rss counters only once).
>
> The snipped above is only linked below. I had the performance numbers for [1]
> ready, so I gave it a test on top of that.
>
> To keep it simple, you might just benchmark w and w/o your patches.
>
>>
>>> yours (that keeps the existing code structure as pointed out and e.g., updates
>>> counter updates), running my fork() microbenchmark with a 1 GiB of memory:
>>>
>>> Compared to [1], with all order-0 pages it gets 13--14% _slower_ and with all
>>> PTE-mapped THP (order-9) it gets ~29--30% _faster_.
>>
>> What test are you running - I'd like to reproduce if possible, since it sounds
>> like I've got some work to do to remove the order-0 regression.
>
> Essentially just allocating 1 GiB of memory an measuring how long it takes to
> call fork().
>
> order-0 benchmarks:
>
> https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/order-0-benchmarks.c?ref_type=heads
>
> e.g.,: $ ./order-0-benchmarks fork 100
>
>
> pte-mapped-thp benchmarks:
>
> https://gitlab.com/davidhildenbrand/scratchspace/-/raw/main/pte-mapped-thp-benchmarks.c?ref_type=heads
>
> e.g.,: $ ./pte-mapped-thp-benchmarks fork 100
>
>
> Ideally, pin to one CPU and get stable performance numbers by disabling
> SMT+turbo etc.

This is great - thanks! I'll get to work...

>
>>
>>>
>>> So looks like we really want to have a completely seprate code path for
>>> "!folio_test_large()" to keep that case as fast as possible. And "Likely" we
>>> want to use "likely(!folio_test_large()". ;)
>>
>> Yuk, but fair enough. If I can repro the perf numbers, I'll have a go a
>> reworking this.
>>
>> I think you're also implicitly suggesting that this change needs to depend on
>> [1]? Which is a shame...
>
> Not necessarily. It certainly cleans up the code, but we can do that in any
> order reasonable.
>
>>
>> I guess I should also go through a similar exercise for patch 2 in this series.
>
>
> Yes. There are "unmap" and "pte-dontneed" benchmarks contained in both files above.
>

2023-12-08 00:38:34

[permalink] [raw]

Subject: Re: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

Ryan Roberts <[email protected]> writes:

<snip>

> /*
> * On some architectures hardware does not set page access bit when accessing
> * memory page, it is responsibility of software setting this bit. It brings
> diff --git a/mm/memory.c b/mm/memory.c
> index 1f18ed4a5497..8a87a488950c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -924,68 +924,162 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
> return 0;
> }
>
> +static int folio_nr_pages_cont_mapped(struct folio *folio,
> + struct page *page, pte_t *pte,
> + unsigned long addr, unsigned long end,
> + pte_t ptent, bool enforce_uffd_wp,
> + int *dirty_nr, int *writable_nr)
> +{
> + int floops;
> + int i;
> + unsigned long pfn;
> + bool prot_none;
> + bool uffd_wp;
> +
> + if (!folio_test_large(folio))
> + return 1;
> +
> + /*
> + * Loop either to `end` or to end of folio if its contiguously mapped,
> + * whichever is smaller.
> + */
> + floops = (end - addr) >> PAGE_SHIFT;
> + floops = min_t(int, floops,
> + folio_pfn(folio_next(folio)) - page_to_pfn(page));

Much better, thanks for addressing my comments here.

> +
> + pfn = page_to_pfn(page);
> + prot_none = pte_protnone(ptent);
> + uffd_wp = pte_uffd_wp(ptent);
> +
> + *dirty_nr = !!pte_dirty(ptent);
> + *writable_nr = !!pte_write(ptent);
> +
> + pfn++;
> + pte++;
> +
> + for (i = 1; i < floops; i++) {
> + ptent = ptep_get(pte);
> +
> + if (!pte_present(ptent) || pte_pfn(ptent) != pfn ||
> + prot_none != pte_protnone(ptent) ||
> + (enforce_uffd_wp && uffd_wp != pte_uffd_wp(ptent)))
> + break;
> +
> + if (pte_dirty(ptent))
> + (*dirty_nr)++;
> + if (pte_write(ptent))
> + (*writable_nr)++;
> +
> + pfn++;
> + pte++;
> + }
> +
> + return i;
> +}
> +
> /*
> - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
> - * is required to copy this pte.
> + * Copy set of contiguous ptes. Returns number of ptes copied if succeeded
> + * (always gte 1), or -EAGAIN if one preallocated page is required to copy the
> + * first pte.
> */
> static inline int
> -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> - pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
> - struct folio **prealloc)
> +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> + pte_t *dst_pte, pte_t *src_pte,
> + unsigned long addr, unsigned long end,
> + int *rss, struct folio **prealloc)
> {
> struct mm_struct *src_mm = src_vma->vm_mm;
> unsigned long vm_flags = src_vma->vm_flags;
> pte_t pte = ptep_get(src_pte);
> struct page *page;
> struct folio *folio;
> + int nr = 1;
> + bool anon = false;
> + bool enforce_uffd_wp = userfaultfd_wp(dst_vma);
> + int nr_dirty = !!pte_dirty(pte);
> + int nr_writable = !!pte_write(pte);
> + int i, ret;
>
> page = vm_normal_page(src_vma, addr, pte);
> - if (page)
> + if (page) {
> folio = page_folio(page);
> - if (page && folio_test_anon(folio)) {
> - /*
> - * If this page may have been pinned by the parent process,
> - * copy the page immediately for the child so that we'll always
> - * guarantee the pinned page won't be randomly replaced in the
> - * future.
> - */
> - folio_get(folio);
> - if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
> - /* Page may be pinned, we have to copy. */
> - folio_put(folio);
> - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
> - addr, rss, prealloc, page);
> + anon = folio_test_anon(folio);
> + nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
> + pte, enforce_uffd_wp, &nr_dirty,
> + &nr_writable);
> + folio_ref_add(folio, nr);
> +
> + for (i = 0; i < nr; i++, page++) {
> + if (anon) {
> + /*
> + * If this page may have been pinned by the
> + * parent process, copy the page immediately for
> + * the child so that we'll always guarantee the
> + * pinned page won't be randomly replaced in the
> + * future.
> + */
> + if (unlikely(page_try_dup_anon_rmap(
> + page, false, src_vma))) {
> + if (i != 0)
> + break;
> + /* Page may be pinned, we have to copy. */
> + folio_ref_sub(folio, nr);
> + ret = copy_present_page(
> + dst_vma, src_vma, dst_pte,
> + src_pte, addr, rss, prealloc,
> + page);
> + return ret == 0 ? 1 : ret;
> + }
> + rss[MM_ANONPAGES]++;
> + VM_BUG_ON(PageAnonExclusive(page));
> + } else {
> + page_dup_file_rmap(page, false);
> + rss[mm_counter_file(page)]++;
> + }
> }
> - rss[MM_ANONPAGES]++;
> - } else if (page) {
> - folio_get(folio);
> - page_dup_file_rmap(page, false);
> - rss[mm_counter_file(page)]++;
> - }
>
> - /*
> - * If it's a COW mapping, write protect it both
> - * in the parent and the child
> - */
> - if (is_cow_mapping(vm_flags) && pte_write(pte)) {
> - ptep_set_wrprotect(src_mm, addr, src_pte);
> - pte = pte_wrprotect(pte);
> + if (i < nr) {
> + folio_ref_sub(folio, nr - i);
> + nr = i;
> + }
> }
> - VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
>
> /*
> - * If it's a shared mapping, mark it clean in
> - * the child
> + * If it's a shared mapping, mark it clean and write protected in the
> + * child, and rely on a write fault to fix up the permissions. This
> + * allows determining batch size without having to consider RO/RW
> + * permissions. As an optimization, skip wrprotect if all ptes in the
> + * batch have the same permissions.
> + *
> + * If its a private (CoW) mapping, mark it dirty in the child if _any_
> + * of the parent mappings in the block were marked dirty. The contiguous
> + * block of mappings are all backed by the same folio, so if any are
> + * dirty then the whole folio is dirty. This allows determining batch
> + * size without having to consider the dirty bit. Further, write protect
> + * it both in the parent and the child so that a future write will cause
> + * a CoW. As as an optimization, skip the wrprotect if all the ptes in
> + * the batch are already readonly.
> */
> - if (vm_flags & VM_SHARED)
> + if (vm_flags & VM_SHARED) {
> pte = pte_mkclean(pte);
> - pte = pte_mkold(pte);
> + if (nr_writable > 0 && nr_writable < nr)
> + pte = pte_wrprotect(pte);
> + } else {
> + if (nr_dirty)
> + pte = pte_mkdirty(pte);
> + if (nr_writable) {
> + ptep_set_wrprotects(src_mm, addr, src_pte, nr);
> + pte = pte_wrprotect(pte);
> + }
> + }
>
> - if (!userfaultfd_wp(dst_vma))
> + pte = pte_mkold(pte);
> + pte = pte_clear_soft_dirty(pte);
> + if (!enforce_uffd_wp)
> pte = pte_clear_uffd_wp(pte);
>
> - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
> - return 0;
> + set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
> + return nr;

I don't have any further comments and you have addressed my previous
ones so feel free to add:

Reviewed-by: Alistair Popple <[email protected]>

However whilst I think the above CoW sequence looks correct it would be
nice if someone else could take a look as well.

> }
>
> static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
> @@ -1021,6 +1115,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> int rss[NR_MM_COUNTERS];
> swp_entry_t entry = (swp_entry_t){0};
> struct folio *prealloc = NULL;
> + int nr_ptes;
>
> again:
> progress = 0;
> @@ -1051,6 +1146,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> arch_enter_lazy_mmu_mode();
>
> do {
> + nr_ptes = 1;
> +
> /*
> * We are holding two locks at this point - either of them
> * could generate latencies in another task on another CPU.
> @@ -1086,16 +1183,21 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> * the now present pte.
> */
> WARN_ON_ONCE(ret != -ENOENT);
> + ret = 0;
> }
> - /* copy_present_pte() will clear `*prealloc' if consumed */
> - ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
> - addr, rss, &prealloc);
> + /* copy_present_ptes() will clear `*prealloc' if consumed */
> + nr_ptes = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
> + addr, end, rss, &prealloc);
> +
> /*
> * If we need a pre-allocated page for this pte, drop the
> * locks, allocate, and try again.
> */
> - if (unlikely(ret == -EAGAIN))
> + if (unlikely(nr_ptes == -EAGAIN)) {
> + ret = -EAGAIN;
> break;
> + }
> +
> if (unlikely(prealloc)) {
> /*
> * pre-alloc page cannot be reused by next time so as
> @@ -1106,8 +1208,9 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
> folio_put(prealloc);
> prealloc = NULL;
> }
> - progress += 8;
> - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
> + progress += 8 * nr_ptes;
> + } while (dst_pte += nr_ptes, src_pte += nr_ptes,
> + addr += PAGE_SIZE * nr_ptes, addr != end);
>
> arch_leave_lazy_mmu_mode();
> pte_unmap_unlock(orig_src_pte, src_ptl);

2023-12-08 01:31:21

[permalink] [raw]

Subject: Re: [PATCH v3 02/15] mm: Batch-clear PTE ranges during zap_pte_range()

Ryan Roberts <[email protected]> writes:

> Convert zap_pte_range() to clear a set of ptes in a batch. A given batch
> maps a physically contiguous block of memory, all belonging to the same
> folio. This will likely improve performance by a tiny amount due to
> removing duplicate calls to mark the folio dirty and accessed. And also
> provides us with a future opportunity to batch the rmap removal.
>
> However, the primary motivation for this change is to reduce the number
> of tlb maintenance operations that the arm64 backend has to perform
> during exit and other syscalls that cause zap_pte_range() (e.g. munmap,
> madvise(DONTNEED), etc.), as it is about to add transparent support for
> the "contiguous bit" in its ptes. By clearing ptes using the new
> clear_ptes() API, the backend doesn't have to perform an expensive
> unfold operation when a PTE being cleared is part of a contpte block.
> Instead it can just clear the whole block immediately.
>
> This change addresses the core-mm refactoring only, and introduces
> clear_ptes() with a default implementation that calls
> ptep_get_and_clear_full() for each pte in the range. Note that this API
> returns the pte at the beginning of the batch, but with the dirty and
> young bits set if ANY of the ptes in the cleared batch had those bits
> set; this information is applied to the folio by the core-mm. Given the
> batch is garranteed to cover only a single folio, collapsing this state

Nit: s/garranteed/guaranteed/

> does not lose any useful information.
>
> A separate change will implement clear_ptes() in the arm64 backend to
> realize the performance improvement as part of the work to enable
> contpte mappings.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> include/asm-generic/tlb.h | 9 ++++++
> include/linux/pgtable.h | 26 ++++++++++++++++
> mm/memory.c | 63 ++++++++++++++++++++++++++-------------
> mm/mmu_gather.c | 14 +++++++++
> 4 files changed, 92 insertions(+), 20 deletions(-)

<snip>

> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index 4f559f4ddd21..57b4d5f0dfa4 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -47,6 +47,20 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
> return true;
> }
>
> +unsigned int tlb_get_guaranteed_space(struct mmu_gather *tlb)
> +{
> + struct mmu_gather_batch *batch = tlb->active;
> + unsigned int nr_next = 0;
> +
> + /* Allocate next batch so we can guarrantee at least one batch. */
> + if (tlb_next_batch(tlb)) {
> + tlb->active = batch;

Rather than calling tlb_next_batch(tlb) and then undoing some of what it
does I think it would be clearer to factor out the allocation part of
tlb_next_batch(tlb) into a separate function (eg. tlb_alloc_batch) that
you can call from both here and tlb_next_batch().

Otherwise I think this overall direction looks better than trying to
play funny games in the arch layer as it's much clearer what's going on
to core-mm code.

- Alistair

> + nr_next = batch->next->max;
> + }
> +
> + return batch->max - batch->nr + nr_next;
> +}
> +
> #ifdef CONFIG_SMP
> static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
> {

2023-12-08 01:41:03

[permalink] [raw]

Subject: Re: [PATCH v3 14/15] arm64/mm: Implement ptep_set_wrprotects() to optimize fork()

Ryan Roberts <[email protected]> writes:

> With the core-mm changes in place to batch-copy ptes during fork, we can
> take advantage of this in arm64 to greatly reduce the number of tlbis we
> have to issue, and recover the lost fork performance incured when adding
> support for transparent contiguous ptes.
>
> If we are write-protecting a whole contig range, we can apply the
> write-protection to the whole range and know that it won't change
> whether the range should have the contiguous bit set or not. For ranges
> smaller than the contig range, we will still have to unfold, apply the
> write-protection, then fold if the change now means the range is
> foldable.
>
> This optimization is possible thanks to the tightening of the Arm ARM in
> respect to the definition and behaviour when 'Misprogramming the
> Contiguous bit'. See section D21194 at
> https://developer.arm.com/documentation/102105/latest/
>
> Performance tested with the following test written for the will-it-scale
> framework:
>
> -------
>
> char *testcase_description = "fork and exit";
>
> void testcase(unsigned long long *iterations, unsigned long nr)
> {
> int pid;
> char *mem;
>
> mem = malloc(SZ_128M);
> assert(mem);
> memset(mem, 1, SZ_128M);
>
> while (1) {
> pid = fork();
> assert(pid >= 0);
>
> if (!pid)
> exit(0);
>
> waitpid(pid, NULL, 0);
>
> (*iterations)++;
> }
> }
>
> -------
>
> I see huge performance regression when PTE_CONT support was added, then
> the regression is mostly fixed with the addition of this change. The
> following shows regression relative to before PTE_CONT was enabled
> (bigger negative value is bigger regression):
>
> | cpus | before opt | after opt |
> |-------:|-------------:|------------:|
> | 1 | -10.4% | -5.2% |
> | 8 | -15.4% | -3.5% |
> | 16 | -38.7% | -3.7% |
> | 24 | -57.0% | -4.4% |
> | 32 | -65.8% | -5.4% |
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> arch/arm64/include/asm/pgtable.h | 30 ++++++++++++++++++++---
> arch/arm64/mm/contpte.c | 42 ++++++++++++++++++++++++++++++++
> 2 files changed, 69 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 15bc9cf1eef4..9bd2f57a9e11 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -984,6 +984,16 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm,
> } while (pte_val(pte) != pte_val(old_pte));
> }
>
> +static inline void __ptep_set_wrprotects(struct mm_struct *mm,
> + unsigned long address, pte_t *ptep,
> + unsigned int nr)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
> + __ptep_set_wrprotect(mm, address, ptep);
> +}
> +
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> #define __HAVE_ARCH_PMDP_SET_WRPROTECT
> static inline void pmdp_set_wrprotect(struct mm_struct *mm,
> @@ -1139,6 +1149,8 @@ extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep);
> extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep);
> +extern void contpte_set_wrprotects(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, unsigned int nr);
> extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep,
> pte_t entry, int dirty);
> @@ -1290,13 +1302,25 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
> return contpte_ptep_clear_flush_young(vma, addr, ptep);
> }
>
> +#define ptep_set_wrprotects ptep_set_wrprotects
> +static inline void ptep_set_wrprotects(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, unsigned int nr)
> +{
> + if (!contpte_is_enabled(mm))
> + __ptep_set_wrprotects(mm, addr, ptep, nr);
> + else if (nr == 1) {

Why do we need the special case here? Couldn't we just call
contpte_set_wrprotects() with nr == 1?

> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
> + __ptep_set_wrprotects(mm, addr, ptep, 1);
> + contpte_try_fold(mm, addr, ptep, __ptep_get(ptep));
> + } else
> + contpte_set_wrprotects(mm, addr, ptep, nr);
> +}
> +
> #define __HAVE_ARCH_PTEP_SET_WRPROTECT
> static inline void ptep_set_wrprotect(struct mm_struct *mm,
> unsigned long addr, pte_t *ptep)
> {
> - contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
> - __ptep_set_wrprotect(mm, addr, ptep);
> - contpte_try_fold(mm, addr, ptep, __ptep_get(ptep));
> + ptep_set_wrprotects(mm, addr, ptep, 1);
> }
>
> #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
> diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
> index e079ec61d7d1..2a57df16bf58 100644
> --- a/arch/arm64/mm/contpte.c
> +++ b/arch/arm64/mm/contpte.c
> @@ -303,6 +303,48 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
> }
> EXPORT_SYMBOL(contpte_ptep_clear_flush_young);
>
> +void contpte_set_wrprotects(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, unsigned int nr)
> +{
> + unsigned long next;
> + unsigned long end = addr + (nr << PAGE_SHIFT);
> +
> + do {
> + next = pte_cont_addr_end(addr, end);
> + nr = (next - addr) >> PAGE_SHIFT;
> +
> + /*
> + * If wrprotecting an entire contig range, we can avoid
> + * unfolding. Just set wrprotect and wait for the later
> + * mmu_gather flush to invalidate the tlb. Until the flush, the
> + * page may or may not be wrprotected. After the flush, it is
> + * guarranteed wrprotected. If its a partial range though, we
> + * must unfold, because we can't have a case where CONT_PTE is
> + * set but wrprotect applies to a subset of the PTEs; this would
> + * cause it to continue to be unpredictable after the flush.
> + */
> + if (nr != CONT_PTES)
> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
> +
> + __ptep_set_wrprotects(mm, addr, ptep, nr);
> +
> + addr = next;
> + ptep += nr;
> +
> + /*
> + * If applying to a partial contig range, the change could have
> + * made the range foldable. Use the last pte in the range we
> + * just set for comparison, since contpte_try_fold() only
> + * triggers when acting on the last pte in the contig range.
> + */
> + if (nr != CONT_PTES)
> + contpte_try_fold(mm, addr - PAGE_SIZE, ptep - 1,
> + __ptep_get(ptep - 1));
> +
> + } while (addr != end);
> +}
> +EXPORT_SYMBOL(contpte_set_wrprotects);
> +
> int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep,
> pte_t entry, int dirty)

2023-12-08 01:46:18

[permalink] [raw]

Subject: Re: [PATCH v3 15/15] arm64/mm: Implement clear_ptes() to optimize exit()

Ryan Roberts <[email protected]> writes:

> With the core-mm changes in place to batch-clear ptes during
> zap_pte_range(), we can take advantage of this in arm64 to greatly
> reduce the number of tlbis we have to issue, and recover the lost exit
> performance incured when adding support for transparent contiguous ptes.
>
> If we are clearing a whole contpte range, we can elide first unfolding
> that range and save the tlbis. We just clear the whole range.
>
> The following shows the results of running a kernel compilation workload
> and measuring the cost of arm64_sys_exit_group() (which at ~1.5% is a
> very small part of the overall workload).
>
> Benchmarks were run on Ampere Altra in 2 configs; single numa node and 2
> numa nodes (tlbis are more expensive in 2 node config).
>
> - baseline: v6.7-rc1 + anonfolio-v7
> - no-opt: contpte series without any attempt to optimize exit()
> - simple-ptep_get_clear_full: simple optimization to exploit full=1.
> ptep_get_clear_full() does not fully conform to its intended semantic
> - robust-ptep_get_clear_full: similar to previous but
> ptep_get_clear_full() fully conforms to its intended semantic
> - clear_ptes: optimization implemented by this patch
>
> | config | numa=1 | numa=2 |
> |----------------------------|--------|--------|
> | baseline | 0% | 0% |
> | no-opt | 190% | 768% |
> | simple-ptep_get_clear_full | 8% | 29% |
> | robust-ptep_get_clear_full | 21% | 19% |
> | clear_ptes | 13% | 9% |
>
> In all cases, the cost of arm64_sys_exit_group() increases; this is
> anticipated because there is more work to do to tear down the page
> tables. But clear_ptes() gives the smallest increase overall.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> arch/arm64/include/asm/pgtable.h | 32 ++++++++++++++++++++++++
> arch/arm64/mm/contpte.c | 42 ++++++++++++++++++++++++++++++++
> 2 files changed, 74 insertions(+)
>
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 9bd2f57a9e11..ff6b3cc9e819 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -1145,6 +1145,8 @@ extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte);
> extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep);
> extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
> pte_t *ptep, pte_t pte, unsigned int nr);
> +extern pte_t contpte_clear_ptes(struct mm_struct *mm, unsigned long addr,
> + pte_t *ptep, unsigned int nr);
> extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep);
> extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
> @@ -1270,6 +1272,36 @@ static inline void pte_clear(struct mm_struct *mm,
> __pte_clear(mm, addr, ptep);
> }
>
> +#define clear_ptes clear_ptes
> +static inline pte_t clear_ptes(struct mm_struct *mm,
> + unsigned long addr, pte_t *ptep, int full,
> + unsigned int nr)
> +{
> + pte_t pte;
> +
> + if (!contpte_is_enabled(mm)) {

I think it would be better to call the generic definition of
clear_ptes() here. Obviously that won't exist if clear_ptes is defined
here, but you could alcall it __clear_ptes() and #define clear_ptes
__clear_ptes when the arch specific helper isn't defined.

> + unsigned int i;
> + pte_t tail;
> +
> + pte = __ptep_get_and_clear(mm, addr, ptep);
> + for (i = 1; i < nr; i++) {
> + addr += PAGE_SIZE;
> + ptep++;
> + tail = __ptep_get_and_clear(mm, addr, ptep);
> + if (pte_dirty(tail))
> + pte = pte_mkdirty(pte);
> + if (pte_young(tail))
> + pte = pte_mkyoung(pte);
> + }
> + } else if (nr == 1) {
> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
> + pte = __ptep_get_and_clear(mm, addr, ptep);
> + } else
> + pte = contpte_clear_ptes(mm, addr, ptep, nr);
> +
> + return pte;
> +}
> +
> #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
> static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
> unsigned long addr, pte_t *ptep)
> diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
> index 2a57df16bf58..34b43bde3fcd 100644
> --- a/arch/arm64/mm/contpte.c
> +++ b/arch/arm64/mm/contpte.c
> @@ -257,6 +257,48 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
> }
> EXPORT_SYMBOL(contpte_set_ptes);
>
> +pte_t contpte_clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
> + unsigned int nr)
> +{
> + /*
> + * If we cover a partial contpte block at the beginning or end of the
> + * batch, unfold if currently folded. This makes it safe to clear some
> + * of the entries while keeping others. contpte blocks in the middle of
> + * the range, which are fully covered don't need to be unfolded because
> + * we will clear the full block.
> + */
> +
> + unsigned int i;
> + pte_t pte;
> + pte_t tail;
> +
> + if (ptep != contpte_align_down(ptep) || nr < CONT_PTES)
> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
> +
> + if (ptep + nr != contpte_align_down(ptep + nr))
> + contpte_try_unfold(mm, addr + PAGE_SIZE * (nr - 1),
> + ptep + nr - 1,
> + __ptep_get(ptep + nr - 1));
> +
> + pte = __ptep_get_and_clear(mm, addr, ptep);
> +
> + for (i = 1; i < nr; i++) {
> + addr += PAGE_SIZE;
> + ptep++;
> +
> + tail = __ptep_get_and_clear(mm, addr, ptep);
> +
> + if (pte_dirty(tail))
> + pte = pte_mkdirty(pte);
> +
> + if (pte_young(tail))
> + pte = pte_mkyoung(pte);
> + }
> +
> + return pte;
> +}
> +EXPORT_SYMBOL(contpte_clear_ptes);
> +
> int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
> unsigned long addr, pte_t *ptep)
> {

2023-12-12 11:35:36

by Will Deacon

[permalink] [raw]

Subject: Re: [PATCH v3 12/15] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

On Mon, Dec 04, 2023 at 10:54:37AM +0000, Ryan Roberts wrote:
> Split __flush_tlb_range() into __flush_tlb_range_nosync() +
> __flush_tlb_range(), in the same way as the existing flush_tlb_page()
> arrangement. This allows calling __flush_tlb_range_nosync() to elide the
> trailing DSB. Forthcoming "contpte" code will take advantage of this
> when clearing the young bit from a contiguous range of ptes.
>
> Signed-off-by: Ryan Roberts <[email protected]>
> ---
> arch/arm64/include/asm/tlbflush.h | 13 +++++++++++--
> 1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> index bb2c2833a987..925ef3bdf9ed 100644
> --- a/arch/arm64/include/asm/tlbflush.h
> +++ b/arch/arm64/include/asm/tlbflush.h
> @@ -399,7 +399,7 @@ do { \
> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
> __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false)
>
> -static inline void __flush_tlb_range(struct vm_area_struct *vma,
> +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
> unsigned long start, unsigned long end,
> unsigned long stride, bool last_level,
> int tlb_level)
> @@ -431,10 +431,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
> else
> __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true);
>
> - dsb(ish);
> mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
> }
>
> +static inline void __flush_tlb_range(struct vm_area_struct *vma,
> + unsigned long start, unsigned long end,
> + unsigned long stride, bool last_level,
> + int tlb_level)
> +{
> + __flush_tlb_range_nosync(vma, start, end, stride,
> + last_level, tlb_level);
> + dsb(ish);
> +}

Hmm, are you sure it's safe to defer the DSB until after the secondary TLB
invalidation? It will have a subtle effect on e.g. an SMMU participating
in broadcast TLB maintenance, because now the ATC will be invalidated
before completion of the TLB invalidation and it's not obviously safe to me.

Will

2023-12-12 11:47:21

[permalink] [raw]

Subject: Re: [PATCH v3 12/15] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

On 12/12/2023 11:35, Will Deacon wrote:
> On Mon, Dec 04, 2023 at 10:54:37AM +0000, Ryan Roberts wrote:
>> Split __flush_tlb_range() into __flush_tlb_range_nosync() +
>> __flush_tlb_range(), in the same way as the existing flush_tlb_page()
>> arrangement. This allows calling __flush_tlb_range_nosync() to elide the
>> trailing DSB. Forthcoming "contpte" code will take advantage of this
>> when clearing the young bit from a contiguous range of ptes.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> arch/arm64/include/asm/tlbflush.h | 13 +++++++++++--
>> 1 file changed, 11 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>> index bb2c2833a987..925ef3bdf9ed 100644
>> --- a/arch/arm64/include/asm/tlbflush.h
>> +++ b/arch/arm64/include/asm/tlbflush.h
>> @@ -399,7 +399,7 @@ do { \
>> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>> __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false)
>>
>> -static inline void __flush_tlb_range(struct vm_area_struct *vma,
>> +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>> unsigned long start, unsigned long end,
>> unsigned long stride, bool last_level,
>> int tlb_level)
>> @@ -431,10 +431,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>> else
>> __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true);
>>
>> - dsb(ish);
>> mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
>> }
>>
>> +static inline void __flush_tlb_range(struct vm_area_struct *vma,
>> + unsigned long start, unsigned long end,
>> + unsigned long stride, bool last_level,
>> + int tlb_level)
>> +{
>> + __flush_tlb_range_nosync(vma, start, end, stride,
>> + last_level, tlb_level);
>> + dsb(ish);
>> +}
>
> Hmm, are you sure it's safe to defer the DSB until after the secondary TLB
> invalidation? It will have a subtle effect on e.g. an SMMU participating
> in broadcast TLB maintenance, because now the ATC will be invalidated
> before completion of the TLB invalidation and it's not obviously safe to me.

I'll be honest; I don't know that it's safe. The notifier calls turned up during
a rebase and I stared at it for a while, before eventually concluding that I
should just follow the existing pattern in __flush_tlb_page_nosync(): That one
calls the mmu notifier without the dsb, then flush_tlb_page() does the dsb
after. So I assumed it was safe.

If you think it's not safe, I guess there is a bug to fix in
__flush_tlb_page_nosync()?

>
> Will

2023-12-12 11:52:14

[permalink] [raw]

Subject: Re: [PATCH v3 01/15] mm: Batch-copy PTE ranges during fork()

On 08/12/2023 00:32, Alistair Popple wrote:
>
> Ryan Roberts <[email protected]> writes:
>
> <snip>
>
>> /*
>> * On some architectures hardware does not set page access bit when accessing
>> * memory page, it is responsibility of software setting this bit. It brings
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 1f18ed4a5497..8a87a488950c 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -924,68 +924,162 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
>> return 0;
>> }
>>
>> +static int folio_nr_pages_cont_mapped(struct folio *folio,
>> + struct page *page, pte_t *pte,
>> + unsigned long addr, unsigned long end,
>> + pte_t ptent, bool enforce_uffd_wp,
>> + int *dirty_nr, int *writable_nr)
>> +{
>> + int floops;
>> + int i;
>> + unsigned long pfn;
>> + bool prot_none;
>> + bool uffd_wp;
>> +
>> + if (!folio_test_large(folio))
>> + return 1;
>> +
>> + /*
>> + * Loop either to `end` or to end of folio if its contiguously mapped,
>> + * whichever is smaller.
>> + */
>> + floops = (end - addr) >> PAGE_SHIFT;
>> + floops = min_t(int, floops,
>> + folio_pfn(folio_next(folio)) - page_to_pfn(page));
>
> Much better, thanks for addressing my comments here.
>
>> +
>> + pfn = page_to_pfn(page);
>> + prot_none = pte_protnone(ptent);
>> + uffd_wp = pte_uffd_wp(ptent);
>> +
>> + *dirty_nr = !!pte_dirty(ptent);
>> + *writable_nr = !!pte_write(ptent);
>> +
>> + pfn++;
>> + pte++;
>> +
>> + for (i = 1; i < floops; i++) {
>> + ptent = ptep_get(pte);
>> +
>> + if (!pte_present(ptent) || pte_pfn(ptent) != pfn ||
>> + prot_none != pte_protnone(ptent) ||
>> + (enforce_uffd_wp && uffd_wp != pte_uffd_wp(ptent)))
>> + break;
>> +
>> + if (pte_dirty(ptent))
>> + (*dirty_nr)++;
>> + if (pte_write(ptent))
>> + (*writable_nr)++;
>> +
>> + pfn++;
>> + pte++;
>> + }
>> +
>> + return i;
>> +}
>> +
>> /*
>> - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
>> - * is required to copy this pte.
>> + * Copy set of contiguous ptes. Returns number of ptes copied if succeeded
>> + * (always gte 1), or -EAGAIN if one preallocated page is required to copy the
>> + * first pte.
>> */
>> static inline int
>> -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>> - pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
>> - struct folio **prealloc)
>> +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>> + pte_t *dst_pte, pte_t *src_pte,
>> + unsigned long addr, unsigned long end,
>> + int *rss, struct folio **prealloc)
>> {
>> struct mm_struct *src_mm = src_vma->vm_mm;
>> unsigned long vm_flags = src_vma->vm_flags;
>> pte_t pte = ptep_get(src_pte);
>> struct page *page;
>> struct folio *folio;
>> + int nr = 1;
>> + bool anon = false;
>> + bool enforce_uffd_wp = userfaultfd_wp(dst_vma);
>> + int nr_dirty = !!pte_dirty(pte);
>> + int nr_writable = !!pte_write(pte);
>> + int i, ret;
>>
>> page = vm_normal_page(src_vma, addr, pte);
>> - if (page)
>> + if (page) {
>> folio = page_folio(page);
>> - if (page && folio_test_anon(folio)) {
>> - /*
>> - * If this page may have been pinned by the parent process,
>> - * copy the page immediately for the child so that we'll always
>> - * guarantee the pinned page won't be randomly replaced in the
>> - * future.
>> - */
>> - folio_get(folio);
>> - if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
>> - /* Page may be pinned, we have to copy. */
>> - folio_put(folio);
>> - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
>> - addr, rss, prealloc, page);
>> + anon = folio_test_anon(folio);
>> + nr = folio_nr_pages_cont_mapped(folio, page, src_pte, addr, end,
>> + pte, enforce_uffd_wp, &nr_dirty,
>> + &nr_writable);
>> + folio_ref_add(folio, nr);
>> +
>> + for (i = 0; i < nr; i++, page++) {
>> + if (anon) {
>> + /*
>> + * If this page may have been pinned by the
>> + * parent process, copy the page immediately for
>> + * the child so that we'll always guarantee the
>> + * pinned page won't be randomly replaced in the
>> + * future.
>> + */
>> + if (unlikely(page_try_dup_anon_rmap(
>> + page, false, src_vma))) {
>> + if (i != 0)
>> + break;
>> + /* Page may be pinned, we have to copy. */
>> + folio_ref_sub(folio, nr);
>> + ret = copy_present_page(
>> + dst_vma, src_vma, dst_pte,
>> + src_pte, addr, rss, prealloc,
>> + page);
>> + return ret == 0 ? 1 : ret;
>> + }
>> + rss[MM_ANONPAGES]++;
>> + VM_BUG_ON(PageAnonExclusive(page));
>> + } else {
>> + page_dup_file_rmap(page, false);
>> + rss[mm_counter_file(page)]++;
>> + }
>> }
>> - rss[MM_ANONPAGES]++;
>> - } else if (page) {
>> - folio_get(folio);
>> - page_dup_file_rmap(page, false);
>> - rss[mm_counter_file(page)]++;
>> - }
>>
>> - /*
>> - * If it's a COW mapping, write protect it both
>> - * in the parent and the child
>> - */
>> - if (is_cow_mapping(vm_flags) && pte_write(pte)) {
>> - ptep_set_wrprotect(src_mm, addr, src_pte);
>> - pte = pte_wrprotect(pte);
>> + if (i < nr) {
>> + folio_ref_sub(folio, nr - i);
>> + nr = i;
>> + }
>> }
>> - VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
>>
>> /*
>> - * If it's a shared mapping, mark it clean in
>> - * the child
>> + * If it's a shared mapping, mark it clean and write protected in the
>> + * child, and rely on a write fault to fix up the permissions. This
>> + * allows determining batch size without having to consider RO/RW
>> + * permissions. As an optimization, skip wrprotect if all ptes in the
>> + * batch have the same permissions.
>> + *
>> + * If its a private (CoW) mapping, mark it dirty in the child if _any_
>> + * of the parent mappings in the block were marked dirty. The contiguous
>> + * block of mappings are all backed by the same folio, so if any are
>> + * dirty then the whole folio is dirty. This allows determining batch
>> + * size without having to consider the dirty bit. Further, write protect
>> + * it both in the parent and the child so that a future write will cause
>> + * a CoW. As as an optimization, skip the wrprotect if all the ptes in
>> + * the batch are already readonly.
>> */
>> - if (vm_flags & VM_SHARED)
>> + if (vm_flags & VM_SHARED) {
>> pte = pte_mkclean(pte);
>> - pte = pte_mkold(pte);
>> + if (nr_writable > 0 && nr_writable < nr)
>> + pte = pte_wrprotect(pte);
>> + } else {
>> + if (nr_dirty)
>> + pte = pte_mkdirty(pte);
>> + if (nr_writable) {
>> + ptep_set_wrprotects(src_mm, addr, src_pte, nr);
>> + pte = pte_wrprotect(pte);
>> + }
>> + }
>>
>> - if (!userfaultfd_wp(dst_vma))
>> + pte = pte_mkold(pte);
>> + pte = pte_clear_soft_dirty(pte);
>> + if (!enforce_uffd_wp)
>> pte = pte_clear_uffd_wp(pte);
>>
>> - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
>> - return 0;
>> + set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
>> + return nr;
>
> I don't have any further comments and you have addressed my previous
> ones so feel free to add:
>
> Reviewed-by: Alistair Popple <[email protected]>
>
> However whilst I think the above CoW sequence looks correct it would be
> nice if someone else could take a look as well.

Thanks for the RB! David has taken a look at the CoW part and helped develop the
logic, so I'm pretty confident in it.

However, David also sent me some microbenchmarks for fork, DONTNEED, munmap, etc
for order-0 and PTE-mapped THP (2M). I'm seeing a ferw performance regressions
with those, which I'm currently trying to resolve. At the moment it's looking
like I'll have to expose some function to allow the core code to skip forward a
number of ptes so that in the contpte-mapped case, the core code only does
ptep_get() once per contpte block. As a result there will be some churn here.

I'm currently working out some bugs and hope to post an updated series with perf
numbers for those microbenchmarks by the end of the week, all being well.

>
>> }
>>
>> static inline struct folio *page_copy_prealloc(struct mm_struct *src_mm,
>> @@ -1021,6 +1115,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>> int rss[NR_MM_COUNTERS];
>> swp_entry_t entry = (swp_entry_t){0};
>> struct folio *prealloc = NULL;
>> + int nr_ptes;
>>
>> again:
>> progress = 0;
>> @@ -1051,6 +1146,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>> arch_enter_lazy_mmu_mode();
>>
>> do {
>> + nr_ptes = 1;
>> +
>> /*
>> * We are holding two locks at this point - either of them
>> * could generate latencies in another task on another CPU.
>> @@ -1086,16 +1183,21 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>> * the now present pte.
>> */
>> WARN_ON_ONCE(ret != -ENOENT);
>> + ret = 0;
>> }
>> - /* copy_present_pte() will clear `*prealloc' if consumed */
>> - ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
>> - addr, rss, &prealloc);
>> + /* copy_present_ptes() will clear `*prealloc' if consumed */
>> + nr_ptes = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
>> + addr, end, rss, &prealloc);
>> +
>> /*
>> * If we need a pre-allocated page for this pte, drop the
>> * locks, allocate, and try again.
>> */
>> - if (unlikely(ret == -EAGAIN))
>> + if (unlikely(nr_ptes == -EAGAIN)) {
>> + ret = -EAGAIN;
>> break;
>> + }
>> +
>> if (unlikely(prealloc)) {
>> /*
>> * pre-alloc page cannot be reused by next time so as
>> @@ -1106,8 +1208,9 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
>> folio_put(prealloc);
>> prealloc = NULL;
>> }
>> - progress += 8;
>> - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
>> + progress += 8 * nr_ptes;
>> + } while (dst_pte += nr_ptes, src_pte += nr_ptes,
>> + addr += PAGE_SIZE * nr_ptes, addr != end);
>>
>> arch_leave_lazy_mmu_mode();
>> pte_unmap_unlock(orig_src_pte, src_ptl);
>

2023-12-12 11:58:02

[permalink] [raw]

Subject: Re: [PATCH v3 02/15] mm: Batch-clear PTE ranges during zap_pte_range()

On 08/12/2023 01:30, Alistair Popple wrote:
>
> Ryan Roberts <[email protected]> writes:
>
>> Convert zap_pte_range() to clear a set of ptes in a batch. A given batch
>> maps a physically contiguous block of memory, all belonging to the same
>> folio. This will likely improve performance by a tiny amount due to
>> removing duplicate calls to mark the folio dirty and accessed. And also
>> provides us with a future opportunity to batch the rmap removal.
>>
>> However, the primary motivation for this change is to reduce the number
>> of tlb maintenance operations that the arm64 backend has to perform
>> during exit and other syscalls that cause zap_pte_range() (e.g. munmap,
>> madvise(DONTNEED), etc.), as it is about to add transparent support for
>> the "contiguous bit" in its ptes. By clearing ptes using the new
>> clear_ptes() API, the backend doesn't have to perform an expensive
>> unfold operation when a PTE being cleared is part of a contpte block.
>> Instead it can just clear the whole block immediately.
>>
>> This change addresses the core-mm refactoring only, and introduces
>> clear_ptes() with a default implementation that calls
>> ptep_get_and_clear_full() for each pte in the range. Note that this API
>> returns the pte at the beginning of the batch, but with the dirty and
>> young bits set if ANY of the ptes in the cleared batch had those bits
>> set; this information is applied to the folio by the core-mm. Given the
>> batch is garranteed to cover only a single folio, collapsing this state
>
> Nit: s/garranteed/guaranteed/
>
>> does not lose any useful information.
>>
>> A separate change will implement clear_ptes() in the arm64 backend to
>> realize the performance improvement as part of the work to enable
>> contpte mappings.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> include/asm-generic/tlb.h | 9 ++++++
>> include/linux/pgtable.h | 26 ++++++++++++++++
>> mm/memory.c | 63 ++++++++++++++++++++++++++-------------
>> mm/mmu_gather.c | 14 +++++++++
>> 4 files changed, 92 insertions(+), 20 deletions(-)
>
> <snip>
>
>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
>> index 4f559f4ddd21..57b4d5f0dfa4 100644
>> --- a/mm/mmu_gather.c
>> +++ b/mm/mmu_gather.c
>> @@ -47,6 +47,20 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
>> return true;
>> }
>>
>> +unsigned int tlb_get_guaranteed_space(struct mmu_gather *tlb)
>> +{
>> + struct mmu_gather_batch *batch = tlb->active;
>> + unsigned int nr_next = 0;
>> +
>> + /* Allocate next batch so we can guarrantee at least one batch. */
>> + if (tlb_next_batch(tlb)) {
>> + tlb->active = batch;
>
> Rather than calling tlb_next_batch(tlb) and then undoing some of what it
> does I think it would be clearer to factor out the allocation part of
> tlb_next_batch(tlb) into a separate function (eg. tlb_alloc_batch) that
> you can call from both here and tlb_next_batch().

As per my email against patch 1, I have some perf regressions to iron out for
microbenchmarks; one issue is that this code forces the allocation of a page for
a batch even when we are only modifying a single pte (which would previously fit
in the embedded batch). So I've renamed this function to tlb_reserve_space(int
nr). If it already has enough room, it will jsut return immediately. Else it
will keep calling tlb_next_batch() in a loop until space has been allocated.
Then after the loop we set tlb->active back to the original batch.

Given the new potential need to loop a couple of times, and the need to build up
that linked list, I think it works nicely without refactoring tlb_next_batch().

>
> Otherwise I think this overall direction looks better than trying to
> play funny games in the arch layer as it's much clearer what's going on
> to core-mm code.
>
> - Alistair
>
>> + nr_next = batch->next->max;
>> + }
>> +
>> + return batch->max - batch->nr + nr_next;
>> +}
>> +
>> #ifdef CONFIG_SMP
>> static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
>> {
>

2023-12-12 12:00:02

[permalink] [raw]

Subject: Re: [PATCH v3 14/15] arm64/mm: Implement ptep_set_wrprotects() to optimize fork()

On 08/12/2023 01:37, Alistair Popple wrote:
>
> Ryan Roberts <[email protected]> writes:
>
>> With the core-mm changes in place to batch-copy ptes during fork, we can
>> take advantage of this in arm64 to greatly reduce the number of tlbis we
>> have to issue, and recover the lost fork performance incured when adding
>> support for transparent contiguous ptes.
>>
>> If we are write-protecting a whole contig range, we can apply the
>> write-protection to the whole range and know that it won't change
>> whether the range should have the contiguous bit set or not. For ranges
>> smaller than the contig range, we will still have to unfold, apply the
>> write-protection, then fold if the change now means the range is
>> foldable.
>>
>> This optimization is possible thanks to the tightening of the Arm ARM in
>> respect to the definition and behaviour when 'Misprogramming the
>> Contiguous bit'. See section D21194 at
>> https://developer.arm.com/documentation/102105/latest/
>>
>> Performance tested with the following test written for the will-it-scale
>> framework:
>>
>> -------
>>
>> char *testcase_description = "fork and exit";
>>
>> void testcase(unsigned long long *iterations, unsigned long nr)
>> {
>> int pid;
>> char *mem;
>>
>> mem = malloc(SZ_128M);
>> assert(mem);
>> memset(mem, 1, SZ_128M);
>>
>> while (1) {
>> pid = fork();
>> assert(pid >= 0);
>>
>> if (!pid)
>> exit(0);
>>
>> waitpid(pid, NULL, 0);
>>
>> (*iterations)++;
>> }
>> }
>>
>> -------
>>
>> I see huge performance regression when PTE_CONT support was added, then
>> the regression is mostly fixed with the addition of this change. The
>> following shows regression relative to before PTE_CONT was enabled
>> (bigger negative value is bigger regression):
>>
>> | cpus | before opt | after opt |
>> |-------:|-------------:|------------:|
>> | 1 | -10.4% | -5.2% |
>> | 8 | -15.4% | -3.5% |
>> | 16 | -38.7% | -3.7% |
>> | 24 | -57.0% | -4.4% |
>> | 32 | -65.8% | -5.4% |
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> arch/arm64/include/asm/pgtable.h | 30 ++++++++++++++++++++---
>> arch/arm64/mm/contpte.c | 42 ++++++++++++++++++++++++++++++++
>> 2 files changed, 69 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index 15bc9cf1eef4..9bd2f57a9e11 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -984,6 +984,16 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm,
>> } while (pte_val(pte) != pte_val(old_pte));
>> }
>>
>> +static inline void __ptep_set_wrprotects(struct mm_struct *mm,
>> + unsigned long address, pte_t *ptep,
>> + unsigned int nr)
>> +{
>> + unsigned int i;
>> +
>> + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
>> + __ptep_set_wrprotect(mm, address, ptep);
>> +}
>> +
>> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> #define __HAVE_ARCH_PMDP_SET_WRPROTECT
>> static inline void pmdp_set_wrprotect(struct mm_struct *mm,
>> @@ -1139,6 +1149,8 @@ extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
>> unsigned long addr, pte_t *ptep);
>> extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
>> unsigned long addr, pte_t *ptep);
>> +extern void contpte_set_wrprotects(struct mm_struct *mm, unsigned long addr,
>> + pte_t *ptep, unsigned int nr);
>> extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
>> unsigned long addr, pte_t *ptep,
>> pte_t entry, int dirty);
>> @@ -1290,13 +1302,25 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
>> return contpte_ptep_clear_flush_young(vma, addr, ptep);
>> }
>>
>> +#define ptep_set_wrprotects ptep_set_wrprotects
>> +static inline void ptep_set_wrprotects(struct mm_struct *mm, unsigned long addr,
>> + pte_t *ptep, unsigned int nr)
>> +{
>> + if (!contpte_is_enabled(mm))
>> + __ptep_set_wrprotects(mm, addr, ptep, nr);
>> + else if (nr == 1) {
>
> Why do we need the special case here? Couldn't we just call
> contpte_set_wrprotects() with nr == 1?

My intention is for this to be a fast path for ptep_set_wrprotect(). I'm having
to work hard to prevent regressing the order-0 folios case.

>
>> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
>> + __ptep_set_wrprotects(mm, addr, ptep, 1);
>> + contpte_try_fold(mm, addr, ptep, __ptep_get(ptep));
>> + } else
>> + contpte_set_wrprotects(mm, addr, ptep, nr);
>> +}
>> +
>> #define __HAVE_ARCH_PTEP_SET_WRPROTECT
>> static inline void ptep_set_wrprotect(struct mm_struct *mm,
>> unsigned long addr, pte_t *ptep)
>> {
>> - contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
>> - __ptep_set_wrprotect(mm, addr, ptep);
>> - contpte_try_fold(mm, addr, ptep, __ptep_get(ptep));
>> + ptep_set_wrprotects(mm, addr, ptep, 1);
>> }
>>
>> #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
>> diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
>> index e079ec61d7d1..2a57df16bf58 100644
>> --- a/arch/arm64/mm/contpte.c
>> +++ b/arch/arm64/mm/contpte.c
>> @@ -303,6 +303,48 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
>> }
>> EXPORT_SYMBOL(contpte_ptep_clear_flush_young);
>>
>> +void contpte_set_wrprotects(struct mm_struct *mm, unsigned long addr,
>> + pte_t *ptep, unsigned int nr)
>> +{
>> + unsigned long next;
>> + unsigned long end = addr + (nr << PAGE_SHIFT);
>> +
>> + do {
>> + next = pte_cont_addr_end(addr, end);
>> + nr = (next - addr) >> PAGE_SHIFT;
>> +
>> + /*
>> + * If wrprotecting an entire contig range, we can avoid
>> + * unfolding. Just set wrprotect and wait for the later
>> + * mmu_gather flush to invalidate the tlb. Until the flush, the
>> + * page may or may not be wrprotected. After the flush, it is
>> + * guarranteed wrprotected. If its a partial range though, we
>> + * must unfold, because we can't have a case where CONT_PTE is
>> + * set but wrprotect applies to a subset of the PTEs; this would
>> + * cause it to continue to be unpredictable after the flush.
>> + */
>> + if (nr != CONT_PTES)
>> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
>> +
>> + __ptep_set_wrprotects(mm, addr, ptep, nr);
>> +
>> + addr = next;
>> + ptep += nr;
>> +
>> + /*
>> + * If applying to a partial contig range, the change could have
>> + * made the range foldable. Use the last pte in the range we
>> + * just set for comparison, since contpte_try_fold() only
>> + * triggers when acting on the last pte in the contig range.
>> + */
>> + if (nr != CONT_PTES)
>> + contpte_try_fold(mm, addr - PAGE_SIZE, ptep - 1,
>> + __ptep_get(ptep - 1));
>> +
>> + } while (addr != end);
>> +}
>> +EXPORT_SYMBOL(contpte_set_wrprotects);
>> +
>> int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
>> unsigned long addr, pte_t *ptep,
>> pte_t entry, int dirty)
>

2023-12-12 12:02:38

[permalink] [raw]

Subject: Re: [PATCH v3 15/15] arm64/mm: Implement clear_ptes() to optimize exit()

On 08/12/2023 01:45, Alistair Popple wrote:
>
> Ryan Roberts <[email protected]> writes:
>
>> With the core-mm changes in place to batch-clear ptes during
>> zap_pte_range(), we can take advantage of this in arm64 to greatly
>> reduce the number of tlbis we have to issue, and recover the lost exit
>> performance incured when adding support for transparent contiguous ptes.
>>
>> If we are clearing a whole contpte range, we can elide first unfolding
>> that range and save the tlbis. We just clear the whole range.
>>
>> The following shows the results of running a kernel compilation workload
>> and measuring the cost of arm64_sys_exit_group() (which at ~1.5% is a
>> very small part of the overall workload).
>>
>> Benchmarks were run on Ampere Altra in 2 configs; single numa node and 2
>> numa nodes (tlbis are more expensive in 2 node config).
>>
>> - baseline: v6.7-rc1 + anonfolio-v7
>> - no-opt: contpte series without any attempt to optimize exit()
>> - simple-ptep_get_clear_full: simple optimization to exploit full=1.
>> ptep_get_clear_full() does not fully conform to its intended semantic
>> - robust-ptep_get_clear_full: similar to previous but
>> ptep_get_clear_full() fully conforms to its intended semantic
>> - clear_ptes: optimization implemented by this patch
>>
>> | config | numa=1 | numa=2 |
>> |----------------------------|--------|--------|
>> | baseline | 0% | 0% |
>> | no-opt | 190% | 768% |
>> | simple-ptep_get_clear_full | 8% | 29% |
>> | robust-ptep_get_clear_full | 21% | 19% |
>> | clear_ptes | 13% | 9% |
>>
>> In all cases, the cost of arm64_sys_exit_group() increases; this is
>> anticipated because there is more work to do to tear down the page
>> tables. But clear_ptes() gives the smallest increase overall.
>>
>> Signed-off-by: Ryan Roberts <[email protected]>
>> ---
>> arch/arm64/include/asm/pgtable.h | 32 ++++++++++++++++++++++++
>> arch/arm64/mm/contpte.c | 42 ++++++++++++++++++++++++++++++++
>> 2 files changed, 74 insertions(+)
>>
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index 9bd2f57a9e11..ff6b3cc9e819 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -1145,6 +1145,8 @@ extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte);
>> extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep);
>> extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
>> pte_t *ptep, pte_t pte, unsigned int nr);
>> +extern pte_t contpte_clear_ptes(struct mm_struct *mm, unsigned long addr,
>> + pte_t *ptep, unsigned int nr);
>> extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
>> unsigned long addr, pte_t *ptep);
>> extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
>> @@ -1270,6 +1272,36 @@ static inline void pte_clear(struct mm_struct *mm,
>> __pte_clear(mm, addr, ptep);
>> }
>>
>> +#define clear_ptes clear_ptes
>> +static inline pte_t clear_ptes(struct mm_struct *mm,
>> + unsigned long addr, pte_t *ptep, int full,
>> + unsigned int nr)
>> +{
>> + pte_t pte;
>> +
>> + if (!contpte_is_enabled(mm)) {
>
> I think it would be better to call the generic definition of
> clear_ptes() here. Obviously that won't exist if clear_ptes is defined
> here, but you could alcall it __clear_ptes() and #define clear_ptes
> __clear_ptes when the arch specific helper isn't defined.

My thinking was that we wouldn't bother to expose clear_ptes() when
CONFIG_ARM64_CONTPTE is disabled, and just fall back to the core-mm generic one.
But I think your proposal is probably cleaner and more consistent with
everything else. So I'll do this for the next version.

>
>> + unsigned int i;
>> + pte_t tail;
>> +
>> + pte = __ptep_get_and_clear(mm, addr, ptep);
>> + for (i = 1; i < nr; i++) {
>> + addr += PAGE_SIZE;
>> + ptep++;
>> + tail = __ptep_get_and_clear(mm, addr, ptep);
>> + if (pte_dirty(tail))
>> + pte = pte_mkdirty(pte);
>> + if (pte_young(tail))
>> + pte = pte_mkyoung(pte);
>> + }
>> + } else if (nr == 1) {
>> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
>> + pte = __ptep_get_and_clear(mm, addr, ptep);
>> + } else
>> + pte = contpte_clear_ptes(mm, addr, ptep, nr);
>> +
>> + return pte;
>> +}
>> +
>> #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
>> static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
>> unsigned long addr, pte_t *ptep)
>> diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
>> index 2a57df16bf58..34b43bde3fcd 100644
>> --- a/arch/arm64/mm/contpte.c
>> +++ b/arch/arm64/mm/contpte.c
>> @@ -257,6 +257,48 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
>> }
>> EXPORT_SYMBOL(contpte_set_ptes);
>>
>> +pte_t contpte_clear_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
>> + unsigned int nr)
>> +{
>> + /*
>> + * If we cover a partial contpte block at the beginning or end of the
>> + * batch, unfold if currently folded. This makes it safe to clear some
>> + * of the entries while keeping others. contpte blocks in the middle of
>> + * the range, which are fully covered don't need to be unfolded because
>> + * we will clear the full block.
>> + */
>> +
>> + unsigned int i;
>> + pte_t pte;
>> + pte_t tail;
>> +
>> + if (ptep != contpte_align_down(ptep) || nr < CONT_PTES)
>> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
>> +
>> + if (ptep + nr != contpte_align_down(ptep + nr))
>> + contpte_try_unfold(mm, addr + PAGE_SIZE * (nr - 1),
>> + ptep + nr - 1,
>> + __ptep_get(ptep + nr - 1));
>> +
>> + pte = __ptep_get_and_clear(mm, addr, ptep);
>> +
>> + for (i = 1; i < nr; i++) {
>> + addr += PAGE_SIZE;
>> + ptep++;
>> +
>> + tail = __ptep_get_and_clear(mm, addr, ptep);
>> +
>> + if (pte_dirty(tail))
>> + pte = pte_mkdirty(pte);
>> +
>> + if (pte_young(tail))
>> + pte = pte_mkyoung(pte);
>> + }
>> +
>> + return pte;
>> +}
>> +EXPORT_SYMBOL(contpte_clear_ptes);
>> +
>> int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
>> unsigned long addr, pte_t *ptep)
>> {
>

2023-12-14 11:54:20

[permalink] [raw]

Subject: Re: [PATCH v3 12/15] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

Hi Will,

On 12/12/2023 11:47, Ryan Roberts wrote:
> On 12/12/2023 11:35, Will Deacon wrote:
>> On Mon, Dec 04, 2023 at 10:54:37AM +0000, Ryan Roberts wrote:
>>> Split __flush_tlb_range() into __flush_tlb_range_nosync() +
>>> __flush_tlb_range(), in the same way as the existing flush_tlb_page()
>>> arrangement. This allows calling __flush_tlb_range_nosync() to elide the
>>> trailing DSB. Forthcoming "contpte" code will take advantage of this
>>> when clearing the young bit from a contiguous range of ptes.
>>>
>>> Signed-off-by: Ryan Roberts <[email protected]>
>>> ---
>>> arch/arm64/include/asm/tlbflush.h | 13 +++++++++++--
>>> 1 file changed, 11 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>>> index bb2c2833a987..925ef3bdf9ed 100644
>>> --- a/arch/arm64/include/asm/tlbflush.h
>>> +++ b/arch/arm64/include/asm/tlbflush.h
>>> @@ -399,7 +399,7 @@ do { \
>>> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>>> __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false)
>>>
>>> -static inline void __flush_tlb_range(struct vm_area_struct *vma,
>>> +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>>> unsigned long start, unsigned long end,
>>> unsigned long stride, bool last_level,
>>> int tlb_level)
>>> @@ -431,10 +431,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>>> else
>>> __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true);
>>>
>>> - dsb(ish);
>>> mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
>>> }
>>>
>>> +static inline void __flush_tlb_range(struct vm_area_struct *vma,
>>> + unsigned long start, unsigned long end,
>>> + unsigned long stride, bool last_level,
>>> + int tlb_level)
>>> +{
>>> + __flush_tlb_range_nosync(vma, start, end, stride,
>>> + last_level, tlb_level);
>>> + dsb(ish);
>>> +}
>>
>> Hmm, are you sure it's safe to defer the DSB until after the secondary TLB
>> invalidation? It will have a subtle effect on e.g. an SMMU participating
>> in broadcast TLB maintenance, because now the ATC will be invalidated
>> before completion of the TLB invalidation and it's not obviously safe to me.
>
> I'll be honest; I don't know that it's safe. The notifier calls turned up during
> a rebase and I stared at it for a while, before eventually concluding that I
> should just follow the existing pattern in __flush_tlb_page_nosync(): That one
> calls the mmu notifier without the dsb, then flush_tlb_page() does the dsb
> after. So I assumed it was safe.
>
> If you think it's not safe, I guess there is a bug to fix in
> __flush_tlb_page_nosync()?

Did you have an opinion on this? I'm just putting together a v4 of this series,
and I'll remove this optimization if you think it's unsound. But in that case, I
guess we have an existing bug to fix too?

Thanks,
Ryan

>
>
>
>>
>> Will
>

2023-12-14 12:13:59

by Will Deacon

[permalink] [raw]

Subject: Re: [PATCH v3 12/15] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

On Thu, Dec 14, 2023 at 11:53:52AM +0000, Ryan Roberts wrote:
> On 12/12/2023 11:47, Ryan Roberts wrote:
> > On 12/12/2023 11:35, Will Deacon wrote:
> >> On Mon, Dec 04, 2023 at 10:54:37AM +0000, Ryan Roberts wrote:
> >>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
> >>> index bb2c2833a987..925ef3bdf9ed 100644
> >>> --- a/arch/arm64/include/asm/tlbflush.h
> >>> +++ b/arch/arm64/include/asm/tlbflush.h
> >>> @@ -399,7 +399,7 @@ do { \
> >>> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
> >>> __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false)
> >>>
> >>> -static inline void __flush_tlb_range(struct vm_area_struct *vma,
> >>> +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
> >>> unsigned long start, unsigned long end,
> >>> unsigned long stride, bool last_level,
> >>> int tlb_level)
> >>> @@ -431,10 +431,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
> >>> else
> >>> __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true);
> >>>
> >>> - dsb(ish);
> >>> mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
> >>> }
> >>>
> >>> +static inline void __flush_tlb_range(struct vm_area_struct *vma,
> >>> + unsigned long start, unsigned long end,
> >>> + unsigned long stride, bool last_level,
> >>> + int tlb_level)
> >>> +{
> >>> + __flush_tlb_range_nosync(vma, start, end, stride,
> >>> + last_level, tlb_level);
> >>> + dsb(ish);
> >>> +}
> >>
> >> Hmm, are you sure it's safe to defer the DSB until after the secondary TLB
> >> invalidation? It will have a subtle effect on e.g. an SMMU participating
> >> in broadcast TLB maintenance, because now the ATC will be invalidated
> >> before completion of the TLB invalidation and it's not obviously safe to me.
> >
> > I'll be honest; I don't know that it's safe. The notifier calls turned up during
> > a rebase and I stared at it for a while, before eventually concluding that I
> > should just follow the existing pattern in __flush_tlb_page_nosync(): That one
> > calls the mmu notifier without the dsb, then flush_tlb_page() does the dsb
> > after. So I assumed it was safe.
> >
> > If you think it's not safe, I guess there is a bug to fix in
> > __flush_tlb_page_nosync()?
>
> Did you have an opinion on this? I'm just putting together a v4 of this series,
> and I'll remove this optimization if you think it's unsound. But in that case, I
> guess we have an existing bug to fix too?

Sorry, Ryan, I've not had a chance to look into it in more detail. But as
you rightly point out, you're not introducing the issue (assuming it is
one), so I don't think it needs to hold you up. Your code just makes the
thing more "obvious" to me.

Robin, Jean-Philippe -- do we need to make sure that the SMMU has completed
its TLB invalidation before issuing an ATC invalidate? My half-baked worry
is whether or not an ATS request could refill the ATC before the TLBI
has completed, therefore rendering the ATC invalidation useless.

Will

2023-12-14 12:31:35

by Robin Murphy

[permalink] [raw]

Subject: Re: [PATCH v3 12/15] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

On 2023-12-14 12:13 pm, Will Deacon wrote:
> On Thu, Dec 14, 2023 at 11:53:52AM +0000, Ryan Roberts wrote:
>> On 12/12/2023 11:47, Ryan Roberts wrote:
>>> On 12/12/2023 11:35, Will Deacon wrote:
>>>> On Mon, Dec 04, 2023 at 10:54:37AM +0000, Ryan Roberts wrote:
>>>>> diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
>>>>> index bb2c2833a987..925ef3bdf9ed 100644
>>>>> --- a/arch/arm64/include/asm/tlbflush.h
>>>>> +++ b/arch/arm64/include/asm/tlbflush.h
>>>>> @@ -399,7 +399,7 @@ do { \
>>>>> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>>>>> __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false)
>>>>>
>>>>> -static inline void __flush_tlb_range(struct vm_area_struct *vma,
>>>>> +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>>>>> unsigned long start, unsigned long end,
>>>>> unsigned long stride, bool last_level,
>>>>> int tlb_level)
>>>>> @@ -431,10 +431,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
>>>>> else
>>>>> __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true);
>>>>>
>>>>> - dsb(ish);
>>>>> mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
>>>>> }
>>>>>
>>>>> +static inline void __flush_tlb_range(struct vm_area_struct *vma,
>>>>> + unsigned long start, unsigned long end,
>>>>> + unsigned long stride, bool last_level,
>>>>> + int tlb_level)
>>>>> +{
>>>>> + __flush_tlb_range_nosync(vma, start, end, stride,
>>>>> + last_level, tlb_level);
>>>>> + dsb(ish);
>>>>> +}
>>>>
>>>> Hmm, are you sure it's safe to defer the DSB until after the secondary TLB
>>>> invalidation? It will have a subtle effect on e.g. an SMMU participating
>>>> in broadcast TLB maintenance, because now the ATC will be invalidated
>>>> before completion of the TLB invalidation and it's not obviously safe to me.
>>>
>>> I'll be honest; I don't know that it's safe. The notifier calls turned up during
>>> a rebase and I stared at it for a while, before eventually concluding that I
>>> should just follow the existing pattern in __flush_tlb_page_nosync(): That one
>>> calls the mmu notifier without the dsb, then flush_tlb_page() does the dsb
>>> after. So I assumed it was safe.
>>>
>>> If you think it's not safe, I guess there is a bug to fix in
>>> __flush_tlb_page_nosync()?
>>
>> Did you have an opinion on this? I'm just putting together a v4 of this series,
>> and I'll remove this optimization if you think it's unsound. But in that case, I
>> guess we have an existing bug to fix too?
>
> Sorry, Ryan, I've not had a chance to look into it in more detail. But as
> you rightly point out, you're not introducing the issue (assuming it is
> one), so I don't think it needs to hold you up. Your code just makes the
> thing more "obvious" to me.
>
> Robin, Jean-Philippe -- do we need to make sure that the SMMU has completed
> its TLB invalidation before issuing an ATC invalidate? My half-baked worry
> is whether or not an ATS request could refill the ATC before the TLBI
> has completed, therefore rendering the ATC invalidation useless.

I would agree, and the spec for CMD_ATC_INV does call out a
TLBI->sync->ATCI->sync sequence. At the moment the SVA notifier is
issuing its own command-based TLBIs anyway so the necessary sync is
implicit there, but if and when we get BTM support wired up properly it
would be nice not to have to bodge in an additional sync/DSB.

Cheers,
Robin.

2023-12-14 14:28:33

[permalink] [raw]

Subject: Re: [PATCH v3 12/15] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

On 14/12/2023 12:30, Robin Murphy wrote:
> On 2023-12-14 12:13 pm, Will Deacon wrote:
>> On Thu, Dec 14, 2023 at 11:53:52AM +0000, Ryan Roberts wrote:
>>> On 12/12/2023 11:47, Ryan Roberts wrote:
>>>> On 12/12/2023 11:35, Will Deacon wrote:
>>>>> On Mon, Dec 04, 2023 at 10:54:37AM +0000, Ryan Roberts wrote:
>>>>>> diff --git a/arch/arm64/include/asm/tlbflush.h
>>>>>> b/arch/arm64/include/asm/tlbflush.h
>>>>>> index bb2c2833a987..925ef3bdf9ed 100644
>>>>>> --- a/arch/arm64/include/asm/tlbflush.h
>>>>>> +++ b/arch/arm64/include/asm/tlbflush.h
>>>>>> @@ -399,7 +399,7 @@ do {                                    \
>>>>>> #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
>>>>>>       __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false)
>>>>>> -static inline void __flush_tlb_range(struct vm_area_struct *vma,
>>>>>> +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma,
>>>>>>                        unsigned long start, unsigned long end,
>>>>>>                        unsigned long stride, bool last_level,
>>>>>>                        int tlb_level)
>>>>>> @@ -431,10 +431,19 @@ static inline void __flush_tlb_range(struct
>>>>>> vm_area_struct *vma,
>>>>>>       else
>>>>>>           __flush_tlb_range_op(vae1is, start, pages, stride, asid,
>>>>>> tlb_level, true);
>>>>>> -    dsb(ish);
>>>>>>       mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end);
>>>>>> }
>>>>>> +static inline void __flush_tlb_range(struct vm_area_struct *vma,
>>>>>> +                     unsigned long start, unsigned long end,
>>>>>> +                     unsigned long stride, bool last_level,
>>>>>> +                     int tlb_level)
>>>>>> +{
>>>>>> +    __flush_tlb_range_nosync(vma, start, end, stride,
>>>>>> +                 last_level, tlb_level);
>>>>>> +    dsb(ish);
>>>>>> +}
>>>>>
>>>>> Hmm, are you sure it's safe to defer the DSB until after the secondary TLB
>>>>> invalidation? It will have a subtle effect on e.g. an SMMU participating
>>>>> in broadcast TLB maintenance, because now the ATC will be invalidated
>>>>> before completion of the TLB invalidation and it's not obviously safe to me.
>>>>
>>>> I'll be honest; I don't know that it's safe. The notifier calls turned up
>>>> during
>>>> a rebase and I stared at it for a while, before eventually concluding that I
>>>> should just follow the existing pattern in __flush_tlb_page_nosync(): That one
>>>> calls the mmu notifier without the dsb, then flush_tlb_page() does the dsb
>>>> after. So I assumed it was safe.
>>>>
>>>> If you think it's not safe, I guess there is a bug to fix in
>>>> __flush_tlb_page_nosync()?
>>>
>>> Did you have an opinion on this? I'm just putting together a v4 of this series,
>>> and I'll remove this optimization if you think it's unsound. But in that case, I
>>> guess we have an existing bug to fix too?
>>
>> Sorry, Ryan, I've not had a chance to look into it in more detail. But as
>> you rightly point out, you're not introducing the issue (assuming it is
>> one), so I don't think it needs to hold you up. Your code just makes the
>> thing more "obvious" to me.

OK thanks. I'll leave my code as is for now then - that makes it easier to do
A/B performance comparison with the existing code. And I can change it if/when
mainline changes (presumably to add the dsb between the tlbi and the mmu
notifier callback).

>>
>> Robin, Jean-Philippe -- do we need to make sure that the SMMU has completed
>> its TLB invalidation before issuing an ATC invalidate? My half-baked worry
>> is whether or not an ATS request could refill the ATC before the TLBI
>> has completed, therefore rendering the ATC invalidation useless.
>
> I would agree, and the spec for CMD_ATC_INV does call out a
> TLBI->sync->ATCI->sync sequence. At the moment the SVA notifier is issuing its
> own command-based TLBIs anyway so the necessary sync is implicit there, but if
> and when we get BTM support wired up properly it would be nice not to have to
> bodge in an additional sync/DSB.
>
> Cheers,
> Robin.

2023-12-14 15:22:19

by Jean-Philippe Brucker

[permalink] [raw]

Subject: Re: [PATCH v3 12/15] arm64/mm: Split __flush_tlb_range() to elide trailing DSB

On Thu, Dec 14, 2023 at 12:30:55PM +0000, Robin Murphy wrote:
> > Robin, Jean-Philippe -- do we need to make sure that the SMMU has completed
> > its TLB invalidation before issuing an ATC invalidate? My half-baked worry
> > is whether or not an ATS request could refill the ATC before the TLBI
> > has completed, therefore rendering the ATC invalidation useless.
>
> I would agree, and the spec for CMD_ATC_INV does call out a
> TLBI->sync->ATCI->sync sequence. At the moment the SVA notifier is issuing
> its own command-based TLBIs anyway so the necessary sync is implicit there,
> but if and when we get BTM support wired up properly it would be nice not to
> have to bodge in an additional sync/DSB.

Yes agreed, with BTM the CPU must call the notifier that issues ATC
invalidation after completing the TLBI+DSB instructions.

SMMU IHI0070F.a 3.9.1 ATS Interface

Software must ensure that the SMMU TLB invalidation is complete before
initiating the ATC invalidation.

I'm guessing BTM will be enabled in the SMMU driver sometime soon, given
that there already is one implementation in the wild that could use it. I
think we didn't enable it because of the lack of separation between shared
and private VMIDs, but that may now be solvable with the recent rework of
the VMID allocator.

Thanks,
Jean

2023-12-15 04:36:44

[permalink] [raw]

Subject: Re: [PATCH v3 14/15] arm64/mm: Implement ptep_set_wrprotects() to optimize fork()

Ryan Roberts <[email protected]> writes:

> On 08/12/2023 01:37, Alistair Popple wrote:
>>
>> Ryan Roberts <[email protected]> writes:
>>
>>> With the core-mm changes in place to batch-copy ptes during fork, we can
>>> take advantage of this in arm64 to greatly reduce the number of tlbis we
>>> have to issue, and recover the lost fork performance incured when adding
>>> support for transparent contiguous ptes.
>>>
>>> If we are write-protecting a whole contig range, we can apply the
>>> write-protection to the whole range and know that it won't change
>>> whether the range should have the contiguous bit set or not. For ranges
>>> smaller than the contig range, we will still have to unfold, apply the
>>> write-protection, then fold if the change now means the range is
>>> foldable.
>>>
>>> This optimization is possible thanks to the tightening of the Arm ARM in
>>> respect to the definition and behaviour when 'Misprogramming the
>>> Contiguous bit'. See section D21194 at
>>> https://developer.arm.com/documentation/102105/latest/
>>>
>>> Performance tested with the following test written for the will-it-scale
>>> framework:
>>>
>>> -------
>>>
>>> char *testcase_description = "fork and exit";
>>>
>>> void testcase(unsigned long long *iterations, unsigned long nr)
>>> {
>>> int pid;
>>> char *mem;
>>>
>>> mem = malloc(SZ_128M);
>>> assert(mem);
>>> memset(mem, 1, SZ_128M);
>>>
>>> while (1) {
>>> pid = fork();
>>> assert(pid >= 0);
>>>
>>> if (!pid)
>>> exit(0);
>>>
>>> waitpid(pid, NULL, 0);
>>>
>>> (*iterations)++;
>>> }
>>> }
>>>
>>> -------
>>>
>>> I see huge performance regression when PTE_CONT support was added, then
>>> the regression is mostly fixed with the addition of this change. The
>>> following shows regression relative to before PTE_CONT was enabled
>>> (bigger negative value is bigger regression):
>>>
>>> | cpus | before opt | after opt |
>>> |-------:|-------------:|------------:|
>>> | 1 | -10.4% | -5.2% |
>>> | 8 | -15.4% | -3.5% |
>>> | 16 | -38.7% | -3.7% |
>>> | 24 | -57.0% | -4.4% |
>>> | 32 | -65.8% | -5.4% |
>>>
>>> Signed-off-by: Ryan Roberts <[email protected]>
>>> ---
>>> arch/arm64/include/asm/pgtable.h | 30 ++++++++++++++++++++---
>>> arch/arm64/mm/contpte.c | 42 ++++++++++++++++++++++++++++++++
>>> 2 files changed, 69 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>>> index 15bc9cf1eef4..9bd2f57a9e11 100644
>>> --- a/arch/arm64/include/asm/pgtable.h
>>> +++ b/arch/arm64/include/asm/pgtable.h
>>> @@ -984,6 +984,16 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm,
>>> } while (pte_val(pte) != pte_val(old_pte));
>>> }
>>>
>>> +static inline void __ptep_set_wrprotects(struct mm_struct *mm,
>>> + unsigned long address, pte_t *ptep,
>>> + unsigned int nr)
>>> +{
>>> + unsigned int i;
>>> +
>>> + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++)
>>> + __ptep_set_wrprotect(mm, address, ptep);
>>> +}
>>> +
>>> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>> #define __HAVE_ARCH_PMDP_SET_WRPROTECT
>>> static inline void pmdp_set_wrprotect(struct mm_struct *mm,
>>> @@ -1139,6 +1149,8 @@ extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
>>> unsigned long addr, pte_t *ptep);
>>> extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
>>> unsigned long addr, pte_t *ptep);
>>> +extern void contpte_set_wrprotects(struct mm_struct *mm, unsigned long addr,
>>> + pte_t *ptep, unsigned int nr);
>>> extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
>>> unsigned long addr, pte_t *ptep,
>>> pte_t entry, int dirty);
>>> @@ -1290,13 +1302,25 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
>>> return contpte_ptep_clear_flush_young(vma, addr, ptep);
>>> }
>>>
>>> +#define ptep_set_wrprotects ptep_set_wrprotects
>>> +static inline void ptep_set_wrprotects(struct mm_struct *mm, unsigned long addr,
>>> + pte_t *ptep, unsigned int nr)
>>> +{
>>> + if (!contpte_is_enabled(mm))
>>> + __ptep_set_wrprotects(mm, addr, ptep, nr);
>>> + else if (nr == 1) {
>>
>> Why do we need the special case here? Couldn't we just call
>> contpte_set_wrprotects() with nr == 1?
>
> My intention is for this to be a fast path for ptep_set_wrprotect(). I'm having
> to work hard to prevent regressing the order-0 folios case.

This ends up calling three functions anyway so I'm curious - does
removing the one function call really make that much of difference?

Either way I think a comment justifying the special case (ie. that this
is simply a fast path for nr == 1) would be good.

Thanks.

>>
>>> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
>>> + __ptep_set_wrprotects(mm, addr, ptep, 1);
>>> + contpte_try_fold(mm, addr, ptep, __ptep_get(ptep));
>>> + } else
>>> + contpte_set_wrprotects(mm, addr, ptep, nr);
>>> +}
>>> +
>>> #define __HAVE_ARCH_PTEP_SET_WRPROTECT
>>> static inline void ptep_set_wrprotect(struct mm_struct *mm,
>>> unsigned long addr, pte_t *ptep)
>>> {
>>> - contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
>>> - __ptep_set_wrprotect(mm, addr, ptep);
>>> - contpte_try_fold(mm, addr, ptep, __ptep_get(ptep));
>>> + ptep_set_wrprotects(mm, addr, ptep, 1);
>>> }
>>>
>>> #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
>>> diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c
>>> index e079ec61d7d1..2a57df16bf58 100644
>>> --- a/arch/arm64/mm/contpte.c
>>> +++ b/arch/arm64/mm/contpte.c
>>> @@ -303,6 +303,48 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
>>> }
>>> EXPORT_SYMBOL(contpte_ptep_clear_flush_young);
>>>
>>> +void contpte_set_wrprotects(struct mm_struct *mm, unsigned long addr,
>>> + pte_t *ptep, unsigned int nr)
>>> +{
>>> + unsigned long next;
>>> + unsigned long end = addr + (nr << PAGE_SHIFT);
>>> +
>>> + do {
>>> + next = pte_cont_addr_end(addr, end);
>>> + nr = (next - addr) >> PAGE_SHIFT;
>>> +
>>> + /*
>>> + * If wrprotecting an entire contig range, we can avoid
>>> + * unfolding. Just set wrprotect and wait for the later
>>> + * mmu_gather flush to invalidate the tlb. Until the flush, the
>>> + * page may or may not be wrprotected. After the flush, it is
>>> + * guarranteed wrprotected. If its a partial range though, we
>>> + * must unfold, because we can't have a case where CONT_PTE is
>>> + * set but wrprotect applies to a subset of the PTEs; this would
>>> + * cause it to continue to be unpredictable after the flush.
>>> + */
>>> + if (nr != CONT_PTES)
>>> + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
>>> +
>>> + __ptep_set_wrprotects(mm, addr, ptep, nr);
>>> +
>>> + addr = next;
>>> + ptep += nr;
>>> +
>>> + /*
>>> + * If applying to a partial contig range, the change could have
>>> + * made the range foldable. Use the last pte in the range we
>>> + * just set for comparison, since contpte_try_fold() only
>>> + * triggers when acting on the last pte in the contig range.
>>> + */
>>> + if (nr != CONT_PTES)
>>> + contpte_try_fold(mm, addr - PAGE_SIZE, ptep - 1,
>>> + __ptep_get(ptep - 1));
>>> +
>>> + } while (addr != end);
>>> +}
>>> +EXPORT_SYMBOL(contpte_set_wrprotects);
>>> +
>>> int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
>>> unsigned long addr, pte_t *ptep,
>>> pte_t entry, int dirty)
>>

2023-12-15 14:08:42